author	Greg Kroah-Hartman <gregkh@suse.de>	2006-01-06 15:59:59 -0500
committer	Greg Kroah-Hartman <gregkh@suse.de>	2006-01-06 15:59:59 -0500
commit	ccf18968b1bbc2fb117190a1984ac2a826dac228 (patch)
tree	7bc8fbf5722aecf1e84fa50c31c657864cba1daa
parent	e91c021c487110386a07facd0396e6c3b7cf9c1f (diff)
parent	d99cf9d679a520d67f81d805b7cb91c68e1847f0 (diff)
Merge ../torvalds-2.6/
-rw-r--r--Documentation/block/biodoc.txt10
-rw-r--r--Documentation/feature-removal-schedule.txt11
-rw-r--r--Documentation/filesystems/00-INDEX6
-rw-r--r--Documentation/filesystems/configfs/configfs.txt434
-rw-r--r--Documentation/filesystems/configfs/configfs_example.c474
-rw-r--r--Documentation/filesystems/dlmfs.txt130
-rw-r--r--Documentation/filesystems/ocfs2.txt55
-rw-r--r--Documentation/keys.txt18
-rw-r--r--Documentation/md.txt120
-rw-r--r--Documentation/power/interface.txt11
-rw-r--r--Documentation/power/swsusp.txt5
-rw-r--r--MAINTAINERS27
-rw-r--r--arch/alpha/Kconfig13
-rw-r--r--arch/alpha/kernel/alpha_ksyms.c1
-rw-r--r--arch/alpha/kernel/irq.c630
-rw-r--r--arch/cris/arch-v10/kernel/kgdb.c6
-rw-r--r--arch/frv/kernel/Makefile1
-rw-r--r--arch/frv/kernel/entry.S2
-rw-r--r--arch/frv/kernel/futex.c242
-rw-r--r--arch/frv/kernel/signal.c155
-rw-r--r--arch/i386/Kconfig8
-rw-r--r--arch/i386/Kconfig.cpu14
-rw-r--r--arch/i386/Kconfig.debug10
-rw-r--r--arch/i386/kernel/apic.c2
-rw-r--r--arch/i386/kernel/apm.c97
-rw-r--r--arch/i386/kernel/cpu/amd.c7
-rw-r--r--arch/i386/kernel/cpu/common.c8
-rw-r--r--arch/i386/kernel/cpu/cyrix.c27
-rw-r--r--arch/i386/kernel/cpuid.c3
-rw-r--r--arch/i386/kernel/entry.S1
-rw-r--r--arch/i386/kernel/head.S27
-rw-r--r--arch/i386/kernel/i386_ksyms.c3
-rw-r--r--arch/i386/kernel/io_apic.c4
-rw-r--r--arch/i386/kernel/mpparse.c26
-rw-r--r--arch/i386/kernel/msr.c3
-rw-r--r--arch/i386/kernel/process.c16
-rw-r--r--arch/i386/kernel/ptrace.c9
-rw-r--r--arch/i386/kernel/reboot.c6
-rw-r--r--arch/i386/kernel/setup.c8
-rw-r--r--arch/i386/kernel/smpboot.c6
-rw-r--r--arch/i386/kernel/syscall_table.S1
-rw-r--r--arch/i386/kernel/timers/timer_tsc.c2
-rw-r--r--arch/i386/kernel/traps.c33
-rw-r--r--arch/i386/mm/init.c24
-rw-r--r--arch/i386/mm/pageattr.c27
-rw-r--r--arch/i386/pci/irq.c2
-rw-r--r--arch/m32r/Kconfig26
-rw-r--r--arch/m32r/boot/compressed/head.S5
-rw-r--r--arch/m32r/boot/setup.S24
-rw-r--r--arch/m32r/kernel/Makefile1
-rw-r--r--arch/m32r/kernel/entry.S19
-rw-r--r--arch/m32r/kernel/io_m32104ut.c298
-rw-r--r--arch/m32r/kernel/io_m32700ut.c24
-rw-r--r--arch/m32r/kernel/io_mappi.c2
-rw-r--r--arch/m32r/kernel/io_mappi2.c24
-rw-r--r--arch/m32r/kernel/io_mappi3.c51
-rw-r--r--arch/m32r/kernel/io_oaks32r.c2
-rw-r--r--arch/m32r/kernel/io_opsput.c6
-rw-r--r--arch/m32r/kernel/setup.c7
-rw-r--r--arch/m32r/kernel/setup_m32104ut.c156
-rw-r--r--arch/m32r/kernel/setup_m32700ut.c8
-rw-r--r--arch/m32r/kernel/setup_mappi.c6
-rw-r--r--arch/m32r/kernel/setup_mappi2.c6
-rw-r--r--arch/m32r/kernel/setup_mappi3.c6
-rw-r--r--arch/m32r/kernel/setup_oaks32r.c6
-rw-r--r--arch/m32r/kernel/setup_opsput.c8
-rw-r--r--arch/m32r/kernel/setup_usrv.c6
-rw-r--r--arch/m32r/kernel/time.c4
-rw-r--r--arch/m32r/m32104ut/defconfig.m32104ut657
-rw-r--r--arch/m32r/mm/cache.c36
-rw-r--r--arch/m68knommu/kernel/m68k_ksyms.c2
-rw-r--r--arch/m68knommu/kernel/setup.c2
-rw-r--r--arch/ppc/boot/simple/Makefile2
-rw-r--r--arch/ppc/kernel/idle.c4
-rw-r--r--arch/ppc/platforms/4xx/ibm440gx.c2
-rw-r--r--arch/ppc/platforms/4xx/ibm440sp.c1
-rw-r--r--arch/ppc/platforms/lite5200.c2
-rw-r--r--arch/ppc/platforms/mpc5200.c53
-rw-r--r--arch/ppc/syslib/mpc52xx_pci.c95
-rw-r--r--arch/ppc/syslib/mpc52xx_setup.c6
-rw-r--r--arch/s390/Kconfig34
-rw-r--r--arch/s390/Makefile6
-rw-r--r--arch/s390/appldata/appldata_base.c8
-rw-r--r--arch/s390/appldata/appldata_os.c14
-rw-r--r--arch/s390/crypto/Makefile8
-rw-r--r--arch/s390/crypto/aes_s390.c248
-rw-r--r--arch/s390/crypto/crypt_s390.h (renamed from arch/s390/crypto/crypt_z990.h)267
-rw-r--r--arch/s390/crypto/crypt_s390_query.c129
-rw-r--r--arch/s390/crypto/crypt_z990_query.c111
-rw-r--r--arch/s390/crypto/des_s390.c (renamed from arch/s390/crypto/des_z990.c)54
-rw-r--r--arch/s390/crypto/sha1_s390.c (renamed from arch/s390/crypto/sha1_z990.c)32
-rw-r--r--arch/s390/crypto/sha256_s390.c151
-rw-r--r--arch/s390/defconfig65
-rw-r--r--arch/s390/kernel/Makefile15
-rw-r--r--arch/s390/kernel/compat_linux.c2
-rw-r--r--arch/s390/kernel/compat_signal.c2
-rw-r--r--arch/s390/kernel/cpcmd.c16
-rw-r--r--arch/s390/kernel/entry64.S18
-rw-r--r--arch/s390/kernel/head.S4
-rw-r--r--arch/s390/kernel/machine_kexec.c2
-rw-r--r--arch/s390/kernel/module.c12
-rw-r--r--arch/s390/kernel/process.c12
-rw-r--r--arch/s390/kernel/ptrace.c24
-rw-r--r--arch/s390/kernel/reipl_diag.c2
-rw-r--r--arch/s390/kernel/setup.c14
-rw-r--r--arch/s390/kernel/signal.c8
-rw-r--r--arch/s390/kernel/smp.c14
-rw-r--r--arch/s390/kernel/sys_s390.c12
-rw-r--r--arch/s390/kernel/traps.c10
-rw-r--r--arch/s390/kernel/vmlinux.lds.S2
-rw-r--r--arch/s390/lib/Makefile5
-rw-r--r--arch/s390/lib/spinlock.c2
-rw-r--r--arch/s390/mm/extmem.c2
-rw-r--r--arch/s390/mm/fault.c18
-rw-r--r--arch/s390/mm/init.c8
-rw-r--r--arch/s390/mm/mmap.c2
-rw-r--r--arch/s390/oprofile/Makefile2
-rw-r--r--arch/s390/oprofile/backtrace.c79
-rw-r--r--arch/s390/oprofile/init.c4
-rw-r--r--arch/um/drivers/chan_kern.c273
-rw-r--r--arch/um/drivers/line.c298
-rw-r--r--arch/um/drivers/mconsole_kern.c232
-rw-r--r--arch/um/drivers/mconsole_user.c12
-rw-r--r--arch/um/drivers/net_kern.c8
-rw-r--r--arch/um/drivers/ssl.c47
-rw-r--r--arch/um/drivers/stdio_console.c33
-rw-r--r--arch/um/drivers/ubd_kern.c15
-rw-r--r--arch/um/include/chan_kern.h25
-rw-r--r--arch/um/include/choose-mode.h3
-rw-r--r--arch/um/include/irq_user.h13
-rw-r--r--arch/um/include/kern.h13
-rw-r--r--arch/um/include/line.h37
-rw-r--r--arch/um/include/mconsole.h8
-rw-r--r--arch/um/include/os.h17
-rw-r--r--arch/um/include/user_util.h1
-rw-r--r--arch/um/kernel/Makefile6
-rw-r--r--arch/um/kernel/irq_user.c48
-rw-r--r--arch/um/kernel/process_kern.c4
-rw-r--r--arch/um/kernel/sigio_user.c2
-rw-r--r--arch/um/kernel/um_arch.c4
-rw-r--r--arch/um/kernel/umid.c323
-rw-r--r--arch/um/os-Linux/Makefile4
-rw-r--r--arch/um/os-Linux/aio.c467
-rw-r--r--arch/um/os-Linux/umid.c335
-rw-r--r--arch/x86_64/Kconfig.debug10
-rw-r--r--arch/x86_64/ia32/ia32entry.S2
-rw-r--r--arch/x86_64/kernel/process.c5
-rw-r--r--arch/x86_64/kernel/syscall.c2
-rw-r--r--arch/x86_64/mm/init.c23
-rw-r--r--arch/x86_64/mm/pageattr.c9
-rw-r--r--block/Kconfig2
-rw-r--r--block/as-iosched.c144
-rw-r--r--block/cfq-iosched.c16
-rw-r--r--block/deadline-iosched.c8
-rw-r--r--block/elevator.c86
-rw-r--r--block/ll_rw_blk.c536
-rw-r--r--block/scsi_ioctl.c2
-rw-r--r--crypto/Kconfig43
-rw-r--r--crypto/tcrypt.c4
-rw-r--r--crypto/tcrypt.h64
-rw-r--r--drivers/base/memory.c7
-rw-r--r--drivers/block/DAC960.c2
-rw-r--r--drivers/block/Kconfig2
-rw-r--r--drivers/block/cciss.c2
-rw-r--r--drivers/block/cpqarray.c2
-rw-r--r--drivers/block/floppy.c2
-rw-r--r--drivers/block/loop.c23
-rw-r--r--drivers/block/nbd.c124
-rw-r--r--drivers/block/paride/Kconfig5
-rw-r--r--drivers/block/rd.c4
-rw-r--r--drivers/block/sx8.c2
-rw-r--r--drivers/block/ub.c2
-rw-r--r--drivers/block/viodasd.c2
-rw-r--r--drivers/cdrom/cdu31a.c2
-rw-r--r--drivers/char/Kconfig2
-rw-r--r--drivers/char/hangcheck-timer.c2
-rw-r--r--drivers/char/hw_random.c70
-rw-r--r--drivers/char/ipmi/ipmi_msghandler.c4
-rw-r--r--drivers/char/watchdog/Kconfig2
-rw-r--r--drivers/ide/ide-cd.c4
-rw-r--r--drivers/ide/ide-disk.c137
-rw-r--r--drivers/ide/ide-io.c11
-rw-r--r--drivers/ieee1394/Kconfig23
-rw-r--r--drivers/ieee1394/Makefile2
-rw-r--r--drivers/ieee1394/csr1212.c21
-rw-r--r--drivers/ieee1394/csr1212.h2
-rw-r--r--drivers/ieee1394/dma.c73
-rw-r--r--drivers/ieee1394/dv1394.c13
-rw-r--r--drivers/ieee1394/eth1394.c20
-rw-r--r--drivers/ieee1394/highlevel.c18
-rw-r--r--drivers/ieee1394/hosts.c30
-rw-r--r--drivers/ieee1394/hosts.h162
-rw-r--r--drivers/ieee1394/ieee1394-ioctl.h8
-rw-r--r--drivers/ieee1394/ieee1394.h19
-rw-r--r--drivers/ieee1394/ieee1394_core.c827
-rw-r--r--drivers/ieee1394/ieee1394_core.h100
-rw-r--r--drivers/ieee1394/ieee1394_transactions.c389
-rw-r--r--drivers/ieee1394/iso.c102
-rw-r--r--drivers/ieee1394/nodemgr.c50
-rw-r--r--drivers/ieee1394/nodemgr.h18
-rw-r--r--drivers/ieee1394/ohci1394.c43
-rw-r--r--drivers/ieee1394/ohci1394.h4
-rw-r--r--drivers/ieee1394/pcilynx.c2
-rw-r--r--drivers/ieee1394/raw1394.c79
-rw-r--r--drivers/ieee1394/sbp2.c1040
-rw-r--r--drivers/ieee1394/sbp2.h70
-rw-r--r--drivers/ieee1394/video1394.c106
-rw-r--r--drivers/input/evdev.c2
-rw-r--r--drivers/macintosh/therm_adt746x.c39
-rw-r--r--drivers/macintosh/therm_pm72.c7
-rw-r--r--drivers/macintosh/windfarm_lm75_sensor.c7
-rw-r--r--drivers/md/bitmap.c114
-rw-r--r--drivers/md/dm-crypt.c5
-rw-r--r--drivers/md/dm-io.h3
-rw-r--r--drivers/md/dm-ioctl.c21
-rw-r--r--drivers/md/dm-log.c2
-rw-r--r--drivers/md/dm-raid1.c13
-rw-r--r--drivers/md/dm-snap.c25
-rw-r--r--drivers/md/dm.c95
-rw-r--r--drivers/md/dm.h5
-rw-r--r--drivers/md/faulty.c9
-rw-r--r--drivers/md/kcopyd.c3
-rw-r--r--drivers/md/linear.c14
-rw-r--r--drivers/md/md.c893
-rw-r--r--drivers/md/multipath.c22
-rw-r--r--drivers/md/raid0.c26
-rw-r--r--drivers/md/raid1.c726
-rw-r--r--drivers/md/raid10.c544
-rw-r--r--drivers/md/raid5.c174
-rw-r--r--drivers/md/raid6main.c348
-rw-r--r--drivers/media/video/cpia_pp.c30
-rw-r--r--drivers/message/i2o/Kconfig12
-rw-r--r--drivers/message/i2o/bus-osm.c23
-rw-r--r--drivers/message/i2o/config-osm.c2
-rw-r--r--drivers/message/i2o/core.h20
-rw-r--r--drivers/message/i2o/device.c339
-rw-r--r--drivers/message/i2o/driver.c12
-rw-r--r--drivers/message/i2o/exec-osm.c114
-rw-r--r--drivers/message/i2o/i2o_block.c190
-rw-r--r--drivers/message/i2o/i2o_config.c196
-rw-r--r--drivers/message/i2o/i2o_lan.h38
-rw-r--r--drivers/message/i2o/i2o_proc.c2
-rw-r--r--drivers/message/i2o/i2o_scsi.c89
-rw-r--r--drivers/message/i2o/iop.c356
-rw-r--r--drivers/message/i2o/pci.c7
-rw-r--r--drivers/mmc/mmc_block.c4
-rw-r--r--drivers/net/phy/Kconfig2
-rw-r--r--drivers/net/plip.c2
-rw-r--r--drivers/parport/Kconfig2
-rw-r--r--drivers/parport/daisy.c51
-rw-r--r--drivers/parport/ieee1284_ops.c62
-rw-r--r--drivers/parport/parport_pc.c30
-rw-r--r--drivers/parport/probe.c199
-rw-r--r--drivers/parport/share.c1
-rw-r--r--drivers/pnp/pnpbios/bioscalls.c45
-rw-r--r--drivers/s390/Makefile2
-rw-r--r--drivers/s390/block/Kconfig8
-rw-r--r--drivers/s390/block/dasd.c34
-rw-r--r--drivers/s390/block/dasd_diag.c11
-rw-r--r--drivers/s390/block/dasd_diag.h31
-rw-r--r--drivers/s390/block/dasd_eckd.c9
-rw-r--r--drivers/s390/block/dasd_fba.c6
-rw-r--r--drivers/s390/block/dasd_int.h3
-rw-r--r--drivers/s390/block/dasd_ioctl.c5
-rw-r--r--drivers/s390/block/dcssblk.c2
-rw-r--r--drivers/s390/block/xpram.c4
-rw-r--r--drivers/s390/char/sclp_cpi.c2
-rw-r--r--drivers/s390/char/sclp_quiesce.c2
-rw-r--r--drivers/s390/char/tape_block.c4
-rw-r--r--drivers/s390/char/vmwatchdog.c2
-rw-r--r--drivers/s390/cio/blacklist.c234
-rw-r--r--drivers/s390/cio/blacklist.h2
-rw-r--r--drivers/s390/cio/ccwgroup.c6
-rw-r--r--drivers/s390/cio/chsc.c473
-rw-r--r--drivers/s390/cio/chsc.h13
-rw-r--r--drivers/s390/cio/cio.c168
-rw-r--r--drivers/s390/cio/cio.h11
-rw-r--r--drivers/s390/cio/cmf.c8
-rw-r--r--drivers/s390/cio/css.c297
-rw-r--r--drivers/s390/cio/css.h43
-rw-r--r--drivers/s390/cio/device.c47
-rw-r--r--drivers/s390/cio/device.h1
-rw-r--r--drivers/s390/cio/device_fsm.c29
-rw-r--r--drivers/s390/cio/device_id.c26
-rw-r--r--drivers/s390/cio/device_ops.c4
-rw-r--r--drivers/s390/cio/device_pgid.c56
-rw-r--r--drivers/s390/cio/device_status.c14
-rw-r--r--drivers/s390/cio/ioasm.h86
-rw-r--r--drivers/s390/cio/qdio.c713
-rw-r--r--drivers/s390/cio/qdio.h144
-rw-r--r--drivers/s390/cio/schid.h26
-rw-r--r--drivers/s390/crypto/z90common.h9
-rw-r--r--drivers/s390/crypto/z90crypt.h13
-rw-r--r--drivers/s390/crypto/z90hardware.c309
-rw-r--r--drivers/s390/crypto/z90main.c111
-rw-r--r--drivers/s390/net/Kconfig2
-rw-r--r--drivers/s390/net/claw.c6
-rw-r--r--drivers/s390/net/cu3088.c3
-rw-r--r--drivers/s390/net/iucv.c10
-rw-r--r--drivers/s390/net/qeth_main.c21
-rw-r--r--drivers/s390/s390_rdev.c53
-rw-r--r--drivers/s390/s390mach.c66
-rw-r--r--drivers/s390/sysinfo.c2
-rw-r--r--drivers/scsi/Kconfig10
-rw-r--r--drivers/scsi/ahci.c1
-rw-r--r--drivers/scsi/ata_piix.c5
-rw-r--r--drivers/scsi/hosts.c9
-rw-r--r--drivers/scsi/ide-scsi.c4
-rw-r--r--drivers/scsi/libata-core.c145
-rw-r--r--drivers/scsi/libata-scsi.c48
-rw-r--r--drivers/scsi/libata.h4
-rw-r--r--drivers/scsi/sata_mv.c1
-rw-r--r--drivers/scsi/sata_nv.c1
-rw-r--r--drivers/scsi/sata_promise.c1
-rw-r--r--drivers/scsi/sata_sil.c1
-rw-r--r--drivers/scsi/sata_sil24.c1
-rw-r--r--drivers/scsi/sata_sis.c1
-rw-r--r--drivers/scsi/sata_svw.c1
-rw-r--r--drivers/scsi/sata_sx4.c1
-rw-r--r--drivers/scsi/sata_uli.c1
-rw-r--r--drivers/scsi/sata_via.c1
-rw-r--r--drivers/scsi/sata_vsc.c1
-rw-r--r--drivers/scsi/scsi_lib.c50
-rw-r--r--drivers/scsi/scsi_sysfs.c31
-rw-r--r--drivers/scsi/sd.c85
-rw-r--r--drivers/serial/mpc52xx_uart.c28
-rw-r--r--fs/Kconfig66
-rw-r--r--fs/Makefile2
-rw-r--r--fs/bio.c26
-rw-r--r--fs/configfs/Makefile7
-rw-r--r--fs/configfs/configfs_internal.h142
-rw-r--r--fs/configfs/dir.c1102
-rw-r--r--fs/configfs/file.c360
-rw-r--r--fs/configfs/inode.c162
-rw-r--r--fs/configfs/item.c227
-rw-r--r--fs/configfs/mount.c159
-rw-r--r--fs/configfs/symlink.c281
-rw-r--r--fs/exec.c2
-rw-r--r--fs/fuse/dev.c72
-rw-r--r--fs/fuse/dir.c278
-rw-r--r--fs/fuse/file.c49
-rw-r--r--fs/fuse/fuse_i.h12
-rw-r--r--fs/fuse/inode.c14
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/jbd/checkpoint.c418
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/nfsd/nfs3proc.c11
-rw-r--r--fs/nfsd/nfs3xdr.c47
-rw-r--r--fs/nfsd/nfsxdr.c48
-rw-r--r--fs/nfsd/vfs.c40
-rw-r--r--fs/ocfs2/Makefile33
-rw-r--r--fs/ocfs2/alloc.c2040
-rw-r--r--fs/ocfs2/alloc.h82
-rw-r--r--fs/ocfs2/aops.c643
-rw-r--r--fs/ocfs2/aops.h41
-rw-r--r--fs/ocfs2/buffer_head_io.c232
-rw-r--r--fs/ocfs2/buffer_head_io.h73
-rw-r--r--fs/ocfs2/cluster/Makefile4
-rw-r--r--fs/ocfs2/cluster/endian.h30
-rw-r--r--fs/ocfs2/cluster/heartbeat.c1797
-rw-r--r--fs/ocfs2/cluster/heartbeat.h82
-rw-r--r--fs/ocfs2/cluster/masklog.c166
-rw-r--r--fs/ocfs2/cluster/masklog.h275
-rw-r--r--fs/ocfs2/cluster/nodemanager.c791
-rw-r--r--fs/ocfs2/cluster/nodemanager.h64
-rw-r--r--fs/ocfs2/cluster/ocfs2_heartbeat.h37
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h39
-rw-r--r--fs/ocfs2/cluster/quorum.c315
-rw-r--r--fs/ocfs2/cluster/quorum.h36
-rw-r--r--fs/ocfs2/cluster/sys.c124
-rw-r--r--fs/ocfs2/cluster/sys.h33
-rw-r--r--fs/ocfs2/cluster/tcp.c1829
-rw-r--r--fs/ocfs2/cluster/tcp.h113
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h174
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/cluster/ver.h31
-rw-r--r--fs/ocfs2/dcache.c91
-rw-r--r--fs/ocfs2/dcache.h31
-rw-r--r--fs/ocfs2/dir.c618
-rw-r--r--fs/ocfs2/dir.h54
-rw-r--r--fs/ocfs2/dlm/Makefile8
-rw-r--r--fs/ocfs2/dlm/dlmapi.h214
-rw-r--r--fs/ocfs2/dlm/dlmast.c466
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h884
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c530
-rw-r--r--fs/ocfs2/dlm/dlmconvert.h35
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c246
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h30
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1469
-rw-r--r--fs/ocfs2/dlm/dlmdomain.h36
-rw-r--r--fs/ocfs2/dlm/dlmfs.c640
-rw-r--r--fs/ocfs2/dlm/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlm/dlmfsver.h31
-rw-r--r--fs/ocfs2/dlm/dlmlock.c676
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2664
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2132
-rw-r--r--fs/ocfs2/dlm/dlmthread.c692
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c672
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlm/userdlm.c658
-rw-r--r--fs/ocfs2/dlm/userdlm.h111
-rw-r--r--fs/ocfs2/dlmglue.c2904
-rw-r--r--fs/ocfs2/dlmglue.h111
-rw-r--r--fs/ocfs2/endian.h45
-rw-r--r--fs/ocfs2/export.c248
-rw-r--r--fs/ocfs2/export.h31
-rw-r--r--fs/ocfs2/extent_map.c994
-rw-r--r--fs/ocfs2/extent_map.h46
-rw-r--r--fs/ocfs2/file.c1237
-rw-r--r--fs/ocfs2/file.h57
-rw-r--r--fs/ocfs2/heartbeat.c378
-rw-r--r--fs/ocfs2/heartbeat.h67
-rw-r--r--fs/ocfs2/inode.c1140
-rw-r--r--fs/ocfs2/inode.h145
-rw-r--r--fs/ocfs2/journal.c1652
-rw-r--r--fs/ocfs2/journal.h457
-rw-r--r--fs/ocfs2/localalloc.c983
-rw-r--r--fs/ocfs2/localalloc.h56
-rw-r--r--fs/ocfs2/mmap.c102
-rw-r--r--fs/ocfs2/mmap.h6
-rw-r--r--fs/ocfs2/namei.c2264
-rw-r--r--fs/ocfs2/namei.h58
-rw-r--r--fs/ocfs2/ocfs1_fs_compat.h109
-rw-r--r--fs/ocfs2/ocfs2.h464
-rw-r--r--fs/ocfs2/ocfs2_fs.h638
-rw-r--r--fs/ocfs2/ocfs2_lockid.h73
-rw-r--r--fs/ocfs2/slot_map.c303
-rw-r--r--fs/ocfs2/slot_map.h66
-rw-r--r--fs/ocfs2/suballoc.c1651
-rw-r--r--fs/ocfs2/suballoc.h132
-rw-r--r--fs/ocfs2/super.c1733
-rw-r--r--fs/ocfs2/super.h44
-rw-r--r--fs/ocfs2/symlink.c180
-rw-r--r--fs/ocfs2/symlink.h42
-rw-r--r--fs/ocfs2/sysfile.c131
-rw-r--r--fs/ocfs2/sysfile.h33
-rw-r--r--fs/ocfs2/uptodate.c544
-rw-r--r--fs/ocfs2/uptodate.h44
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/vote.c1202
-rw-r--r--fs/ocfs2/vote.h56
-rw-r--r--fs/partitions/Kconfig2
-rw-r--r--fs/partitions/ibm.c30
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/ramfs/Makefile4
-rw-r--r--fs/ramfs/file-mmu.c57
-rw-r--r--fs/ramfs/file-nommu.c292
-rw-r--r--fs/ramfs/inode.c22
-rw-r--r--fs/ramfs/internal.h15
-rw-r--r--include/asm-alpha/atomic.h1
-rw-r--r--include/asm-alpha/dma-mapping.h2
-rw-r--r--include/asm-alpha/hardirq.h2
-rw-r--r--include/asm-alpha/mman.h1
-rw-r--r--include/asm-arm/atomic.h1
-rw-r--r--include/asm-arm/mman.h1
-rw-r--r--include/asm-arm26/atomic.h1
-rw-r--r--include/asm-arm26/mman.h1
-rw-r--r--include/asm-cris/atomic.h1
-rw-r--r--include/asm-cris/mman.h1
-rw-r--r--include/asm-frv/atomic.h1
-rw-r--r--include/asm-frv/futex.h42
-rw-r--r--include/asm-frv/mman.h1
-rw-r--r--include/asm-frv/signal.h1
-rw-r--r--include/asm-generic/atomic.h116
-rw-r--r--include/asm-generic/vmlinux.lds.h4
-rw-r--r--include/asm-h8300/atomic.h1
-rw-r--r--include/asm-h8300/irq.h5
-rw-r--r--include/asm-h8300/mman.h1
-rw-r--r--include/asm-i386/atomic.h1
-rw-r--r--include/asm-i386/bitops.h26
-rw-r--r--include/asm-i386/bugs.h23
-rw-r--r--include/asm-i386/cacheflush.h4
-rw-r--r--include/asm-i386/desc.h8
-rw-r--r--include/asm-i386/mach-bigsmp/mach_apic.h79
-rw-r--r--include/asm-i386/mach-bigsmp/mach_apicdef.h4
-rw-r--r--include/asm-i386/mman.h1
-rw-r--r--include/asm-i386/mmzone.h5
-rw-r--r--include/asm-i386/module.h4
-rw-r--r--include/asm-i386/mpspec_def.h2
-rw-r--r--include/asm-i386/segment.h14
-rw-r--r--include/asm-i386/system.h31
-rw-r--r--include/asm-i386/unistd.h2
-rw-r--r--include/asm-ia64/atomic.h1
-rw-r--r--include/asm-ia64/mman.h1
-rw-r--r--include/asm-ia64/unistd.h2
-rw-r--r--include/asm-m32r/assembler.h10
-rw-r--r--include/asm-m32r/atomic.h1
-rw-r--r--include/asm-m32r/cacheflush.h2
-rw-r--r--include/asm-m32r/irq.h16
-rw-r--r--include/asm-m32r/m32102.h38
-rw-r--r--include/asm-m32r/m32104ut/m32104ut_pld.h163
-rw-r--r--include/asm-m32r/m32r.h8
-rw-r--r--include/asm-m32r/mman.h1
-rw-r--r--include/asm-m32r/system.h12
-rw-r--r--include/asm-m32r/unistd.h12
-rw-r--r--include/asm-m68k/atomic.h1
-rw-r--r--include/asm-m68k/irq.h2
-rw-r--r--include/asm-m68k/mman.h1
-rw-r--r--include/asm-m68knommu/atomic.h1
-rw-r--r--include/asm-m68knommu/irq.h6
-rw-r--r--include/asm-mips/atomic.h1
-rw-r--r--include/asm-mips/mman.h1
-rw-r--r--include/asm-mips/riscos-syscall.h979
-rw-r--r--include/asm-parisc/atomic.h1
-rw-r--r--include/asm-parisc/mman.h1
-rw-r--r--include/asm-powerpc/atomic.h1
-rw-r--r--include/asm-powerpc/mman.h1
-rw-r--r--include/asm-ppc/ibm_ocp.h1
-rw-r--r--include/asm-ppc/io.h2
-rw-r--r--include/asm-ppc/mpc52xx.h13
-rw-r--r--include/asm-s390/atomic.h174
-rw-r--r--include/asm-s390/ccwdev.h3
-rw-r--r--include/asm-s390/mman.h1
-rw-r--r--include/asm-s390/qdio.h8
-rw-r--r--include/asm-s390/s390_rdev.h15
-rw-r--r--include/asm-s390/uaccess.h14
-rw-r--r--include/asm-s390/unistd.h2
-rw-r--r--include/asm-s390/vtoc.h24
-rw-r--r--include/asm-sh/atomic.h1
-rw-r--r--include/asm-sh/mman.h1
-rw-r--r--include/asm-sh64/atomic.h1
-rw-r--r--include/asm-sparc/atomic.h1
-rw-r--r--include/asm-sparc/mman.h1
-rw-r--r--include/asm-sparc64/atomic.h1
-rw-r--r--include/asm-sparc64/mman.h1
-rw-r--r--include/asm-v850/atomic.h1
-rw-r--r--include/asm-v850/mman.h1
-rw-r--r--include/asm-x86_64/atomic.h1
-rw-r--r--include/asm-x86_64/cacheflush.h4
-rw-r--r--include/asm-x86_64/mman.h1
-rw-r--r--include/asm-x86_64/mpspec.h2
-rw-r--r--include/asm-x86_64/pgtable.h2
-rw-r--r--include/asm-x86_64/system.h2
-rw-r--r--include/asm-xtensa/atomic.h1
-rw-r--r--include/asm-xtensa/mman.h1
-rw-r--r--include/keys/user-type.h1
-rw-r--r--include/linux/ata.h8
-rw-r--r--include/linux/blkdev.h91
-rw-r--r--include/linux/bootmem.h46
-rw-r--r--include/linux/configfs.h205
-rw-r--r--include/linux/dm-ioctl.h11
-rw-r--r--include/linux/elevator.h1
-rw-r--r--include/linux/fs.h32
-rw-r--r--include/linux/fuse.h24
-rw-r--r--include/linux/hugetlb.h4
-rw-r--r--include/linux/i2o.h976
-rw-r--r--include/linux/irq.h13
-rw-r--r--include/linux/jbd.h8
-rw-r--r--include/linux/key.h8
-rw-r--r--include/linux/libata.h11
-rw-r--r--include/linux/mempolicy.h38
-rw-r--r--include/linux/mm.h55
-rw-r--r--include/linux/mmzone.h44
-rw-r--r--include/linux/nbd.h8
-rw-r--r--include/linux/nfsd/xdr.h3
-rw-r--r--include/linux/nfsd/xdr3.h1
-rw-r--r--include/linux/page-flags.h91
-rw-r--r--include/linux/parport.h1
-rw-r--r--include/linux/parport_pc.h4
-rw-r--r--include/linux/pci_ids.h10
-rw-r--r--include/linux/raid/md.h4
-rw-r--r--include/linux/raid/md_k.h80
-rw-r--r--include/linux/raid/raid1.h14
-rw-r--r--include/linux/raid/raid10.h22
-rw-r--r--include/linux/raid/raid5.h7
-rw-r--r--include/linux/ramfs.h10
-rw-r--r--include/linux/rmap.h1
-rw-r--r--include/linux/sched.h25
-rw-r--r--include/linux/suspend.h8
-rw-r--r--include/linux/swap.h2
-rw-r--r--include/linux/writeback.h6
-rw-r--r--include/scsi/scsi_driver.h1
-rw-r--r--include/scsi/scsi_host.h7
-rw-r--r--init/Kconfig3
-rw-r--r--init/do_mounts_md.c22
-rw-r--r--init/do_mounts_rd.c4
-rw-r--r--init/main.c5
-rw-r--r--ipc/shm.c18
-rw-r--r--kernel/acct.c16
-rw-r--r--kernel/futex.c7
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/module.c3
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/power/disk.c92
-rw-r--r--kernel/power/power.h24
-rw-r--r--kernel/power/snapshot.c89
-rw-r--r--kernel/power/swsusp.c1020
-rw-r--r--kernel/sysctl.c6
-rw-r--r--lib/Kconfig.debug5
-rw-r--r--lib/swiotlb.c3
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/bootmem.c58
-rw-r--r--mm/filemap.c78
-rw-r--r--mm/hugetlb.c192
-rw-r--r--mm/internal.h21
-rw-r--r--mm/madvise.c35
-rw-r--r--mm/memory.c32
-rw-r--r--mm/memory_hotplug.c1
-rw-r--r--mm/mempolicy.c102
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/page_alloc.c343
-rw-r--r--mm/readahead.c15
-rw-r--r--mm/rmap.c57
-rw-r--r--mm/shmem.c36
-rw-r--r--mm/swap.c27
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c20
-rw-r--r--mm/tiny-shmem.c29
-rw-r--r--mm/truncate.c44
-rw-r--r--mm/vmscan.c125
-rw-r--r--net/sunrpc/svcauth_unix.c14
-rw-r--r--net/sunrpc/svcsock.c2
-rw-r--r--security/keys/internal.h1
-rw-r--r--security/keys/key.c58
-rw-r--r--security/keys/keyring.c66
-rw-r--r--security/keys/user_defined.c33
-rw-r--r--security/selinux/selinuxfs.c4
-rw-r--r--security/selinux/ss/avtab.c2
-rw-r--r--security/selinux/ss/policydb.c2
-rw-r--r--sound/oss/ad1848.c92
-rw-r--r--sound/oss/cs4281/cs4281m.c21
-rw-r--r--sound/oss/cs4281/cs4281pm-24.c39
-rw-r--r--sound/oss/cs46xx.c60
-rw-r--r--sound/oss/cs46xxpm-24.h4
-rw-r--r--sound/oss/maestro.c149
-rw-r--r--sound/oss/nm256_audio.c47
-rw-r--r--sound/oss/opl3sa2.c110
631 files changed, 62650 insertions, 12111 deletions
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 303c57a7fad9..8e63831971d5 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -263,14 +263,8 @@ A flag in the bio structure, BIO_BARRIER is used to identify a barrier i/o.
 The generic i/o scheduler would make sure that it places the barrier request and
 all other requests coming after it after all the previous requests in the
 queue. Barriers may be implemented in different ways depending on the
-driver. A SCSI driver for example could make use of ordered tags to
-preserve the necessary ordering with a lower impact on throughput. For IDE
-this might be two sync cache flush: a pre and post flush when encountering
-a barrier write.
-
-There is a provision for queues to indicate what kind of barriers they
-can provide. This is as of yet unmerged, details will be added here once it
-is in the kernel.
+driver. For more details regarding I/O barriers, please read barrier.txt
+in this directory.
 
 1.2.2 Request Priority/Latency
 
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index cb13b963f7ae..9474501dd6cc 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -47,17 +47,6 @@ Who: Paul E. McKenney <paulmck@us.ibm.com>
 
 ---------------------------
 
-What:	IEEE1394 Audio and Music Data Transmission Protocol driver,
-	Connection Management Procedures driver
-When:	November 2005
-Files:	drivers/ieee1394/{amdtp,cmp}*
-Why:	These are incomplete, have never worked, and are better implemented
-	in userland via raw1394 (see http://freebob.sourceforge.net/ for
-	example.)
-Who:	Jody McIntyre <scjody@steamballoon.com>
-
----------------------------
-
 What:	raw1394: requests of type RAW1394_REQ_ISO_SEND, RAW1394_REQ_ISO_LISTEN
 When:	November 2005
 Why:	Deprecated in favour of the new ioctl-based rawiso interface, which is
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 7e17712f3229..74052d22d868 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -12,10 +12,14 @@ cifs.txt
  - description of the CIFS filesystem
 coda.txt
  - description of the CODA filesystem.
+configfs/
+ - directory containing configfs documentation and example code.
 cramfs.txt
  - info on the cram filesystem for small storage (ROMs etc)
 devfs/
  - directory containing devfs documentation.
+dlmfs.txt
+ - info on the userspace interface to the OCFS2 DLM.
 ext2.txt
  - info, mount options and specifications for the Ext2 filesystem.
 hpfs.txt
@@ -30,6 +34,8 @@ ntfs.txt
  - info and mount options for the NTFS filesystem (Windows NT).
 proc.txt
  - info on Linux's /proc filesystem.
+ocfs2.txt
+ - info and mount options for the OCFS2 clustered filesystem.
 romfs.txt
  - Description of the ROMFS filesystem.
 smbfs.txt
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
new file mode 100644
index 000000000000..c4ff96b7c4e0
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -0,0 +1,434 @@

configfs - Userspace-driven kernel object configuration.

Joel Becker <joel.becker@oracle.com>

Updated: 31 March 2005

Copyright (c) 2005 Oracle Corporation,
	Joel Becker <joel.becker@oracle.com>


[What is configfs?]

configfs is a ram-based filesystem that provides the converse of
sysfs's functionality. Where sysfs is a filesystem-based view of
kernel objects, configfs is a filesystem-based manager of kernel
objects, or config_items.

With sysfs, an object is created in kernel (for example, when a device
is discovered) and it is registered with sysfs. Its attributes then
appear in sysfs, allowing userspace to read the attributes via
readdir(3)/read(2). It may allow some attributes to be modified via
write(2). The important point is that the object is created and
destroyed in kernel, the kernel controls the lifecycle of the sysfs
representation, and sysfs is merely a window on all this.

A configfs config_item is created via an explicit userspace operation:
mkdir(2). It is destroyed via rmdir(2). The attributes appear at
mkdir(2) time, and can be read or modified via read(2) and write(2).
As with sysfs, readdir(3) queries the list of items and/or attributes.
symlink(2) can be used to group items together. Unlike sysfs, the
lifetime of the representation is completely driven by userspace. The
kernel modules backing the items must respond to this.

Both sysfs and configfs can and should exist together on the same
system. One is not a replacement for the other.

[Using configfs]

configfs can be compiled as a module or into the kernel. You can access
it by doing

	mount -t configfs none /config

The configfs tree will be empty unless client modules are also loaded.
These are modules that register their item types with configfs as
subsystems. Once a client subsystem is loaded, it will appear as a
subdirectory (or more than one) under /config. Like sysfs, the
configfs tree is always there, whether mounted on /config or not.

An item is created via mkdir(2). The item's attributes will also
appear at this time. readdir(3) can determine what the attributes are,
read(2) can query their default values, and write(2) can store new
values. Like sysfs, attributes should be ASCII text files, preferably
with only one value per file. The same efficiency caveats from sysfs
apply. Don't mix more than one attribute in one attribute file.

Like sysfs, configfs expects write(2) to store the entire buffer at
once. When writing to configfs attributes, userspace processes should
first read the entire file, modify the portions they wish to change, and
then write the entire buffer back. Attribute files have a maximum size
of one page (PAGE_SIZE, 4096 on i386).

When an item needs to be destroyed, remove it with rmdir(2). An
item cannot be destroyed if any other item has a link to it (via
symlink(2)). Links can be removed via unlink(2).

[Configuring FakeNBD: an Example]

Imagine there's a Network Block Device (NBD) driver that allows you to
access remote block devices. Call it FakeNBD. FakeNBD uses configfs
for its configuration. Obviously, there will be a nice program that
sysadmins use to configure FakeNBD, but somehow that program has to tell
the driver about it. Here's where configfs comes in.

When the FakeNBD driver is loaded, it registers itself with configfs.
readdir(3) sees this just fine:

	# ls /config
	fakenbd

A fakenbd connection can be created with mkdir(2). The name is
arbitrary, but likely the tool will make some use of the name. Perhaps
it is a uuid or a disk name:

	# mkdir /config/fakenbd/disk1
	# ls /config/fakenbd/disk1
	target	device	rw

The target attribute contains the IP address of the server FakeNBD will
connect to. The device attribute is the device on the server.
Predictably, the rw attribute determines whether the connection is
read-only or read-write.

	# echo 10.0.0.1 > /config/fakenbd/disk1/target
	# echo /dev/sda1 > /config/fakenbd/disk1/device
	# echo 1 > /config/fakenbd/disk1/rw

That's it. That's all there is. Now the device is configured, via the
shell no less.

[Coding With configfs]

Every object in configfs is a config_item. A config_item reflects an
object in the subsystem. It has attributes that match values on that
object. configfs handles the filesystem representation of that object
and its attributes, allowing the subsystem to ignore all but the
basic show/store interaction.

Items are created and destroyed inside a config_group. A group is a
collection of items that share the same attributes and operations.
Items are created by mkdir(2) and removed by rmdir(2), but configfs
handles that. The group has a set of operations to perform these tasks.

A subsystem is the top level of a client module. During initialization,
the client module registers the subsystem with configfs, and the
subsystem appears as a directory at the top of the configfs filesystem.
A subsystem is also a config_group, and can do everything a config_group
can.

[struct config_item]

	struct config_item {
		char                    *ci_name;
		char                    ci_namebuf[UOBJ_NAME_LEN];
		struct kref             ci_kref;
		struct list_head        ci_entry;
		struct config_item      *ci_parent;
		struct config_group     *ci_group;
		struct config_item_type *ci_type;
		struct dentry           *ci_dentry;
	};

	void config_item_init(struct config_item *);
	void config_item_init_type_name(struct config_item *,
					const char *name,
					struct config_item_type *type);
	struct config_item *config_item_get(struct config_item *);
	void config_item_put(struct config_item *);

Generally, struct config_item is embedded in a container structure, a
structure that actually represents what the subsystem is doing. The
config_item portion of that structure is how the object interacts with
configfs.

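As an illustration, here is a minimal sketch of such a container for
the hypothetical FakeNBD driver described above (the structure name and
fields are invented for this example):

	struct fakenbd_disk {
		struct config_item item;	/* what configfs sees */
		int rw;				/* driver state */
	};

	/* Recover the container from the config_item configfs hands us. */
	static inline struct fakenbd_disk *to_fakenbd_disk(struct config_item *item)
	{
		return item ? container_of(item, struct fakenbd_disk, item) : NULL;
	}
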
Whether statically defined in a source file or created by a parent
config_group, a config_item must have one of the _init() functions
called on it. This initializes the reference count and sets up the
appropriate fields.

All users of a config_item should have a reference on it via
config_item_get(), and drop the reference when they are done via
config_item_put().

By itself, a config_item cannot do much more than appear in configfs.
Usually a subsystem wants the item to display and/or store attributes,
among other things. For that, it needs a type.

[struct config_item_type]

	struct configfs_item_operations {
		void (*release)(struct config_item *);
		ssize_t (*show_attribute)(struct config_item *,
					  struct configfs_attribute *,
					  char *);
		ssize_t (*store_attribute)(struct config_item *,
					   struct configfs_attribute *,
					   const char *, size_t);
		int (*allow_link)(struct config_item *src,
				  struct config_item *target);
		int (*drop_link)(struct config_item *src,
				 struct config_item *target);
	};

	struct config_item_type {
		struct module                    *ct_owner;
		struct configfs_item_operations  *ct_item_ops;
		struct configfs_group_operations *ct_group_ops;
		struct configfs_attribute        **ct_attrs;
	};

The most basic function of a config_item_type is to define what
operations can be performed on a config_item. All items that have been
allocated dynamically will need to provide the ct_item_ops->release()
method. This method is called when the config_item's reference count
reaches zero. Items that wish to display an attribute need to provide
the ct_item_ops->show_attribute() method. Similarly, storing a new
attribute value uses the store_attribute() method.

[struct configfs_attribute]

	struct configfs_attribute {
		char                    *ca_name;
		struct module           *ca_owner;
		mode_t                  ca_mode;
	};

When a config_item wants an attribute to appear as a file in the item's
configfs directory, it must define a configfs_attribute describing it.
It then adds the attribute to the NULL-terminated array
config_item_type->ct_attrs. When the item appears in configfs, the
attribute file will appear with the configfs_attribute->ca_name
filename. configfs_attribute->ca_mode specifies the file permissions.

If an attribute is readable and the config_item provides a
ct_item_ops->show_attribute() method, that method will be called
whenever userspace asks for a read(2) on the attribute. The converse
will happen for write(2).

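Tying these structures together, a minimal sketch for the hypothetical
FakeNBD item might look like the following (names are invented; the
complete, compilable pattern appears in configfs_example.c, included
further below):

	static struct configfs_attribute fakenbd_attr_rw = {
		.ca_owner = THIS_MODULE,
		.ca_name = "rw",
		.ca_mode = S_IRUGO | S_IWUSR,
	};

	/* ct_attrs must be a NULL-terminated array. */
	static struct configfs_attribute *fakenbd_attrs[] = {
		&fakenbd_attr_rw,
		NULL,
	};

	static ssize_t fakenbd_show_attribute(struct config_item *item,
					      struct configfs_attribute *attr,
					      char *page)
	{
		/* One ASCII value, at most PAGE_SIZE bytes. */
		return sprintf(page, "%d\n", to_fakenbd_disk(item)->rw);
	}

	static struct configfs_item_operations fakenbd_item_ops = {
		.show_attribute = fakenbd_show_attribute,
	};

	static struct config_item_type fakenbd_type = {
		.ct_owner = THIS_MODULE,
		.ct_item_ops = &fakenbd_item_ops,
		.ct_attrs = fakenbd_attrs,
	};
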
[struct config_group]

A config_item cannot live in a vacuum. The only way one can be created
is via mkdir(2) on a config_group. This will trigger creation of a
child item.

	struct config_group {
		struct config_item        cg_item;
		struct list_head          cg_children;
		struct configfs_subsystem *cg_subsys;
		struct config_group       **default_groups;
	};

	void config_group_init(struct config_group *group);
	void config_group_init_type_name(struct config_group *group,
					 const char *name,
					 struct config_item_type *type);


The config_group structure contains a config_item. Properly configuring
that item means that a group can behave as an item in its own right.
However, it can do more: it can create child items or groups. This is
accomplished via the group operations specified on the group's
config_item_type.

	struct configfs_group_operations {
		struct config_item *(*make_item)(struct config_group *group,
						 const char *name);
		struct config_group *(*make_group)(struct config_group *group,
						   const char *name);
		int (*commit_item)(struct config_item *item);
		void (*drop_item)(struct config_group *group,
				  struct config_item *item);
	};

A group creates child items by providing the
ct_group_ops->make_item() method. If provided, this method is called
from mkdir(2) in the group's directory. The subsystem allocates a new
config_item (or more likely, its container structure), initializes it,
and returns it to configfs. Configfs will then populate the filesystem
tree to reflect the new item.

If the subsystem wants the child to be a group itself, the subsystem
provides ct_group_ops->make_group(). Everything else behaves the same,
using the group _init() functions on the group.

Finally, when userspace calls rmdir(2) on the item or group,
ct_group_ops->drop_item() is called. As a config_group is also a
config_item, a separate drop_group() method is not necessary.
The subsystem must config_item_put() the reference that was initialized
upon item allocation. If a subsystem has no work to do, it may omit
the ct_group_ops->drop_item() method, and configfs will call
config_item_put() on the item on behalf of the subsystem.

IMPORTANT: drop_item() is void, and as such cannot fail. When rmdir(2)
is called, configfs WILL remove the item from the filesystem tree
(assuming that it has no children to keep it busy). The subsystem is
responsible for responding to this. If the subsystem has references to
the item in other threads, the memory is safe. It may take some time
for the item to actually disappear from the subsystem's usage. But it
is gone from configfs.

A config_group cannot be removed while it still has child items. This
is implemented in the configfs rmdir(2) code. ->drop_item() will not be
called, as the item has not been dropped. rmdir(2) will fail, as the
directory is not empty.

[struct configfs_subsystem]

A subsystem must register itself, usually at module_init time. This
tells configfs to make the subsystem appear in the file tree.

	struct configfs_subsystem {
		struct config_group     su_group;
		struct semaphore        su_sem;
	};

	int configfs_register_subsystem(struct configfs_subsystem *subsys);
	void configfs_unregister_subsystem(struct configfs_subsystem *subsys);

	A subsystem consists of a toplevel config_group and a semaphore.
The group is where child config_items are created. For a subsystem,
this group is usually defined statically. Before calling
configfs_register_subsystem(), the subsystem must have initialized the
group via the usual group _init() functions, and it must also have
initialized the semaphore.
	When the register call returns, the subsystem is live, and it
will be visible via configfs. At that point, mkdir(2) can be called and
the subsystem must be ready for it.

[An Example]

The best example of these basic concepts is the simple_children
subsystem/group and the simple_child item in configfs_example.c. It
shows a trivial object displaying and storing an attribute, and a simple
group creating and destroying these children.

[Hierarchy Navigation and the Subsystem Semaphore]

There is an extra bonus that configfs provides. The config_groups and
config_items are arranged in a hierarchy due to the fact that they
appear in a filesystem. A subsystem is NEVER to touch the filesystem
parts, but the subsystem might be interested in this hierarchy. For
this reason, the hierarchy is mirrored via the config_group->cg_children
and config_item->ci_parent structure members.

A subsystem can navigate the cg_children list and the ci_parent pointer
to see the tree created by the subsystem. This can race with configfs'
management of the hierarchy, so configfs uses the subsystem semaphore to
protect modifications. Whenever a subsystem wants to navigate the
hierarchy, it must do so under the protection of the subsystem
semaphore.

A subsystem will be prevented from acquiring the semaphore while a newly
allocated item has not been linked into this hierarchy. Similarly, it
will not be able to acquire the semaphore while a dropping item has not
yet been unlinked. This means that an item's ci_parent pointer will
never be NULL while the item is in configfs, and that an item will only
be in its parent's cg_children list for the same duration. This allows
a subsystem to trust ci_parent and cg_children while they hold the
semaphore.

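For instance, a subsystem might walk its children under the semaphore
like this (a sketch in terms of the hypothetical FakeNBD subsystem;
down()/up() match the struct semaphore su_sem shown above):

	static int fakenbd_count_disks(struct configfs_subsystem *subsys)
	{
		struct config_item *child;
		int count = 0;

		/* Hold su_sem while touching cg_children/ci_entry. */
		down(&subsys->su_sem);
		list_for_each_entry(child, &subsys->su_group.cg_children,
				    ci_entry)
			count++;
		up(&subsys->su_sem);

		return count;
	}
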
[Item Aggregation Via symlink(2)]

configfs provides a simple group via the group->item parent/child
relationship. Often, however, a larger environment requires aggregation
outside of the parent/child connection. This is implemented via
symlink(2).

A config_item may provide the ct_item_ops->allow_link() and
ct_item_ops->drop_link() methods. If the ->allow_link() method exists,
symlink(2) may be called with the config_item as the source of the link.
These links are only allowed between configfs config_items. Any
symlink(2) attempt outside the configfs filesystem will be denied.

When symlink(2) is called, the source config_item's ->allow_link()
method is called with itself and a target item. If the source item
allows linking to the target item, it returns 0. A source item may wish
to reject a link if it only wants links to a certain type of object
(say, in its own subsystem).

When unlink(2) is called on the symbolic link, the source item is
notified via the ->drop_link() method. Like the ->drop_item() method,
this is a void function and cannot return failure. The subsystem is
responsible for responding to the change.

A config_item cannot be removed while it links to any other item, nor
can it be removed while an item links to it. Dangling symlinks are not
allowed in configfs.

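A sketch of these two methods, again for the hypothetical FakeNBD item
type defined earlier (note that the struct listing above declares
drop_link() with an int return, but its value is ignored):

	static int fakenbd_allow_link(struct config_item *src,
				      struct config_item *target)
	{
		/* Only accept links to other fakenbd items. */
		if (target->ci_type != &fakenbd_type)
			return -EPERM;
		return 0;
	}

	static int fakenbd_drop_link(struct config_item *src,
				     struct config_item *target)
	{
		/* The link goes away regardless; update driver state here. */
		return 0;
	}
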
[Automatically Created Subgroups]

A new config_group may want to have two types of child config_items.
While this could be codified by magic names in ->make_item(), it is much
more explicit to have a method whereby userspace sees this divergence.

Rather than have a group where some items behave differently than
others, configfs provides a method whereby one or many subgroups are
automatically created inside the parent at its creation. Thus,
mkdir("parent") results in "parent", "parent/subgroup1", up through
"parent/subgroupN". Items of type 1 can now be created in
"parent/subgroup1", and items of type N can be created in
"parent/subgroupN".

These automatic subgroups, or default groups, do not preclude other
children of the parent group. If ct_group_ops->make_group() exists,
other child groups can be created on the parent group directly.

A configfs subsystem specifies default groups by filling in the
NULL-terminated array default_groups on the config_group structure.
Each group in that array is populated in the configfs tree at the same
time as the parent group. Similarly, they are removed at the same time
as the parent. No extra notification is provided. When a ->drop_item()
method call notifies the subsystem that the parent group is going away,
that notification also covers every default group child of that parent
group.

As a consequence of this, default_groups cannot be removed directly via
rmdir(2). They also are not considered when rmdir(2) on the parent
group is checking for children.

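For illustration, the hypothetical FakeNBD parent group might set up
its default groups like this before the group is registered (names
invented; note that default_groups, like ct_attrs, is NULL-terminated):

	static struct config_group fakenbd_subgroup1;
	static struct config_group fakenbd_subgroup2;

	static struct config_group *fakenbd_def_groups[] = {
		&fakenbd_subgroup1,
		&fakenbd_subgroup2,
		NULL,
	};

	static void fakenbd_setup_default_groups(struct config_group *parent)
	{
		config_group_init_type_name(&fakenbd_subgroup1, "subgroup1",
					    &fakenbd_type);
		config_group_init_type_name(&fakenbd_subgroup2, "subgroup2",
					    &fakenbd_type);
		parent->default_groups = fakenbd_def_groups;
	}
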
[Committable Items]

NOTE: Committable items are currently unimplemented.

Some config_items cannot have a valid initial state. That is, no
default values can be specified for the item's attributes such that the
item can do its work. Userspace must configure one or more attributes,
after which the subsystem can start whatever entity this item
represents.

Consider the FakeNBD device from above. Without a target address *and*
a target device, the subsystem has no idea what block device to import.
The simple example assumes that the subsystem merely waits until all the
appropriate attributes are configured, and then connects. This will,
indeed, work, but now every attribute store must check if the attributes
are initialized. Every attribute store must fire off the connection if
that condition is met.

Far better would be an explicit action notifying the subsystem that the
config_item is ready to go. More importantly, an explicit action allows
the subsystem to provide feedback as to whether the attributes are
initialized in a way that makes sense. configfs provides this as
committable items.

configfs still uses only normal filesystem operations. An item is
committed via rename(2). The item is moved from a directory where it
can be modified to a directory where it cannot.

Any group that provides the ct_group_ops->commit_item() method has
committable items. When this group appears in configfs, mkdir(2) will
not work directly in the group. Instead, the group will have two
subdirectories: "live" and "pending". The "live" directory does not
support mkdir(2) or rmdir(2) either. It only allows rename(2). The
"pending" directory does allow mkdir(2) and rmdir(2). An item is
created in the "pending" directory. Its attributes can be modified at
will. Userspace commits the item by renaming it into the "live"
directory. At this point, the subsystem receives the ->commit_item()
callback. If all required attributes are filled to satisfaction, the
method returns zero and the item is moved to the "live" directory.

As rmdir(2) does not work in the "live" directory, an item must be
shut down, or "uncommitted". Again, this is done via rename(2), this
time from the "live" directory back to the "pending" one. The subsystem
is notified by the ct_group_ops->uncommit_object() method.

diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c
new file mode 100644
index 000000000000..f3c6e4946f98
--- /dev/null
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -0,0 +1,474 @@
1/*
2 * vim: noexpandtab ts=8 sts=0 sw=8:
3 *
4 * configfs_example.c - This file is a demonstration module containing
5 * a number of configfs subsystems.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public
18 * License along with this program; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 021110-1307, USA.
21 *
22 * Based on sysfs:
23 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
24 *
25 * configfs Copyright (C) 2005 Oracle. All rights reserved.
26 */
27
28#include <linux/init.h>
29#include <linux/module.h>
30#include <linux/slab.h>
31
32#include <linux/configfs.h>
33
34
35
36/*
37 * 01-childless
38 *
39 * This first example is a childless subsystem. It cannot create
40 * any config_items. It just has attributes.
41 *
42 * Note that we are enclosing the configfs_subsystem inside a container.
43 * This is not necessary if a subsystem has no attributes directly
44 * on the subsystem. See the next example, 02-simple-children, for
45 * such a subsystem.
46 */
47
48struct childless {
49 struct configfs_subsystem subsys;
50 int showme;
51 int storeme;
52};
53
54struct childless_attribute {
55 struct configfs_attribute attr;
56 ssize_t (*show)(struct childless *, char *);
57 ssize_t (*store)(struct childless *, const char *, size_t);
58};
59
60static inline struct childless *to_childless(struct config_item *item)
61{
62 return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
63}
64
65static ssize_t childless_showme_read(struct childless *childless,
66 char *page)
67{
68 ssize_t pos;
69
70 pos = sprintf(page, "%d\n", childless->showme);
71 childless->showme++;
72
73 return pos;
74}
75
76static ssize_t childless_storeme_read(struct childless *childless,
77 char *page)
78{
79 return sprintf(page, "%d\n", childless->storeme);
80}
81
82static ssize_t childless_storeme_write(struct childless *childless,
83 const char *page,
84 size_t count)
85{
86 unsigned long tmp;
87 char *p = (char *) page;
88
89 tmp = simple_strtoul(p, &p, 10);
90 if (!p || (*p && (*p != '\n')))
91 return -EINVAL;
92
93 if (tmp > INT_MAX)
94 return -ERANGE;
95
96 childless->storeme = tmp;
97
98 return count;
99}
100
101static ssize_t childless_description_read(struct childless *childless,
102 char *page)
103{
104 return sprintf(page,
105"[01-childless]\n"
106"\n"
107"The childless subsystem is the simplest possible subsystem in\n"
108"configfs. It does not support the creation of child config_items.\n"
109"It only has a few attributes. In fact, it isn't much different\n"
110"than a directory in /proc.\n");
111}
112
113static struct childless_attribute childless_attr_showme = {
114 .attr = { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
115 .show = childless_showme_read,
116};
117static struct childless_attribute childless_attr_storeme = {
118 .attr = { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
119 .show = childless_storeme_read,
120 .store = childless_storeme_write,
121};
122static struct childless_attribute childless_attr_description = {
123 .attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
124 .show = childless_description_read,
125};
126
127static struct configfs_attribute *childless_attrs[] = {
128 &childless_attr_showme.attr,
129 &childless_attr_storeme.attr,
130 &childless_attr_description.attr,
131 NULL,
132};
133
134static ssize_t childless_attr_show(struct config_item *item,
135 struct configfs_attribute *attr,
136 char *page)
137{
138 struct childless *childless = to_childless(item);
139 struct childless_attribute *childless_attr =
140 container_of(attr, struct childless_attribute, attr);
141 ssize_t ret = 0;
142
143 if (childless_attr->show)
144 ret = childless_attr->show(childless, page);
145 return ret;
146}
147
148static ssize_t childless_attr_store(struct config_item *item,
149 struct configfs_attribute *attr,
150 const char *page, size_t count)
151{
152 struct childless *childless = to_childless(item);
153 struct childless_attribute *childless_attr =
154 container_of(attr, struct childless_attribute, attr);
155 ssize_t ret = -EINVAL;
156
157 if (childless_attr->store)
158 ret = childless_attr->store(childless, page, count);
159 return ret;
160}
161
162static struct configfs_item_operations childless_item_ops = {
163 .show_attribute = childless_attr_show,
164 .store_attribute = childless_attr_store,
165};
166
167static struct config_item_type childless_type = {
168 .ct_item_ops = &childless_item_ops,
169 .ct_attrs = childless_attrs,
170 .ct_owner = THIS_MODULE,
171};
172
173static struct childless childless_subsys = {
174 .subsys = {
175 .su_group = {
176 .cg_item = {
177 .ci_namebuf = "01-childless",
178 .ci_type = &childless_type,
179 },
180 },
181 },
182};
183
184
185/* ----------------------------------------------------------------- */
186
187/*
188 * 02-simple-children
189 *
190 * This example merely has a simple one-attribute child. Note that
191 * there is no extra attribute structure, as the child's attribute is
192 * known from the get-go. Also, there is no container for the
193 * subsystem, as it has no attributes of its own.
194 */
195
196struct simple_child {
197 struct config_item item;
198 int storeme;
199};
200
201static inline struct simple_child *to_simple_child(struct config_item *item)
202{
203 return item ? container_of(item, struct simple_child, item) : NULL;
204}
205
206static struct configfs_attribute simple_child_attr_storeme = {
207 .ca_owner = THIS_MODULE,
208 .ca_name = "storeme",
209 .ca_mode = S_IRUGO | S_IWUSR,
210};
211
212static struct configfs_attribute *simple_child_attrs[] = {
213 &simple_child_attr_storeme,
214 NULL,
215};
216
217static ssize_t simple_child_attr_show(struct config_item *item,
218 struct configfs_attribute *attr,
219 char *page)
220{
221 ssize_t count;
222 struct simple_child *simple_child = to_simple_child(item);
223
224 count = sprintf(page, "%d\n", simple_child->storeme);
225
226 return count;
227}
228
229static ssize_t simple_child_attr_store(struct config_item *item,
230 struct configfs_attribute *attr,
231 const char *page, size_t count)
232{
233 struct simple_child *simple_child = to_simple_child(item);
234 unsigned long tmp;
235 char *p = (char *) page;
236
237 tmp = simple_strtoul(p, &p, 10);
238 if (!p || (*p && (*p != '\n')))
239 return -EINVAL;
240
241 if (tmp > INT_MAX)
242 return -ERANGE;
243
244 simple_child->storeme = tmp;
245
246 return count;
247}
248
249static void simple_child_release(struct config_item *item)
250{
251 kfree(to_simple_child(item));
252}
253
254static struct configfs_item_operations simple_child_item_ops = {
255 .release = simple_child_release,
256 .show_attribute = simple_child_attr_show,
257 .store_attribute = simple_child_attr_store,
258};
259
260static struct config_item_type simple_child_type = {
261 .ct_item_ops = &simple_child_item_ops,
262 .ct_attrs = simple_child_attrs,
263 .ct_owner = THIS_MODULE,
264};
265
266
267static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
268{
269 struct simple_child *simple_child;
270
271 simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
272 if (!simple_child)
273 return NULL;
274
275 memset(simple_child, 0, sizeof(struct simple_child));
276
277 config_item_init_type_name(&simple_child->item, name,
278 &simple_child_type);
279
280 simple_child->storeme = 0;
281
282 return &simple_child->item;
283}
284
285static struct configfs_attribute simple_children_attr_description = {
286 .ca_owner = THIS_MODULE,
287 .ca_name = "description",
288 .ca_mode = S_IRUGO,
289};
290
291static struct configfs_attribute *simple_children_attrs[] = {
292 &simple_children_attr_description,
293 NULL,
294};
295
296static ssize_t simple_children_attr_show(struct config_item *item,
297 struct configfs_attribute *attr,
298 char *page)
299{
300 return sprintf(page,
301"[02-simple-children]\n"
302"\n"
303"This subsystem allows the creation of child config_items. These\n"
304"items have only one attribute that is readable and writeable.\n");
305}
306
307static struct configfs_item_operations simple_children_item_ops = {
308 .show_attribute = simple_children_attr_show,
309};
310
311/*
312 * Note that, since no extra work is required on ->drop_item(),
313 * no ->drop_item() is provided.
314 */
315static struct configfs_group_operations simple_children_group_ops = {
316 .make_item = simple_children_make_item,
317};
318
319static struct config_item_type simple_children_type = {
320 .ct_item_ops = &simple_children_item_ops,
321 .ct_group_ops = &simple_children_group_ops,
322 .ct_attrs = simple_children_attrs,
323};
324
325static struct configfs_subsystem simple_children_subsys = {
326 .su_group = {
327 .cg_item = {
328 .ci_namebuf = "02-simple-children",
329 .ci_type = &simple_children_type,
330 },
331 },
332};
333
334
335/* ----------------------------------------------------------------- */
336
337/*
338 * 03-group-children
339 *
340 * This example reuses the simple_children group from above. However,
341 * the simple_children group is not the subsystem itself, it is a
342 * child of the subsystem. Creation of a group in the subsystem creates
343 * a new simple_children group. That group can then have simple_child
344 * children of its own.
345 */
346
347struct simple_children {
348 struct config_group group;
349};
350
351static struct config_group *group_children_make_group(struct config_group *group, const char *name)
352{
353 struct simple_children *simple_children;
354
355 simple_children = kmalloc(sizeof(struct simple_children),
356 GFP_KERNEL);
357 if (!simple_children)
358 return NULL;
359
360 memset(simple_children, 0, sizeof(struct simple_children));
361
362 config_group_init_type_name(&simple_children->group, name,
363 &simple_children_type);
364
365 return &simple_children->group;
366}
367
368static struct configfs_attribute group_children_attr_description = {
369 .ca_owner = THIS_MODULE,
370 .ca_name = "description",
371 .ca_mode = S_IRUGO,
372};
373
374static struct configfs_attribute *group_children_attrs[] = {
375 &group_children_attr_description,
376 NULL,
377};
378
379static ssize_t group_children_attr_show(struct config_item *item,
380 struct configfs_attribute *attr,
381 char *page)
382{
383 return sprintf(page,
384"[03-group-children]\n"
385"\n"
386"This subsystem allows the creation of child config_groups. These\n"
387"groups are like the subsystem simple-children.\n");
388}
389
390static struct configfs_item_operations group_children_item_ops = {
391 .show_attribute = group_children_attr_show,
392};
393
394/*
395 * Note that, since no extra work is required on ->drop_item(),
396 * no ->drop_item() is provided.
397 */
398static struct configfs_group_operations group_children_group_ops = {
399 .make_group = group_children_make_group,
400};
401
402static struct config_item_type group_children_type = {
403 .ct_item_ops = &group_children_item_ops,
404 .ct_group_ops = &group_children_group_ops,
405 .ct_attrs = group_children_attrs,
406};
407
408static struct configfs_subsystem group_children_subsys = {
409 .su_group = {
410 .cg_item = {
411 .ci_namebuf = "03-group-children",
412 .ci_type = &group_children_type,
413 },
414 },
415};
416
417/* ----------------------------------------------------------------- */
418
419/*
420 * We're now done with our subsystem definitions.
421 * For convenience in this module, here's a list of them all. It
422 * allows the init function to easily register them. Most modules
 423 * will only have one subsystem, and will simply call
 424 * configfs_register_subsystem() on it directly.
425 */
426static struct configfs_subsystem *example_subsys[] = {
427 &childless_subsys.subsys,
428 &simple_children_subsys,
429 &group_children_subsys,
430 NULL,
431};
432
433static int __init configfs_example_init(void)
434{
435 int ret;
436 int i;
437 struct configfs_subsystem *subsys;
438
439 for (i = 0; example_subsys[i]; i++) {
440 subsys = example_subsys[i];
441
442 config_group_init(&subsys->su_group);
443 init_MUTEX(&subsys->su_sem);
444 ret = configfs_register_subsystem(subsys);
445 if (ret) {
446 printk(KERN_ERR "Error %d while registering subsystem %s\n",
447 ret,
448 subsys->su_group.cg_item.ci_namebuf);
449 goto out_unregister;
450 }
451 }
452
453 return 0;
454
455out_unregister:
456	for (i--; i >= 0; i--) {	/* entry i itself never registered */
457		configfs_unregister_subsystem(example_subsys[i]);
458	}
459
460 return ret;
461}
462
463static void __exit configfs_example_exit(void)
464{
465 int i;
466
467 for (i = 0; example_subsys[i]; i++) {
468 configfs_unregister_subsystem(example_subsys[i]);
469 }
470}
471
472module_init(configfs_example_init);
473module_exit(configfs_example_exit);
474MODULE_LICENSE("GPL");
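
For a sense of how the example module is exercised from userspace, here
is a rough sketch (illustrative, not part of the patch) that drives the
02-simple-children subsystem. It assumes configfs has been mounted at
/config ("mount -t configfs none /config"); the item name and error
handling are made up for illustration.

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/stat.h>

	int main(void)
	{
		int fd;

		/* mkdir() invokes simple_children_make_item() in the module */
		if (mkdir("/config/02-simple-children/child1", 0755) < 0) {
			perror("mkdir");
			return 1;
		}

		/* writing the attribute lands in simple_child_attr_store() */
		fd = open("/config/02-simple-children/child1/storeme", O_WRONLY);
		if (fd >= 0) {
			write(fd, "42\n", 3);
			close(fd);
		}

		/* rmdir() drops the item, freeing it via simple_child_release() */
		rmdir("/config/02-simple-children/child1");
		return 0;
	}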
diff --git a/Documentation/filesystems/dlmfs.txt b/Documentation/filesystems/dlmfs.txt
new file mode 100644
index 000000000000..9afab845a906
--- /dev/null
+++ b/Documentation/filesystems/dlmfs.txt
@@ -0,0 +1,130 @@
1dlmfs
2==================
3A minimal DLM userspace interface implemented via a virtual file
4system.
5
6dlmfs is built with OCFS2 as it requires most of OCFS2's infrastructure.
7
8Project web page: http://oss.oracle.com/projects/ocfs2
9Tools web page: http://oss.oracle.com/projects/ocfs2-tools
10OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
11
12All code copyright 2005 Oracle except when otherwise noted.
13
14CREDITS
15=======
16
17Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
18and Transmeta Corp.
19
20Mark Fasheh <mark.fasheh@oracle.com>
21
22Caveats
23=======
24- Right now it only works with the OCFS2 DLM, though support for other
25 DLM implementations should not be a major issue.
26
27Mount options
28=============
29None
30
31Usage
32=====
33
34If you're just interested in OCFS2, then please see ocfs2.txt. The
35rest of this document will be geared towards those who want to use
36dlmfs for easy-to-set-up, easy-to-use clustered locking in
37userspace.
38
39Setup
40=====
41
42dlmfs requires that the OCFS2 cluster infrastructure be in
43place. Please download ocfs2-tools from the above url and configure a
44cluster.
45
46You'll want to start heartbeating on a volume which all the nodes in
47your lockspace can access. The easiest way to do this is via
48ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
49that an OCFS2 file system be in place so that it can automatically
50find its heartbeat area, though it will eventually support heartbeat
51against raw disks.
52
53Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
54with ocfs2-tools.
55
56Once you're heartbeating, DLM lock 'domains' can be easily created /
57destroyed and locks within them accessed.
58
59Locking
60=======
61
62Users may access dlmfs via standard file system calls, or they can use
63'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
64system calls and presents a more traditional locking API.
65
66dlmfs handles lock caching automatically for the user, so a lock
67request for an already acquired lock will not generate another DLM
68call. Userspace programs are assumed to handle their own local
69locking.
70
71Two levels of locks are supported - Shared Read and Exclusive.
72Also supported is a Trylock operation.
73
74For information on the libo2dlm interface, please see o2dlm.h,
75distributed with ocfs2-tools.
76
77Lock value blocks can be read and written to a resource via read(2)
78and write(2) against the fd obtained via your open(2) call. The
79maximum currently supported LVB length is 64 bytes (though that is an
80OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
81small amounts of data amongst their nodes.
82
83mkdir(2) signals dlmfs to join a domain (which will have the same name
84as the resulting directory).
85
86rmdir(2) signals dlmfs to leave the domain.
87
88Locks for a given domain are represented by regular inodes inside the
89domain directory. Locking against them is done via the open(2) system
90call.
91
92The open(2) call will not return until your lock has been granted or
93an error has occurred, unless it has been instructed to do a trylock
94operation. If the lock succeeds, you'll get an fd.
95
96Use open(2) with O_CREAT to ensure the resource inode is created - dlmfs does
97not automatically create inodes for existing lock resources.
98
99Open Flag Lock Request Type
100--------- -----------------
101O_RDONLY Shared Read
102O_RDWR Exclusive
103
104Open Flag Resulting Locking Behavior
105--------- --------------------------
106O_NONBLOCK Trylock operation
107
108You must provide exactly one of O_RDONLY or O_RDWR.
109
110If O_NONBLOCK is also provided and the trylock operation was valid but
111could not lock the resource, then open(2) will fail with ETXTBSY.
112
113close(2) drops the lock associated with your fd.
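
Putting the calls above together, a minimal sketch of a trylock in C;
the mount point /dlm and the domain and lock names are assumptions made
for illustration:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/stat.h>

	static int try_exclusive(void)
	{
		int fd;

		/* join (or create) the lock domain */
		mkdir("/dlm/mydomain", 0755);

		/* O_RDWR requests Exclusive, O_NONBLOCK makes it a trylock,
		 * O_CREAT creates the resource inode if it does not exist */
		fd = open("/dlm/mydomain/mylock",
			  O_RDWR | O_CREAT | O_NONBLOCK, 0600);
		if (fd < 0) {
			if (errno == ETXTBSY)
				fprintf(stderr, "lock is busy\n");
			return -1;
		}

		/* ... the lock is held here ... */

		close(fd);	/* drops the lock */
		return 0;
	}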
114
115Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
116supported locally as well. This means you can use them to restrict
117access to the resources via dlmfs on your local node only.
118
119The resource LVB may be read from the fd in either Shared Read or
120Exclusive modes via the read(2) system call. It can be written via
121write(2) only when open in Exclusive mode.
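
As a short fragment, assuming "fd" was obtained with O_RDWR (Exclusive)
as in the earlier sketch; the 64-byte length reflects the OCFS2 DLM
limit mentioned above, and the contents are illustrative:

	char lvb[64] = "shared-state";

	/* update the LVB; only permitted while holding Exclusive */
	if (write(fd, lvb, sizeof(lvb)) != sizeof(lvb))
		perror("write lvb");

	/* rewind before reading back (dlmfs honours the file offset) */
	lseek(fd, 0, SEEK_SET);

	/* read the LVB; valid in Shared Read or Exclusive */
	if (read(fd, lvb, sizeof(lvb)) < 0)
		perror("read lvb");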
122
123Once written, an LVB will be visible to other nodes who obtain Read
124Only or higher level locks on the resource.
125
126See Also
127========
128For more information on the VMS distributed locking API, see:
129
130http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 000000000000..f2595caf052e
--- /dev/null
+++ b/Documentation/filesystems/ocfs2.txt
@@ -0,0 +1,55 @@
1OCFS2 filesystem
2==================
3OCFS2 is a general purpose extent based shared disk cluster file
4system with many similarities to ext3. It supports 64 bit inode
5numbers, and has automatically extending metadata groups which may
6also make it attractive for non-clustered use.
7
8You'll want to install the ocfs2-tools package in order to at least
9get "mount.ocfs2" and "ocfs2_hb_ctl".
10
11Project web page: http://oss.oracle.com/projects/ocfs2
12Tools web page: http://oss.oracle.com/projects/ocfs2-tools
13OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
14
15All code copyright 2005 Oracle except when otherwise noted.
16
17CREDITS:
18Lots of code taken from ext3 and other projects.
19
20Authors in alphabetical order:
21Joel Becker <joel.becker@oracle.com>
22Zach Brown <zach.brown@oracle.com>
23Mark Fasheh <mark.fasheh@oracle.com>
24Kurt Hackel <kurt.hackel@oracle.com>
25Sunil Mushran <sunil.mushran@oracle.com>
26Manish Singh <manish.singh@oracle.com>
27
28Caveats
29=======
30Features which OCFS2 does not support yet:
31 - sparse files
32 - extended attributes
33 - shared writeable mmap
34 - loopback is supported, but data written will not
35 be cluster coherent.
36 - quotas
37 - cluster aware flock
38 - Directory change notification (F_NOTIFY)
39 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
40 - POSIX ACLs
41 - readpages / writepages (not user visible)
42
43Mount options
44=============
45
46OCFS2 supports the following mount options:
47(*) == default
48
49barrier=1 This enables/disables barriers. barrier=0 disables them,
50 barrier=1 enables them.
51errors=remount-ro(*) Remount the filesystem read-only on an error.
52errors=panic Panic and halt the machine if an error occurs.
53intr (*) Allow signals to interrupt cluster operations.
54nointr Do not allow signals to interrupt cluster
55 operations.
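
As a rough illustration only, the same options can be passed via
mount(2); the device, mount point and option string below are made up,
and the cluster must already be configured and heartbeating (see
dlmfs.txt):

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* roughly: mount -t ocfs2 -o errors=remount-ro,intr ... */
		if (mount("/dev/sdb1", "/mnt/ocfs2", "ocfs2", 0,
			  "errors=remount-ro,intr") < 0) {
			perror("mount");
			return 1;
		}
		return 0;
	}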
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 31154882000a..6304db59bfe4 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -860,24 +860,6 @@ The structure has a number of fields, some of which are mandatory:
860 It is safe to sleep in this method. 860 It is safe to sleep in this method.
861 861
862 862
863 (*) int (*duplicate)(struct key *key, const struct key *source);
864
865 If this type of key can be duplicated, then this method should be
866 provided. It is called to copy the payload attached to the source into the
867 new key. The data length on the new key will have been updated and the
868 quota adjusted already.
869
870 This method will be called with the source key's semaphore read-locked to
871 prevent its payload from being changed, thus RCU constraints need not be
872 applied to the source key.
873
874 This method does not have to lock the destination key in order to attach a
875 payload. The fact that KEY_FLAG_INSTANTIATED is not set in key->flags
876 prevents anything else from gaining access to the key.
877
878 It is safe to sleep in this method.
879
880
881 (*) int (*update)(struct key *key, const void *data, size_t datalen); 863 (*) int (*update)(struct key *key, const void *data, size_t datalen);
882 864
883 If this type of key can be updated, then this method should be provided. 865 If this type of key can be updated, then this method should be provided.
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 23e6cce40f9c..03a13c462cf2 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -51,6 +51,30 @@ superblock can be autodetected and run at boot time.
51The kernel parameter "raid=partitionable" (or "raid=part") means 51The kernel parameter "raid=partitionable" (or "raid=part") means
52that all auto-detected arrays are assembled as partitionable. 52that all auto-detected arrays are assembled as partitionable.
53 53
54Boot time assembly of degraded/dirty arrays
55-------------------------------------------
56
57If a raid5 or raid6 array is both dirty and degraded, it could have
58undetectable data corruption. This is because the fact that it is
59'dirty' means that the parity cannot be trusted, and the fact that it
60is degraded means that some data blocks are missing and cannot reliably
61be reconstructed (the parity needed to rebuild them cannot be trusted).
62
63For this reason, md will normally refuse to start such an array. This
64requires the sysadmin to take action to explicitly start the array
65despite possible corruption. This is normally done with
66 mdadm --assemble --force ....
67
68This option is not really available if the array has the root
69filesystem on it. In order to support booting from such an
70array, md supports a module parameter "start_dirty_degraded" which,
71when set to 1, bypasses the checks and allows dirty degraded
72arrays to be started.
73
74So, to boot with a root filesystem of a dirty degraded raid[56], use
75
76 md-mod.start_dirty_degraded=1
77
54 78
55Superblock formats 79Superblock formats
56------------------ 80------------------
@@ -141,6 +165,70 @@ All md devices contain:
141 in a fully functional array. If this is not yet known, the file 165 in a fully functional array. If this is not yet known, the file
142 will be empty. If an array is being resized (not currently 166 will be empty. If an array is being resized (not currently
143 possible) this will contain the larger of the old and new sizes. 167 possible) this will contain the larger of the old and new sizes.
168 Some raid levels (RAID1) allow this value to be set while the
169 array is active. This will reconfigure the array. Otherwise
170 it can only be set while assembling an array.
171
172 chunk_size
173 This is the size in bytes of the 'chunks' and is only relevant to
174 raid levels that involve striping (1,4,5,6,10). The address space
175 of the array is conceptually divided into chunks and consecutive
176 chunks are striped onto neighbouring devices.
177 The size should be at least PAGE_SIZE (4k) and should be a power
178 of 2. This can only be set while assembling an array.
179
180 component_size
181 For arrays with data redundancy (i.e. not raid0, linear, faulty,
182 multipath), all components must be the same size - or at least
183 there must be a size that they all provide space for. This is a key
184 part of the geometry of the array. It is measured in sectors
185 and can be read from here. Writing to this value may resize
186 the array if the personality supports it (raid1, raid5, raid6),
187 and if the component drives are large enough.
188
189 metadata_version
190 This indicates the format that is being used to record metadata
191 about the array. It can be 0.90 (traditional format), 1.0, 1.1,
192 1.2 (newer format in varying locations) or "none" indicating that
193 the kernel isn't managing metadata at all.
194
195 level
196 The raid 'level' for this array. The name will often (but not
197 always) be the same as the name of the module that implements the
198 level. To be auto-loaded the module must have an alias
199 md-$LEVEL, e.g. md-raid5.
200 This can be written only while the array is being assembled, not
201 after it is started.
202
203 new_dev
204 This file can be written but not read. The value written should
205 be a block device number as major:minor, e.g. 8:0.
206 This will cause that device to be attached to the array, if it is
207 available. It will then appear at md/dev-XXX (depending on the
208 name of the device) and further configuration is then possible.
209
210 sync_speed_min
211 sync_speed_max
212 These are similar to /proc/sys/dev/raid/speed_limit_{min,max};
213 however, they apply only to the particular array.
214 If no value has been written to these, or if the word 'system'
215 is written, then the system-wide value is used. If a value
216 in kibibytes per second is written, then it is used.
217 When the files are read, they show the currently active value
218 followed by "(local)" or "(system)" depending on whether it is
219 a locally set or system-wide value (see the sketch after this list).
220
221 sync_completed
222 This shows the number of sectors that have been completed of
223 whatever the current sync_action is, followed by the number of
224 sectors in total that could need to be processed. The two
225 numbers are separated by a '/', thus effectively showing one
226 value: the fraction of the process that is complete.
227
228 sync_speed
229 This shows the current actual speed, in K/sec, of the current
230 sync_action. It is averaged over the last 30 seconds.
231
144 232
145As component devices are added to an md array, they appear in the 'md' 233As component devices are added to an md array, they appear in the 'md'
146directory as new directories named 234directory as new directories named
@@ -167,6 +255,38 @@ Each directory contains:
167 of being recovered to 255 of being recovered to
168 This list may grow in future. 256 This list may grow in future.
169 257
258 errors
259 An approximate count of read errors that have been detected on
260 this device but have not caused the device to be evicted from
261 the array (either because they were corrected or because they
262 happened while the array was read-only). When using version-1
263 metadata, this value persists across restarts of the array.
264
265 This value can be written while assembling an array, thus
266 providing an ongoing count for arrays with metadata managed by
267 userspace.
268
269 slot
270 This gives the role that the device has in the array. It will
271 either be 'none' if the device is not active in the array
272 (i.e. is a spare or has failed) or an integer less than the
273 'raid_disks' number for the array, indicating which position
274 it currently fills. This can only be set while assembling an
275 array. A device for which this is set is assumed to be working.
276
277 offset
278 This gives the location in the device (in sectors from the
279 start) where data from the array will be stored. Any part of
280 the device before this offset is not touched, unless it is
281 used for storing metadata (Formats 1.1 and 1.2).
282
283 size
284 The amount of the device, after the offset, that can be used
285 for storage of data. This will normally be the same as the
286 component_size. This can be written while assembling an
287 array. If a value less than the current component_size is
288 written, component_size will be reduced to this value.
289
170 290
171An active md device will also contain an entry for each active device 291An active md device will also contain an entry for each active device
172in the array. These are named 292in the array. These are named
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index f5ebda5f4276..bd4ffb5bd49a 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -41,3 +41,14 @@ to. Writing to this file will accept one of
41It will only change to 'firmware' or 'platform' if the system supports 41It will only change to 'firmware' or 'platform' if the system supports
42it. 42it.
43 43
44/sys/power/image_size controls the size of the image created by
45the suspend-to-disk mechanism. A string representing a non-negative
46integer may be written to it; the value will be used as an upper
47limit of the image size, in megabytes. The suspend-to-disk mechanism will
48do its best to ensure the image size will not exceed that number. However,
49if this turns out to be impossible, it will try to suspend anyway using the
50smallest image possible. In particular, if "0" is written to this file, the
51suspend image will be as small as possible.
52
53Reading from this file will display the current image size limit, which
54is set to 500 MB by default.
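
A rough fragment showing the same from C (the 200 MB limit is an
arbitrary example):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/power/image_size", "w");

		if (f) {
			fprintf(f, "200\n");	/* limit the image to 200 MB */
			fclose(f);
		}
		return 0;
	}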
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index b0d50840788e..cd0fcd89a6f0 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -27,6 +27,11 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state
27 27
28echo platform > /sys/power/disk; echo disk > /sys/power/state 28echo platform > /sys/power/disk; echo disk > /sys/power/state
29 29
30If you want to limit the suspend image size to N megabytes, do
31
32echo N > /sys/power/image_size
33
34before suspend (it is limited to 500 MB by default).
30 35
31Encrypted suspend image: 36Encrypted suspend image:
32------------------------ 37------------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 16f37826ae38..554fcec48b3e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -258,6 +258,13 @@ P: Ivan Kokshaysky
258M: ink@jurassic.park.msu.ru 258M: ink@jurassic.park.msu.ru
259S: Maintained for 2.4; PCI support for 2.6. 259S: Maintained for 2.4; PCI support for 2.6.
260 260
261AMD GEODE PROCESSOR/CHIPSET SUPPORT
262P: Jordan Crouse
263M: info-linux@geode.amd.com
264L: info-linux@geode.amd.com
265W: http://www.amd.com/us-en/ConnectivitySolutions/TechnicalResources/0,,50_2334_2452_11363,00.html
266S: Supported
267
261APM DRIVER 268APM DRIVER
262P: Stephen Rothwell 269P: Stephen Rothwell
263M: sfr@canb.auug.org.au 270M: sfr@canb.auug.org.au
@@ -554,6 +561,11 @@ W: http://us1.samba.org/samba/Linux_CIFS_client.html
554T: git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git 561T: git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
555S: Supported 562S: Supported
556 563
564CONFIGFS
565P: Joel Becker
566M: joel.becker@oracle.com
567S: Supported
568
557CIRRUS LOGIC GENERIC FBDEV DRIVER 569CIRRUS LOGIC GENERIC FBDEV DRIVER
558P: Jeff Garzik 570P: Jeff Garzik
559M: jgarzik@pobox.com 571M: jgarzik@pobox.com
@@ -1230,7 +1242,7 @@ IEEE 1394 SUBSYSTEM
1230P: Ben Collins 1242P: Ben Collins
1231M: bcollins@debian.org 1243M: bcollins@debian.org
1232P: Jody McIntyre 1244P: Jody McIntyre
1233M: scjody@steamballoon.com 1245M: scjody@modernduck.com
1234L: linux1394-devel@lists.sourceforge.net 1246L: linux1394-devel@lists.sourceforge.net
1235W: http://www.linux1394.org/ 1247W: http://www.linux1394.org/
1236T: git kernel.org:/pub/scm/linux/kernel/git/scjody/ieee1394.git 1248T: git kernel.org:/pub/scm/linux/kernel/git/scjody/ieee1394.git
@@ -1240,14 +1252,14 @@ IEEE 1394 OHCI DRIVER
1240P: Ben Collins 1252P: Ben Collins
1241M: bcollins@debian.org 1253M: bcollins@debian.org
1242P: Jody McIntyre 1254P: Jody McIntyre
1243M: scjody@steamballoon.com 1255M: scjody@modernduck.com
1244L: linux1394-devel@lists.sourceforge.net 1256L: linux1394-devel@lists.sourceforge.net
1245W: http://www.linux1394.org/ 1257W: http://www.linux1394.org/
1246S: Maintained 1258S: Maintained
1247 1259
1248IEEE 1394 PCILYNX DRIVER 1260IEEE 1394 PCILYNX DRIVER
1249P: Jody McIntyre 1261P: Jody McIntyre
1250M: scjody@steamballoon.com 1262M: scjody@modernduck.com
1251L: linux1394-devel@lists.sourceforge.net 1263L: linux1394-devel@lists.sourceforge.net
1252W: http://www.linux1394.org/ 1264W: http://www.linux1394.org/
1253S: Maintained 1265S: Maintained
@@ -1898,6 +1910,15 @@ M: ajoshi@shell.unixbox.com
1898L: linux-nvidia@lists.surfsouth.com 1910L: linux-nvidia@lists.surfsouth.com
1899S: Maintained 1911S: Maintained
1900 1912
1913ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
1914P: Mark Fasheh
1915M: mark.fasheh@oracle.com
1916P: Kurt Hackel
1917M: kurt.hackel@oracle.com
1918L: ocfs2-devel@oss.oracle.com
1919W: http://oss.oracle.com/projects/ocfs2/
1920S: Supported
1921
1901OLYMPIC NETWORK DRIVER 1922OLYMPIC NETWORK DRIVER
1902P: Peter De Shrijver 1923P: Peter De Shrijver
1903M: p2@ace.ulyssis.student.kuleuven.ac.be 1924M: p2@ace.ulyssis.student.kuleuven.ac.be
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 786491f9ceb2..153337ff1d7b 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -40,6 +40,19 @@ config GENERIC_IOMAP
40 bool 40 bool
41 default n 41 default n
42 42
43config GENERIC_HARDIRQS
44 bool
45 default y
46
47config GENERIC_IRQ_PROBE
48 bool
49 default y
50
51config AUTO_IRQ_AFFINITY
52 bool
53 depends on SMP
54 default y
55
43source "init/Kconfig" 56source "init/Kconfig"
44 57
45 58
diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c
index 24ae9a366073..f3e98f837784 100644
--- a/arch/alpha/kernel/alpha_ksyms.c
+++ b/arch/alpha/kernel/alpha_ksyms.c
@@ -175,7 +175,6 @@ EXPORT_SYMBOL(up);
175 */ 175 */
176 176
177#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
178EXPORT_SYMBOL(synchronize_irq);
179EXPORT_SYMBOL(flush_tlb_mm); 178EXPORT_SYMBOL(flush_tlb_mm);
180EXPORT_SYMBOL(flush_tlb_range); 179EXPORT_SYMBOL(flush_tlb_range);
181EXPORT_SYMBOL(flush_tlb_page); 180EXPORT_SYMBOL(flush_tlb_page);
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index b6114f5c0d2b..76be5cf0de13 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -32,214 +32,25 @@
32#include <asm/io.h> 32#include <asm/io.h>
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34 34
35/*
36 * Controller mappings for all interrupt sources:
37 */
38irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
39 [0 ... NR_IRQS-1] = {
40 .handler = &no_irq_type,
41 .lock = SPIN_LOCK_UNLOCKED
42 }
43};
44
45static void register_irq_proc(unsigned int irq);
46
47volatile unsigned long irq_err_count; 35volatile unsigned long irq_err_count;
48 36
49/* 37void ack_bad_irq(unsigned int irq)
50 * Special irq handlers.
51 */
52
53irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
54{
55 return IRQ_NONE;
56}
57
58/*
59 * Generic no controller code
60 */
61
62static void no_irq_enable_disable(unsigned int irq) { }
63static unsigned int no_irq_startup(unsigned int irq) { return 0; }
64
65static void
66no_irq_ack(unsigned int irq)
67{ 38{
68 irq_err_count++; 39 irq_err_count++;
69 printk(KERN_CRIT "Unexpected IRQ trap at vector %u\n", irq); 40 printk(KERN_CRIT "Unexpected IRQ trap at vector %u\n", irq);
70} 41}
71 42
72struct hw_interrupt_type no_irq_type = {
73 .typename = "none",
74 .startup = no_irq_startup,
75 .shutdown = no_irq_enable_disable,
76 .enable = no_irq_enable_disable,
77 .disable = no_irq_enable_disable,
78 .ack = no_irq_ack,
79 .end = no_irq_enable_disable,
80};
81
82int
83handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
84 struct irqaction *action)
85{
86 int status = 1; /* Force the "do bottom halves" bit */
87 int ret;
88
89 do {
90 if (!(action->flags & SA_INTERRUPT))
91 local_irq_enable();
92 else
93 local_irq_disable();
94
95 ret = action->handler(irq, action->dev_id, regs);
96 if (ret == IRQ_HANDLED)
97 status |= action->flags;
98 action = action->next;
99 } while (action);
100 if (status & SA_SAMPLE_RANDOM)
101 add_interrupt_randomness(irq);
102 local_irq_disable();
103
104 return status;
105}
106
107/*
108 * Generic enable/disable code: this just calls
109 * down into the PIC-specific version for the actual
110 * hardware disable after having gotten the irq
111 * controller lock.
112 */
113void inline
114disable_irq_nosync(unsigned int irq)
115{
116 irq_desc_t *desc = irq_desc + irq;
117 unsigned long flags;
118
119 spin_lock_irqsave(&desc->lock, flags);
120 if (!desc->depth++) {
121 desc->status |= IRQ_DISABLED;
122 desc->handler->disable(irq);
123 }
124 spin_unlock_irqrestore(&desc->lock, flags);
125}
126
127/*
128 * Synchronous version of the above, making sure the IRQ is
129 * no longer running on any other IRQ..
130 */
131void
132disable_irq(unsigned int irq)
133{
134 disable_irq_nosync(irq);
135 synchronize_irq(irq);
136}
137
138void
139enable_irq(unsigned int irq)
140{
141 irq_desc_t *desc = irq_desc + irq;
142 unsigned long flags;
143
144 spin_lock_irqsave(&desc->lock, flags);
145 switch (desc->depth) {
146 case 1: {
147 unsigned int status = desc->status & ~IRQ_DISABLED;
148 desc->status = status;
149 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
150 desc->status = status | IRQ_REPLAY;
151 hw_resend_irq(desc->handler,irq);
152 }
153 desc->handler->enable(irq);
154 /* fall-through */
155 }
156 default:
157 desc->depth--;
158 break;
159 case 0:
160 printk(KERN_ERR "enable_irq() unbalanced from %p\n",
161 __builtin_return_address(0));
162 }
163 spin_unlock_irqrestore(&desc->lock, flags);
164}
165
166int
167setup_irq(unsigned int irq, struct irqaction * new)
168{
169 int shared = 0;
170 struct irqaction *old, **p;
171 unsigned long flags;
172 irq_desc_t *desc = irq_desc + irq;
173
174 if (desc->handler == &no_irq_type)
175 return -ENOSYS;
176
177 /*
178 * Some drivers like serial.c use request_irq() heavily,
179 * so we have to be careful not to interfere with a
180 * running system.
181 */
182 if (new->flags & SA_SAMPLE_RANDOM) {
183 /*
184 * This function might sleep, we want to call it first,
185 * outside of the atomic block.
186 * Yes, this might clear the entropy pool if the wrong
187 * driver is attempted to be loaded, without actually
188 * installing a new handler, but is this really a problem,
189 * only the sysadmin is able to do this.
190 */
191 rand_initialize_irq(irq);
192 }
193
194 /*
195 * The following block of code has to be executed atomically
196 */
197 spin_lock_irqsave(&desc->lock,flags);
198 p = &desc->action;
199 if ((old = *p) != NULL) {
200 /* Can't share interrupts unless both agree to */
201 if (!(old->flags & new->flags & SA_SHIRQ)) {
202 spin_unlock_irqrestore(&desc->lock,flags);
203 return -EBUSY;
204 }
205
206 /* add new interrupt at end of irq queue */
207 do {
208 p = &old->next;
209 old = *p;
210 } while (old);
211 shared = 1;
212 }
213
214 *p = new;
215
216 if (!shared) {
217 desc->depth = 0;
218 desc->status &=
219 ~(IRQ_DISABLED|IRQ_AUTODETECT|IRQ_WAITING|IRQ_INPROGRESS);
220 desc->handler->startup(irq);
221 }
222 spin_unlock_irqrestore(&desc->lock,flags);
223
224 return 0;
225}
226
227static struct proc_dir_entry * root_irq_dir;
228static struct proc_dir_entry * irq_dir[NR_IRQS];
229
230#ifdef CONFIG_SMP 43#ifdef CONFIG_SMP
231static struct proc_dir_entry * smp_affinity_entry[NR_IRQS];
232static char irq_user_affinity[NR_IRQS]; 44static char irq_user_affinity[NR_IRQS];
233static cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
234 45
235static void 46int
236select_smp_affinity(int irq) 47select_smp_affinity(unsigned int irq)
237{ 48{
238 static int last_cpu; 49 static int last_cpu;
239 int cpu = last_cpu + 1; 50 int cpu = last_cpu + 1;
240 51
241 if (! irq_desc[irq].handler->set_affinity || irq_user_affinity[irq]) 52 if (!irq_desc[irq].handler->set_affinity || irq_user_affinity[irq])
242 return; 53 return 1;
243 54
244 while (!cpu_possible(cpu)) 55 while (!cpu_possible(cpu))
245 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0); 56 cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
@@ -247,208 +58,10 @@ select_smp_affinity(int irq)
247 58
248 irq_affinity[irq] = cpumask_of_cpu(cpu); 59 irq_affinity[irq] = cpumask_of_cpu(cpu);
249 irq_desc[irq].handler->set_affinity(irq, cpumask_of_cpu(cpu)); 60 irq_desc[irq].handler->set_affinity(irq, cpumask_of_cpu(cpu));
61 return 0;
250} 62}
251
252static int
253irq_affinity_read_proc (char *page, char **start, off_t off,
254 int count, int *eof, void *data)
255{
256 int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]);
257 if (count - len < 2)
258 return -EINVAL;
259 len += sprintf(page + len, "\n");
260 return len;
261}
262
263static int
264irq_affinity_write_proc(struct file *file, const char __user *buffer,
265 unsigned long count, void *data)
266{
267 int irq = (long) data, full_count = count, err;
268 cpumask_t new_value;
269
270 if (!irq_desc[irq].handler->set_affinity)
271 return -EIO;
272
273 err = cpumask_parse(buffer, count, new_value);
274
275 /* The special value 0 means release control of the
276 affinity to kernel. */
277 cpus_and(new_value, new_value, cpu_online_map);
278 if (cpus_empty(new_value)) {
279 irq_user_affinity[irq] = 0;
280 select_smp_affinity(irq);
281 }
282 /* Do not allow disabling IRQs completely - it's a too easy
283 way to make the system unusable accidentally :-) At least
284 one online CPU still has to be targeted. */
285 else {
286 irq_affinity[irq] = new_value;
287 irq_user_affinity[irq] = 1;
288 irq_desc[irq].handler->set_affinity(irq, new_value);
289 }
290
291 return full_count;
292}
293
294#endif /* CONFIG_SMP */ 63#endif /* CONFIG_SMP */
295 64
296#define MAX_NAMELEN 10
297
298static void
299register_irq_proc (unsigned int irq)
300{
301 char name [MAX_NAMELEN];
302
303 if (!root_irq_dir || (irq_desc[irq].handler == &no_irq_type) ||
304 irq_dir[irq])
305 return;
306
307 memset(name, 0, MAX_NAMELEN);
308 sprintf(name, "%d", irq);
309
310 /* create /proc/irq/1234 */
311 irq_dir[irq] = proc_mkdir(name, root_irq_dir);
312
313#ifdef CONFIG_SMP
314 if (irq_desc[irq].handler->set_affinity) {
315 struct proc_dir_entry *entry;
316 /* create /proc/irq/1234/smp_affinity */
317 entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
318
319 if (entry) {
320 entry->nlink = 1;
321 entry->data = (void *)(long)irq;
322 entry->read_proc = irq_affinity_read_proc;
323 entry->write_proc = irq_affinity_write_proc;
324 }
325
326 smp_affinity_entry[irq] = entry;
327 }
328#endif
329}
330
331void
332init_irq_proc (void)
333{
334 int i;
335
336 /* create /proc/irq */
337 root_irq_dir = proc_mkdir("irq", NULL);
338
339#ifdef CONFIG_SMP
340 /* create /proc/irq/prof_cpu_mask */
341 create_prof_cpu_mask(root_irq_dir);
342#endif
343
344 /*
345 * Create entries for all existing IRQs.
346 */
347 for (i = 0; i < ACTUAL_NR_IRQS; i++) {
348 if (irq_desc[i].handler == &no_irq_type)
349 continue;
350 register_irq_proc(i);
351 }
352}
353
354int
355request_irq(unsigned int irq, irqreturn_t (*handler)(int, void *, struct pt_regs *),
356 unsigned long irqflags, const char * devname, void *dev_id)
357{
358 int retval;
359 struct irqaction * action;
360
361 if (irq >= ACTUAL_NR_IRQS)
362 return -EINVAL;
363 if (!handler)
364 return -EINVAL;
365
366#if 1
367 /*
368 * Sanity-check: shared interrupts should REALLY pass in
369 * a real dev-ID, otherwise we'll have trouble later trying
370 * to figure out which interrupt is which (messes up the
371 * interrupt freeing logic etc).
372 */
373 if ((irqflags & SA_SHIRQ) && !dev_id) {
374 printk(KERN_ERR
375 "Bad boy: %s (at %p) called us without a dev_id!\n",
376 devname, __builtin_return_address(0));
377 }
378#endif
379
380 action = (struct irqaction *)
381 kmalloc(sizeof(struct irqaction), GFP_KERNEL);
382 if (!action)
383 return -ENOMEM;
384
385 action->handler = handler;
386 action->flags = irqflags;
387 cpus_clear(action->mask);
388 action->name = devname;
389 action->next = NULL;
390 action->dev_id = dev_id;
391
392#ifdef CONFIG_SMP
393 select_smp_affinity(irq);
394#endif
395
396 retval = setup_irq(irq, action);
397 if (retval)
398 kfree(action);
399 return retval;
400}
401
402EXPORT_SYMBOL(request_irq);
403
404void
405free_irq(unsigned int irq, void *dev_id)
406{
407 irq_desc_t *desc;
408 struct irqaction **p;
409 unsigned long flags;
410
411 if (irq >= ACTUAL_NR_IRQS) {
412 printk(KERN_CRIT "Trying to free IRQ%d\n", irq);
413 return;
414 }
415
416 desc = irq_desc + irq;
417 spin_lock_irqsave(&desc->lock,flags);
418 p = &desc->action;
419 for (;;) {
420 struct irqaction * action = *p;
421 if (action) {
422 struct irqaction **pp = p;
423 p = &action->next;
424 if (action->dev_id != dev_id)
425 continue;
426
427 /* Found - now remove it from the list of entries. */
428 *pp = action->next;
429 if (!desc->action) {
430 desc->status |= IRQ_DISABLED;
431 desc->handler->shutdown(irq);
432 }
433 spin_unlock_irqrestore(&desc->lock,flags);
434
435#ifdef CONFIG_SMP
436 /* Wait to make sure it's not being used on
437 another CPU. */
438 while (desc->status & IRQ_INPROGRESS)
439 barrier();
440#endif
441 kfree(action);
442 return;
443 }
444 printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
445 spin_unlock_irqrestore(&desc->lock,flags);
446 return;
447 }
448}
449
450EXPORT_SYMBOL(free_irq);
451
452int 65int
453show_interrupts(struct seq_file *p, void *v) 66show_interrupts(struct seq_file *p, void *v)
454{ 67{
@@ -531,10 +144,6 @@ handle_irq(int irq, struct pt_regs * regs)
531 * 0 return value means that this irq is already being 144 * 0 return value means that this irq is already being
532 * handled by some other CPU. (or is disabled) 145 * handled by some other CPU. (or is disabled)
533 */ 146 */
534 int cpu = smp_processor_id();
535 irq_desc_t *desc = irq_desc + irq;
536 struct irqaction * action;
537 unsigned int status;
538 static unsigned int illegal_count=0; 147 static unsigned int illegal_count=0;
539 148
540 if ((unsigned) irq > ACTUAL_NR_IRQS && illegal_count < MAX_ILLEGAL_IRQS ) { 149 if ((unsigned) irq > ACTUAL_NR_IRQS && illegal_count < MAX_ILLEGAL_IRQS ) {
@@ -546,229 +155,8 @@ handle_irq(int irq, struct pt_regs * regs)
546 } 155 }
547 156
548 irq_enter(); 157 irq_enter();
549 kstat_cpu(cpu).irqs[irq]++; 158 local_irq_disable();
550 spin_lock_irq(&desc->lock); /* mask also the higher prio events */ 159 __do_IRQ(irq, regs);
551 desc->handler->ack(irq); 160 local_irq_enable();
552 /*
553 * REPLAY is when Linux resends an IRQ that was dropped earlier.
554 * WAITING is used by probe to mark irqs that are being tested.
555 */
556 status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
557 status |= IRQ_PENDING; /* we _want_ to handle it */
558
559 /*
560 * If the IRQ is disabled for whatever reason, we cannot
561 * use the action we have.
562 */
563 action = NULL;
564 if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) {
565 action = desc->action;
566 status &= ~IRQ_PENDING; /* we commit to handling */
567 status |= IRQ_INPROGRESS; /* we are handling it */
568 }
569 desc->status = status;
570
571 /*
572 * If there is no IRQ handler or it was disabled, exit early.
573 * Since we set PENDING, if another processor is handling
574 * a different instance of this same irq, the other processor
575 * will take care of it.
576 */
577 if (!action)
578 goto out;
579
580 /*
581 * Edge triggered interrupts need to remember pending events.
582 * This applies to any hw interrupts that allow a second
583 * instance of the same irq to arrive while we are in handle_irq
584 * or in the handler. But the code here only handles the _second_
585 * instance of the irq, not the third or fourth. So it is mostly
586 * useful for irq hardware that does not mask cleanly in an
587 * SMP environment.
588 */
589 for (;;) {
590 spin_unlock(&desc->lock);
591 handle_IRQ_event(irq, regs, action);
592 spin_lock(&desc->lock);
593
594 if (!(desc->status & IRQ_PENDING)
595 || (desc->status & IRQ_LEVEL))
596 break;
597 desc->status &= ~IRQ_PENDING;
598 }
599 desc->status &= ~IRQ_INPROGRESS;
600out:
601 /*
602 * The ->end() handler has to deal with interrupts which got
603 * disabled while the handler was running.
604 */
605 desc->handler->end(irq);
606 spin_unlock(&desc->lock);
607
608 irq_exit(); 161 irq_exit();
609} 162}
610
611/*
612 * IRQ autodetection code..
613 *
614 * This depends on the fact that any interrupt that
615 * comes in on to an unassigned handler will get stuck
616 * with "IRQ_WAITING" cleared and the interrupt
617 * disabled.
618 */
619unsigned long
620probe_irq_on(void)
621{
622 int i;
623 irq_desc_t *desc;
624 unsigned long delay;
625 unsigned long val;
626
627 /* Something may have generated an irq long ago and we want to
628 flush such a longstanding irq before considering it as spurious. */
629 for (i = NR_IRQS-1; i >= 0; i--) {
630 desc = irq_desc + i;
631
632 spin_lock_irq(&desc->lock);
633 if (!irq_desc[i].action)
634 irq_desc[i].handler->startup(i);
635 spin_unlock_irq(&desc->lock);
636 }
637
638 /* Wait for longstanding interrupts to trigger. */
639 for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
640 /* about 20ms delay */ barrier();
641
642 /* enable any unassigned irqs (we must startup again here because
643 if a longstanding irq happened in the previous stage, it may have
644 masked itself) first, enable any unassigned irqs. */
645 for (i = NR_IRQS-1; i >= 0; i--) {
646 desc = irq_desc + i;
647
648 spin_lock_irq(&desc->lock);
649 if (!desc->action) {
650 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
651 if (desc->handler->startup(i))
652 desc->status |= IRQ_PENDING;
653 }
654 spin_unlock_irq(&desc->lock);
655 }
656
657 /*
658 * Wait for spurious interrupts to trigger
659 */
660 for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
661 /* about 100ms delay */ barrier();
662
663 /*
664 * Now filter out any obviously spurious interrupts
665 */
666 val = 0;
667 for (i=0; i<NR_IRQS; i++) {
668 irq_desc_t *desc = irq_desc + i;
669 unsigned int status;
670
671 spin_lock_irq(&desc->lock);
672 status = desc->status;
673
674 if (status & IRQ_AUTODETECT) {
675 /* It triggered already - consider it spurious. */
676 if (!(status & IRQ_WAITING)) {
677 desc->status = status & ~IRQ_AUTODETECT;
678 desc->handler->shutdown(i);
679 } else
680 if (i < 32)
681 val |= 1 << i;
682 }
683 spin_unlock_irq(&desc->lock);
684 }
685
686 return val;
687}
688
689EXPORT_SYMBOL(probe_irq_on);
690
691/*
692 * Return a mask of triggered interrupts (this
693 * can handle only legacy ISA interrupts).
694 */
695unsigned int
696probe_irq_mask(unsigned long val)
697{
698 int i;
699 unsigned int mask;
700
701 mask = 0;
702 for (i = 0; i < NR_IRQS; i++) {
703 irq_desc_t *desc = irq_desc + i;
704 unsigned int status;
705
706 spin_lock_irq(&desc->lock);
707 status = desc->status;
708
709 if (status & IRQ_AUTODETECT) {
710 /* We only react to ISA interrupts */
711 if (!(status & IRQ_WAITING)) {
712 if (i < 16)
713 mask |= 1 << i;
714 }
715
716 desc->status = status & ~IRQ_AUTODETECT;
717 desc->handler->shutdown(i);
718 }
719 spin_unlock_irq(&desc->lock);
720 }
721
722 return mask & val;
723}
724
725/*
726 * Get the result of the IRQ probe.. A negative result means that
727 * we have several candidates (but we return the lowest-numbered
728 * one).
729 */
730
731int
732probe_irq_off(unsigned long val)
733{
734 int i, irq_found, nr_irqs;
735
736 nr_irqs = 0;
737 irq_found = 0;
738 for (i=0; i<NR_IRQS; i++) {
739 irq_desc_t *desc = irq_desc + i;
740 unsigned int status;
741
742 spin_lock_irq(&desc->lock);
743 status = desc->status;
744
745 if (status & IRQ_AUTODETECT) {
746 if (!(status & IRQ_WAITING)) {
747 if (!nr_irqs)
748 irq_found = i;
749 nr_irqs++;
750 }
751 desc->status = status & ~IRQ_AUTODETECT;
752 desc->handler->shutdown(i);
753 }
754 spin_unlock_irq(&desc->lock);
755 }
756
757 if (nr_irqs > 1)
758 irq_found = -irq_found;
759 return irq_found;
760}
761
762EXPORT_SYMBOL(probe_irq_off);
763
764#ifdef CONFIG_SMP
765void synchronize_irq(unsigned int irq)
766{
767 /* is there anything to synchronize with? */
768 if (!irq_desc[irq].action)
769 return;
770
771 while (irq_desc[irq].status & IRQ_INPROGRESS)
772 barrier();
773}
774#endif
diff --git a/arch/cris/arch-v10/kernel/kgdb.c b/arch/cris/arch-v10/kernel/kgdb.c
index b72e6a91a639..34528da98817 100644
--- a/arch/cris/arch-v10/kernel/kgdb.c
+++ b/arch/cris/arch-v10/kernel/kgdb.c
@@ -569,12 +569,6 @@ gdb_cris_strtol (const char *s, char **endptr, int base)
569 return x; 569 return x;
570} 570}
571 571
572int
573double_this(int x)
574{
575 return 2 * x;
576}
577
578/********************************* Register image ****************************/ 572/********************************* Register image ****************************/
579/* Copy the content of a register image into another. The size n is 573/* Copy the content of a register image into another. The size n is
580 the size of the register image. Due to struct assignment generation of 574 the size of the register image. Due to struct assignment generation of
diff --git a/arch/frv/kernel/Makefile b/arch/frv/kernel/Makefile
index 981c2c7dec0d..422f30ede575 100644
--- a/arch/frv/kernel/Makefile
+++ b/arch/frv/kernel/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_FUJITSU_MB93493) += irq-mb93493.o
20obj-$(CONFIG_PM) += pm.o cmode.o 20obj-$(CONFIG_PM) += pm.o cmode.o
21obj-$(CONFIG_MB93093_PDK) += pm-mb93093.o 21obj-$(CONFIG_MB93093_PDK) += pm-mb93093.o
22obj-$(CONFIG_SYSCTL) += sysctl.o 22obj-$(CONFIG_SYSCTL) += sysctl.o
23obj-$(CONFIG_FUTEX) += futex.o
diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index ad10ea595459..5f6548388b74 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1076,7 +1076,7 @@ __entry_work_notifysig:
1076 LEDS 0x6410 1076 LEDS 0x6410
1077 ori.p gr4,#0,gr8 1077 ori.p gr4,#0,gr8
1078 call do_notify_resume 1078 call do_notify_resume
1079 bra __entry_return_direct 1079 bra __entry_resume_userspace
1080 1080
1081 # perform syscall entry tracing 1081 # perform syscall entry tracing
1082__syscall_trace_entry: 1082__syscall_trace_entry:
diff --git a/arch/frv/kernel/futex.c b/arch/frv/kernel/futex.c
new file mode 100644
index 000000000000..eae874a970c6
--- /dev/null
+++ b/arch/frv/kernel/futex.c
@@ -0,0 +1,242 @@
1/* futex.c: futex operations
2 *
3 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/futex.h>
13#include <asm/futex.h>
14#include <asm/errno.h>
15#include <asm/uaccess.h>
16
17/*
18 * the various futex operations; MMU fault checking is ignored under no-MMU
19 * conditions
20 */
21static inline int atomic_futex_op_xchg_set(int oparg, int __user *uaddr, int *_oldval)
22{
23 int oldval, ret;
24
25 asm("0: \n"
26 " orcc gr0,gr0,gr0,icc3 \n" /* set ICC3.Z */
27 " ckeq icc3,cc7 \n"
28 "1: ld.p %M0,%1 \n" /* LD.P/ORCR must be atomic */
29 " orcr cc7,cc7,cc3 \n" /* set CC3 to true */
30 "2: cst.p %3,%M0 ,cc3,#1 \n"
31 " corcc gr29,gr29,gr0 ,cc3,#1 \n" /* clear ICC3.Z if store happens */
32 " beq icc3,#0,0b \n"
33 " setlos 0,%2 \n"
34 "3: \n"
35 ".subsection 2 \n"
36 "4: setlos %5,%2 \n"
37 " bra 3b \n"
38 ".previous \n"
39 ".section __ex_table,\"a\" \n"
40 " .balign 8 \n"
41 " .long 1b,4b \n"
42 " .long 2b,4b \n"
43 ".previous"
44 : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
45 : "3"(oparg), "i"(-EFAULT)
46 : "memory", "cc7", "cc3", "icc3"
47 );
48
49 *_oldval = oldval;
50 return ret;
51}
52
53static inline int atomic_futex_op_xchg_add(int oparg, int __user *uaddr, int *_oldval)
54{
55 int oldval, ret;
56
57 asm("0: \n"
58 " orcc gr0,gr0,gr0,icc3 \n" /* set ICC3.Z */
59 " ckeq icc3,cc7 \n"
60 "1: ld.p %M0,%1 \n" /* LD.P/ORCR must be atomic */
61 " orcr cc7,cc7,cc3 \n" /* set CC3 to true */
62 " add %1,%3,%3 \n"
63 "2: cst.p %3,%M0 ,cc3,#1 \n"
64 " corcc gr29,gr29,gr0 ,cc3,#1 \n" /* clear ICC3.Z if store happens */
65 " beq icc3,#0,0b \n"
66 " setlos 0,%2 \n"
67 "3: \n"
68 ".subsection 2 \n"
69 "4: setlos %5,%2 \n"
70 " bra 3b \n"
71 ".previous \n"
72 ".section __ex_table,\"a\" \n"
73 " .balign 8 \n"
74 " .long 1b,4b \n"
75 " .long 2b,4b \n"
76 ".previous"
77 : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
78 : "3"(oparg), "i"(-EFAULT)
79 : "memory", "cc7", "cc3", "icc3"
80 );
81
82 *_oldval = oldval;
83 return ret;
84}
85
86static inline int atomic_futex_op_xchg_or(int oparg, int __user *uaddr, int *_oldval)
87{
88 int oldval, ret;
89
90 asm("0: \n"
91 " orcc gr0,gr0,gr0,icc3 \n" /* set ICC3.Z */
92 " ckeq icc3,cc7 \n"
93 "1: ld.p %M0,%1 \n" /* LD.P/ORCR must be atomic */
94 " orcr cc7,cc7,cc3 \n" /* set CC3 to true */
95 " or %1,%3,%3 \n"
96 "2: cst.p %3,%M0 ,cc3,#1 \n"
97 " corcc gr29,gr29,gr0 ,cc3,#1 \n" /* clear ICC3.Z if store happens */
98 " beq icc3,#0,0b \n"
99 " setlos 0,%2 \n"
100 "3: \n"
101 ".subsection 2 \n"
102 "4: setlos %5,%2 \n"
103 " bra 3b \n"
104 ".previous \n"
105 ".section __ex_table,\"a\" \n"
106 " .balign 8 \n"
107 " .long 1b,4b \n"
108 " .long 2b,4b \n"
109 ".previous"
110 : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
111 : "3"(oparg), "i"(-EFAULT)
112 : "memory", "cc7", "cc3", "icc3"
113 );
114
115 *_oldval = oldval;
116 return ret;
117}
118
119static inline int atomic_futex_op_xchg_and(int oparg, int __user *uaddr, int *_oldval)
120{
121 int oldval, ret;
122
123 asm("0: \n"
124 " orcc gr0,gr0,gr0,icc3 \n" /* set ICC3.Z */
125 " ckeq icc3,cc7 \n"
126 "1: ld.p %M0,%1 \n" /* LD.P/ORCR must be atomic */
127 " orcr cc7,cc7,cc3 \n" /* set CC3 to true */
128 " and %1,%3,%3 \n"
129 "2: cst.p %3,%M0 ,cc3,#1 \n"
130 " corcc gr29,gr29,gr0 ,cc3,#1 \n" /* clear ICC3.Z if store happens */
131 " beq icc3,#0,0b \n"
132 " setlos 0,%2 \n"
133 "3: \n"
134 ".subsection 2 \n"
135 "4: setlos %5,%2 \n"
136 " bra 3b \n"
137 ".previous \n"
138 ".section __ex_table,\"a\" \n"
139 " .balign 8 \n"
140 " .long 1b,4b \n"
141 " .long 2b,4b \n"
142 ".previous"
143 : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
144 : "3"(oparg), "i"(-EFAULT)
145 : "memory", "cc7", "cc3", "icc3"
146 );
147
148 *_oldval = oldval;
149 return ret;
150}
151
152static inline int atomic_futex_op_xchg_xor(int oparg, int __user *uaddr, int *_oldval)
153{
154 int oldval, ret;
155
156 asm("0: \n"
157 " orcc gr0,gr0,gr0,icc3 \n" /* set ICC3.Z */
158 " ckeq icc3,cc7 \n"
159 "1: ld.p %M0,%1 \n" /* LD.P/ORCR must be atomic */
160 " orcr cc7,cc7,cc3 \n" /* set CC3 to true */
161 " xor %1,%3,%3 \n"
162 "2: cst.p %3,%M0 ,cc3,#1 \n"
163 " corcc gr29,gr29,gr0 ,cc3,#1 \n" /* clear ICC3.Z if store happens */
164 " beq icc3,#0,0b \n"
165 " setlos 0,%2 \n"
166 "3: \n"
167 ".subsection 2 \n"
168 "4: setlos %5,%2 \n"
169 " bra 3b \n"
170 ".previous \n"
171 ".section __ex_table,\"a\" \n"
172 " .balign 8 \n"
173 " .long 1b,4b \n"
174 " .long 2b,4b \n"
175 ".previous"
176 : "+U"(*uaddr), "=&r"(oldval), "=&r"(ret), "=r"(oparg)
177 : "3"(oparg), "i"(-EFAULT)
178 : "memory", "cc7", "cc3", "icc3"
179 );
180
181 *_oldval = oldval;
182 return ret;
183}
184
185/*****************************************************************************/
186/*
187 * do the futex operations
188 */
189int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
190{
191 int op = (encoded_op >> 28) & 7;
192 int cmp = (encoded_op >> 24) & 15;
193 int oparg = (encoded_op << 8) >> 20;
194 int cmparg = (encoded_op << 20) >> 20;
195 int oldval = 0, ret;
196
197 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
198 oparg = 1 << oparg;
199
200 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
201 return -EFAULT;
202
203 inc_preempt_count();
204
205 switch (op) {
206 case FUTEX_OP_SET:
207 ret = atomic_futex_op_xchg_set(oparg, uaddr, &oldval);
208 break;
209 case FUTEX_OP_ADD:
210 ret = atomic_futex_op_xchg_add(oparg, uaddr, &oldval);
211 break;
212 case FUTEX_OP_OR:
213 ret = atomic_futex_op_xchg_or(oparg, uaddr, &oldval);
214 break;
215 case FUTEX_OP_ANDN:
216 ret = atomic_futex_op_xchg_and(~oparg, uaddr, &oldval);
217 break;
218 case FUTEX_OP_XOR:
219 ret = atomic_futex_op_xchg_xor(oparg, uaddr, &oldval);
220 break;
221 default:
222 ret = -ENOSYS;
223 break;
224 }
225
226 dec_preempt_count();
227
228 if (!ret) {
229 switch (cmp) {
230 case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
231 case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
232 case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
233 case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
234 case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
235 case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
236 default: ret = -ENOSYS; break;
237 }
238 }
239
240 return ret;
241
242} /* end futex_atomic_op_inuser() */
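
For reference, the encoded_op decoded above is the packing produced by
the FUTEX_OP() macro in <linux/futex.h>; a sketch of its use, with
illustrative values:

	/* bits 28-31: op     (FUTEX_OP_SET/_ADD/_OR/_ANDN/_XOR)
	 * bits 24-27: cmp    (FUTEX_OP_CMP_EQ/_NE/_LT/_GE/_LE/_GT)
	 * bits 12-23: oparg  (12-bit signed operand)
	 * bits  0-11: cmparg (12-bit signed comparison argument)
	 */
	int encoded_op = FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0);

	/* futex_atomic_op_inuser(encoded_op, uaddr) then atomically adds 1
	 * to *uaddr and returns whether the old value was greater than 0 -
	 * the pattern FUTEX_WAKE_OP uses to decide whether to wake. */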
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index d4ccc0728dfe..5b7146f54fd5 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -35,7 +35,7 @@ struct fdpic_func_descriptor {
35 unsigned long GOT; 35 unsigned long GOT;
36}; 36};
37 37
38asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); 38static int do_signal(sigset_t *oldset);
39 39
40/* 40/*
41 * Atomically swap in the new signal mask, and wait for a signal. 41 * Atomically swap in the new signal mask, and wait for a signal.
@@ -55,7 +55,7 @@ asmlinkage int sys_sigsuspend(int history0, int history1, old_sigset_t mask)
55 while (1) { 55 while (1) {
56 current->state = TASK_INTERRUPTIBLE; 56 current->state = TASK_INTERRUPTIBLE;
57 schedule(); 57 schedule();
58 if (do_signal(__frame, &saveset)) 58 if (do_signal(&saveset))
59 /* return the signal number as the return value of this function 59 /* return the signal number as the return value of this function
60 * - this is an utterly evil hack. syscalls should not invoke do_signal() 60 * - this is an utterly evil hack. syscalls should not invoke do_signal()
 			 *   as entry.S sets regs->gr8 to the return value of the system call
@@ -91,7 +91,7 @@ asmlinkage int sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
 	while (1) {
 		current->state = TASK_INTERRUPTIBLE;
 		schedule();
-		if (do_signal(__frame, &saveset))
+		if (do_signal(&saveset))
 			/* return the signal number as the return value of this function
 			 * - this is an utterly evil hack. syscalls should not invoke do_signal()
 			 *   as entry.S sets regs->gr8 to the return value of the system call
@@ -276,13 +276,12 @@ static int setup_sigcontext(struct sigcontext __user *sc, unsigned long mask)
  * Determine which stack to use..
  */
 static inline void __user *get_sigframe(struct k_sigaction *ka,
-					struct pt_regs *regs,
 					size_t frame_size)
 {
 	unsigned long sp;
 
 	/* Default to using normal stack */
-	sp = regs->sp;
+	sp = __frame->sp;
 
 	/* This is the X/Open sanctioned signal stack switching. */
 	if (ka->sa.sa_flags & SA_ONSTACK) {
@@ -291,18 +290,19 @@ static inline void __user *get_sigframe(struct k_sigaction *ka,
 	}
 
 	return (void __user *) ((sp - frame_size) & ~7UL);
+
 } /* end get_sigframe() */
 
 /*****************************************************************************/
 /*
  *
  */
-static void setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs)
+static int setup_frame(int sig, struct k_sigaction *ka, sigset_t *set)
 {
 	struct sigframe __user *frame;
 	int rsig;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, sizeof(*frame));
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
@@ -346,47 +346,51 @@ static void setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct p
 	}
 
 	/* set up registers for signal handler */
-	regs->sp = (unsigned long) frame;
-	regs->lr = (unsigned long) &frame->retcode;
-	regs->gr8 = sig;
+	__frame->sp = (unsigned long) frame;
+	__frame->lr = (unsigned long) &frame->retcode;
+	__frame->gr8 = sig;
 
 	if (get_personality & FDPIC_FUNCPTRS) {
 		struct fdpic_func_descriptor __user *funcptr =
 			(struct fdpic_func_descriptor *) ka->sa.sa_handler;
-		__get_user(regs->pc, &funcptr->text);
-		__get_user(regs->gr15, &funcptr->GOT);
+		__get_user(__frame->pc, &funcptr->text);
+		__get_user(__frame->gr15, &funcptr->GOT);
 	} else {
-		regs->pc = (unsigned long) ka->sa.sa_handler;
-		regs->gr15 = 0;
+		__frame->pc = (unsigned long) ka->sa.sa_handler;
+		__frame->gr15 = 0;
 	}
 
 	set_fs(USER_DS);
 
+	/* the tracer may want to single-step inside the handler */
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
+
 #if DEBUG_SIG
 	printk("SIG deliver %d (%s:%d): sp=%p pc=%lx ra=%p\n",
-	       sig, current->comm, current->pid, frame, regs->pc, frame->pretcode);
+	       sig, current->comm, current->pid, frame, __frame->pc,
+	       frame->pretcode);
 #endif
 
-	return;
+	return 1;
 
 give_sigsegv:
-	if (sig == SIGSEGV)
-		ka->sa.sa_handler = SIG_DFL;
-
 	force_sig(SIGSEGV, current);
+	return 0;
+
 } /* end setup_frame() */
 
 /*****************************************************************************/
 /*
  *
  */
-static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-			   sigset_t *set, struct pt_regs * regs)
+static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+			   sigset_t *set)
 {
 	struct rt_sigframe __user *frame;
 	int rsig;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, sizeof(*frame));
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
@@ -409,7 +413,7 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	if (__put_user(0, &frame->uc.uc_flags) ||
 	    __put_user(0, &frame->uc.uc_link) ||
 	    __put_user((void*)current->sas_ss_sp, &frame->uc.uc_stack.ss_sp) ||
-	    __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags) ||
+	    __put_user(sas_ss_flags(__frame->sp), &frame->uc.uc_stack.ss_flags) ||
 	    __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size))
 		goto give_sigsegv;
 
@@ -440,34 +444,38 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	}
 
 	/* Set up registers for signal handler */
-	regs->sp = (unsigned long) frame;
-	regs->lr = (unsigned long) &frame->retcode;
-	regs->gr8 = sig;
-	regs->gr9 = (unsigned long) &frame->info;
+	__frame->sp = (unsigned long) frame;
+	__frame->lr = (unsigned long) &frame->retcode;
+	__frame->gr8 = sig;
+	__frame->gr9 = (unsigned long) &frame->info;
 
 	if (get_personality & FDPIC_FUNCPTRS) {
 		struct fdpic_func_descriptor *funcptr =
 			(struct fdpic_func_descriptor __user *) ka->sa.sa_handler;
-		__get_user(regs->pc, &funcptr->text);
-		__get_user(regs->gr15, &funcptr->GOT);
+		__get_user(__frame->pc, &funcptr->text);
+		__get_user(__frame->gr15, &funcptr->GOT);
 	} else {
-		regs->pc = (unsigned long) ka->sa.sa_handler;
-		regs->gr15 = 0;
+		__frame->pc = (unsigned long) ka->sa.sa_handler;
+		__frame->gr15 = 0;
 	}
 
 	set_fs(USER_DS);
 
+	/* the tracer may want to single-step inside the handler */
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
+
 #if DEBUG_SIG
 	printk("SIG deliver %d (%s:%d): sp=%p pc=%lx ra=%p\n",
-	       sig, current->comm, current->pid, frame, regs->pc, frame->pretcode);
+	       sig, current->comm, current->pid, frame, __frame->pc,
+	       frame->pretcode);
 #endif
 
-	return;
+	return 1;
 
 give_sigsegv:
-	if (sig == SIGSEGV)
-		ka->sa.sa_handler = SIG_DFL;
 	force_sig(SIGSEGV, current);
+	return 0;
 
 } /* end setup_rt_frame() */
 
@@ -475,43 +483,51 @@ give_sigsegv:
 /*
  * OK, we're invoking a handler
  */
-static void handle_signal(unsigned long sig, siginfo_t *info,
-			  struct k_sigaction *ka, sigset_t *oldset,
-			  struct pt_regs *regs)
+static int handle_signal(unsigned long sig, siginfo_t *info,
+			  struct k_sigaction *ka, sigset_t *oldset)
 {
+	int ret;
+
 	/* Are we from a system call? */
-	if (in_syscall(regs)) {
+	if (in_syscall(__frame)) {
 		/* If so, check system call restarting.. */
-		switch (regs->gr8) {
+		switch (__frame->gr8) {
 		case -ERESTART_RESTARTBLOCK:
 		case -ERESTARTNOHAND:
-			regs->gr8 = -EINTR;
+			__frame->gr8 = -EINTR;
 			break;
 
 		case -ERESTARTSYS:
 			if (!(ka->sa.sa_flags & SA_RESTART)) {
-				regs->gr8 = -EINTR;
+				__frame->gr8 = -EINTR;
 				break;
 			}
+
 			/* fallthrough */
 		case -ERESTARTNOINTR:
-			regs->gr8 = regs->orig_gr8;
-			regs->pc -= 4;
+			__frame->gr8 = __frame->orig_gr8;
+			__frame->pc -= 4;
 		}
 	}
 
 	/* Set up the stack frame */
 	if (ka->sa.sa_flags & SA_SIGINFO)
-		setup_rt_frame(sig, ka, info, oldset, regs);
+		ret = setup_rt_frame(sig, ka, info, oldset);
 	else
-		setup_frame(sig, ka, oldset, regs);
+		ret = setup_frame(sig, ka, oldset);
+
+	if (ret) {
+		spin_lock_irq(&current->sighand->siglock);
+		sigorsets(&current->blocked, &current->blocked,
+			  &ka->sa.sa_mask);
+		if (!(ka->sa.sa_flags & SA_NODEFER))
+			sigaddset(&current->blocked, sig);
+		recalc_sigpending();
+		spin_unlock_irq(&current->sighand->siglock);
+	}
+
+	return ret;
 
-	spin_lock_irq(&current->sighand->siglock);
-	sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
-	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&current->blocked, sig);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
 } /* end handle_signal() */
 
 /*****************************************************************************/
@@ -520,7 +536,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info,
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-int do_signal(struct pt_regs *regs, sigset_t *oldset)
+static int do_signal(sigset_t *oldset)
 {
 	struct k_sigaction ka;
 	siginfo_t info;
@@ -532,7 +548,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	 * kernel mode. Just return without doing anything
 	 * if so.
 	 */
-	if (!user_mode(regs))
+	if (!user_mode(__frame))
 		return 1;
 
 	if (try_to_freeze())
@@ -541,30 +557,29 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!oldset)
 		oldset = &current->blocked;
 
-	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
-	if (signr > 0) {
-		handle_signal(signr, &info, &ka, oldset, regs);
-		return 1;
-	}
+	signr = get_signal_to_deliver(&info, &ka, __frame, NULL);
+	if (signr > 0)
+		return handle_signal(signr, &info, &ka, oldset);
 
- no_signal:
+no_signal:
 	/* Did we come from a system call? */
-	if (regs->syscallno >= 0) {
+	if (__frame->syscallno >= 0) {
 		/* Restart the system call - no handlers present */
-		if (regs->gr8 == -ERESTARTNOHAND ||
-		    regs->gr8 == -ERESTARTSYS ||
-		    regs->gr8 == -ERESTARTNOINTR) {
-			regs->gr8 = regs->orig_gr8;
-			regs->pc -= 4;
+		if (__frame->gr8 == -ERESTARTNOHAND ||
+		    __frame->gr8 == -ERESTARTSYS ||
+		    __frame->gr8 == -ERESTARTNOINTR) {
+			__frame->gr8 = __frame->orig_gr8;
+			__frame->pc -= 4;
 		}
 
-		if (regs->gr8 == -ERESTART_RESTARTBLOCK){
-			regs->gr8 = __NR_restart_syscall;
-			regs->pc -= 4;
+		if (__frame->gr8 == -ERESTART_RESTARTBLOCK){
+			__frame->gr8 = __NR_restart_syscall;
+			__frame->pc -= 4;
 		}
 	}
 
 	return 0;
+
 } /* end do_signal() */
 
 /*****************************************************************************/
@@ -580,6 +595,6 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
 
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(__frame, NULL);
+		do_signal(NULL);
 
 } /* end do_notify_resume() */
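The frv signal.c changes above all serve one contract change: setup_frame() and setup_rt_frame() now return 1 when the user-space frame was written and 0 when delivery failed with a forced SIGSEGV, and handle_signal() only commits the blocked-signal mask when setup succeeded. A minimal sketch of that pattern, with hypothetical helper names (write_user_frame and block_signal are stand-ins, not the actual frv functions):

	/* Sketch only: illustrates the return-value contract, not the real code. */
	static int handle_one_signal(int sig, struct k_sigaction *ka)
	{
		int ret = write_user_frame(sig, ka);	/* 1 = frame written, 0 = SIGSEGV forced */

		if (ret)
			block_signal(sig, ka);	/* sigorsets + recalc_sigpending, as above */
		return ret;			/* do_signal() hands this straight back */
	}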
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 6004bb0795e0..968fabd8723f 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -464,7 +464,6 @@ config NUMA
 	depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI))
 	default n if X86_PC
 	default y if (X86_NUMAQ || X86_SUMMIT)
-	select SPARSEMEM_STATIC
 
 # Need comments to help the hapless user trying to turn on NUMA support
 comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support"
@@ -493,6 +492,10 @@ config HAVE_ARCH_ALLOC_REMAP
 	depends on NUMA
 	default y
 
+config ARCH_FLATMEM_ENABLE
+	def_bool y
+	depends on (ARCH_SELECT_MEMORY_MODEL && X86_PC)
+
 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
 	depends on NUMA
@@ -503,7 +506,8 @@ config ARCH_DISCONTIGMEM_DEFAULT
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
-	depends on NUMA
+	depends on (NUMA || (X86_PC && EXPERIMENTAL))
+	select SPARSEMEM_STATIC
 
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu
index 53bbb3c008ee..79603b3471f9 100644
--- a/arch/i386/Kconfig.cpu
+++ b/arch/i386/Kconfig.cpu
@@ -39,6 +39,7 @@ config M386
39 - "Winchip-2" for IDT Winchip 2. 39 - "Winchip-2" for IDT Winchip 2.
40 - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. 40 - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
41 - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). 41 - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
42 - "Geode GX/LX" For AMD Geode GX and LX processors.
42 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. 43 - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
43 - "VIA C3-2 for VIA C3-2 "Nehemiah" (model 9 and above). 44 - "VIA C3-2 for VIA C3-2 "Nehemiah" (model 9 and above).
44 45
@@ -171,6 +172,11 @@ config MGEODEGX1
171 help 172 help
172 Select this for a Geode GX1 (Cyrix MediaGX) chip. 173 Select this for a Geode GX1 (Cyrix MediaGX) chip.
173 174
175config MGEODE_LX
176 bool "Geode GX/LX"
177 help
178 Select this for AMD Geode GX and LX processors.
179
174config MCYRIXIII 180config MCYRIXIII
175 bool "CyrixIII/VIA-C3" 181 bool "CyrixIII/VIA-C3"
176 help 182 help
@@ -220,8 +226,8 @@ config X86_XADD
220config X86_L1_CACHE_SHIFT 226config X86_L1_CACHE_SHIFT
221 int 227 int
222 default "7" if MPENTIUM4 || X86_GENERIC 228 default "7" if MPENTIUM4 || X86_GENERIC
223 default "4" if X86_ELAN || M486 || M386 229 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
224 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1 230 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
225 default "6" if MK7 || MK8 || MPENTIUMM 231 default "6" if MK7 || MK8 || MPENTIUMM
226 232
227config RWSEM_GENERIC_SPINLOCK 233config RWSEM_GENERIC_SPINLOCK
@@ -290,12 +296,12 @@ config X86_INTEL_USERCOPY
290 296
291config X86_USE_PPRO_CHECKSUM 297config X86_USE_PPRO_CHECKSUM
292 bool 298 bool
293 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON 299 depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX
294 default y 300 default y
295 301
296config X86_USE_3DNOW 302config X86_USE_3DNOW
297 bool 303 bool
298 depends on MCYRIXIII || MK7 304 depends on MCYRIXIII || MK7 || MGEODE_LX
299 default y 305 default y
300 306
301config X86_OOSTORE 307config X86_OOSTORE
diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug
index c48b424dd640..bf32ecc9ad04 100644
--- a/arch/i386/Kconfig.debug
+++ b/arch/i386/Kconfig.debug
@@ -42,6 +42,16 @@ config DEBUG_PAGEALLOC
 	  This results in a large slowdown, but helps to find certain types
 	  of memory corruptions.
 
+config DEBUG_RODATA
+	bool "Write protect kernel read-only data structures"
+	depends on DEBUG_KERNEL
+	help
+	  Mark the kernel read-only data as write-protected in the pagetables,
+	  in order to catch accidental (and incorrect) writes to such const
+	  data. This option may have a slight performance impact because a
+	  portion of the kernel code won't be covered by a 2MB TLB anymore.
+	  If in doubt, say "N".
+
 config 4KSTACKS
 	bool "Use 4Kb for kernel stacks instead of 8Kb"
 	depends on DEBUG_KERNEL
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 496a2c9909fe..d8f94e78de8a 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -721,7 +721,7 @@ static int __init apic_set_verbosity(char *str)
 		apic_verbosity = APIC_VERBOSE;
 	else
 		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
-			" use apic=verbose or apic=debug", str);
+			" use apic=verbose or apic=debug\n", str);
 
 	return 0;
 }
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 1e60acbed3c1..2d793d4aef1a 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -303,17 +303,6 @@ extern int (*console_blank_hook)(int);
 #include "apm.h"
 
 /*
- * Define to make all _set_limit calls use 64k limits. The APM 1.1 BIOS is
- * supposed to provide limit information that it recognizes. Many machines
- * do this correctly, but many others do not restrict themselves to their
- * claimed limit. When this happens, they will cause a segmentation
- * violation in the kernel at boot time. Most BIOS's, however, will
- * respect a 64k limit, so we use that. If you want to be pedantic and
- * hold your BIOS to its claims, then undefine this.
- */
-#define APM_RELAX_SEGMENTS
-
-/*
  * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
  * This patched by Chad Miller <cmiller@surfsouth.com>, original code by
  * David Chen <chen@ctpa04.mit.edu>
@@ -1075,22 +1064,23 @@ static int apm_engage_power_management(u_short device, int enable)
 
 static int apm_console_blank(int blank)
 {
-	int error;
+	int error, i;
 	u_short state;
+	static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
 
 	state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
-	/* Blank the first display device */
-	error = set_power_state(0x100, state);
-	if ((error != APM_SUCCESS) && (error != APM_NO_ERROR)) {
-		/* try to blank them all instead */
-		error = set_power_state(0x1ff, state);
-		if ((error != APM_SUCCESS) && (error != APM_NO_ERROR))
-			/* try to blank device one instead */
-			error = set_power_state(0x101, state);
+
+	for (i = 0; i < ARRAY_SIZE(dev); i++) {
+		error = set_power_state(dev[i], state);
+
+		if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
+			return 1;
+
+		if (error == APM_NOT_ENGAGED)
+			break;
 	}
-	if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
-		return 1;
-	if (error == APM_NOT_ENGAGED) {
+
+	if (error == APM_NOT_ENGAGED && state != APM_STATE_READY) {
 		static int tried;
 		int eng_error;
 		if (tried++ == 0) {
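The rewritten apm_console_blank() walks the three APM display-device IDs in escalating order (0x100 = first display, 0x1FF = all displays, 0x101 = second display), returning on the first success and bailing out early on APM_NOT_ENGAGED. A self-contained model of that loop shape; the stub and its return values are invented for illustration:

	#include <stdio.h>

	#define APM_SUCCESS	0
	#define APM_NO_ERROR	1	/* placeholder values for the demo */
	#define APM_NOT_ENGAGED	3

	/* Stub: pretend only the "all displays" target accepts the request. */
	static int set_power_state_stub(unsigned int dev, int state)
	{
		return (dev == 0x1FF) ? APM_SUCCESS : 2 /* some other error */;
	}

	int main(void)
	{
		static const unsigned int dev[3] = { 0x100, 0x1FF, 0x101 };
		int error = 0;
		unsigned int i;

		for (i = 0; i < sizeof(dev) / sizeof(dev[0]); i++) {
			error = set_power_state_stub(dev[i], 1);
			if (error == APM_SUCCESS || error == APM_NO_ERROR) {
				printf("blanked via device 0x%X\n", dev[i]);
				return 0;
			}
			if (error == APM_NOT_ENGAGED)
				break;	/* no point trying further targets */
		}
		printf("console blanking failed (error %d)\n", error);
		return 1;
	}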
@@ -2233,8 +2223,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
 static int __init apm_init(void)
 {
 	struct proc_dir_entry *apm_proc;
+	struct desc_struct *gdt;
 	int ret;
-	int i;
 
 	dmi_check_system(apm_dmi_table);
 
@@ -2312,45 +2302,30 @@ static int __init apm_init(void)
 	set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
 	_set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
 
+	/*
+	 * Set up the long jump entry point to the APM BIOS, which is called
+	 * from inline assembly.
+	 */
 	apm_bios_entry.offset = apm_info.bios.offset;
 	apm_bios_entry.segment = APM_CS;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		struct desc_struct *gdt = get_cpu_gdt_table(i);
-		set_base(gdt[APM_CS >> 3],
-			 __va((unsigned long)apm_info.bios.cseg << 4));
-		set_base(gdt[APM_CS_16 >> 3],
-			 __va((unsigned long)apm_info.bios.cseg_16 << 4));
-		set_base(gdt[APM_DS >> 3],
-			 __va((unsigned long)apm_info.bios.dseg << 4));
-#ifndef APM_RELAX_SEGMENTS
-		if (apm_info.bios.version == 0x100) {
-#endif
-			/* For ASUS motherboard, Award BIOS rev 110 (and others?) */
-			_set_limit((char *)&gdt[APM_CS >> 3], 64 * 1024 - 1);
-			/* For some unknown machine. */
-			_set_limit((char *)&gdt[APM_CS_16 >> 3], 64 * 1024 - 1);
-			/* For the DEC Hinote Ultra CT475 (and others?) */
-			_set_limit((char *)&gdt[APM_DS >> 3], 64 * 1024 - 1);
-#ifndef APM_RELAX_SEGMENTS
-		} else {
-			_set_limit((char *)&gdt[APM_CS >> 3],
-				(apm_info.bios.cseg_len - 1) & 0xffff);
-			_set_limit((char *)&gdt[APM_CS_16 >> 3],
-				(apm_info.bios.cseg_16_len - 1) & 0xffff);
-			_set_limit((char *)&gdt[APM_DS >> 3],
-				(apm_info.bios.dseg_len - 1) & 0xffff);
-			/* workaround for broken BIOSes */
-			if (apm_info.bios.cseg_len <= apm_info.bios.offset)
-				_set_limit((char *)&gdt[APM_CS >> 3], 64 * 1024 -1);
-			if (apm_info.bios.dseg_len <= 0x40) { /* 0x40 * 4kB == 64kB */
-				/* for the BIOS that assumes granularity = 1 */
-				gdt[APM_DS >> 3].b |= 0x800000;
-				printk(KERN_NOTICE "apm: we set the granularity of dseg.\n");
-			}
-		}
-#endif
-	}
+	/*
+	 * The APM 1.1 BIOS is supposed to provide limit information that it
+	 * recognizes. Many machines do this correctly, but many others do
+	 * not restrict themselves to their claimed limit. When this happens,
+	 * they will cause a segmentation violation in the kernel at boot time.
+	 * Most BIOS's, however, will respect a 64k limit, so we use that.
+	 *
+	 * Note we only set APM segments on CPU zero, since we pin the APM
+	 * code to that CPU.
+	 */
+	gdt = get_cpu_gdt_table(0);
+	set_base(gdt[APM_CS >> 3],
+		 __va((unsigned long)apm_info.bios.cseg << 4));
+	set_base(gdt[APM_CS_16 >> 3],
+		 __va((unsigned long)apm_info.bios.cseg_16 << 4));
+	set_base(gdt[APM_DS >> 3],
+		 __va((unsigned long)apm_info.bios.dseg << 4));
 
 	apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info);
 	if (apm_proc)
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e344ef88cfcd..e7697e077f6b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -161,8 +161,13 @@ static void __init init_amd(struct cpuinfo_x86 *c)
 			set_bit(X86_FEATURE_K6_MTRR, c->x86_capability);
 			break;
 		}
-		break;
 
+		if (c->x86_model == 10) {
+			/* AMD Geode LX is model 10 */
+			/* placeholder for any needed mods */
+			break;
+		}
+		break;
 	case 6: /* An Athlon/Duron */
 
 		/* Bit 15 of Athlon specific MSR 15, needs to be 0
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 31e344b26bae..cca655688ffc 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,9 +18,6 @@
 
 #include "cpu.h"
 
-DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
-EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
-
 DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
 EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
 
@@ -599,11 +596,6 @@ void __devinit cpu_init(void)
 	load_idt(&idt_descr);
 
 	/*
-	 * Delete NT
-	 */
-	__asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
-
-	/*
 	 * Set up and load the per-CPU TSS and LDT
 	 */
 	atomic_inc(&init_mm.mm_count);
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index ff87cc22b323..75015975d038 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -343,6 +343,31 @@ static void __init init_cyrix(struct cpuinfo_x86 *c)
 }
 
 /*
+ * Handle National Semiconductor branded processors
+ */
+static void __devinit init_nsc(struct cpuinfo_x86 *c)
+{
+	/* There may be GX1 processors in the wild that are branded
+	 * NSC and not Cyrix.
+	 *
+	 * This function only handles the GX processor, and kicks every
+	 * thing else to the Cyrix init function above - that should
+	 * cover any processors that might have been branded differently
+	 * after NSC aquired Cyrix.
+	 *
+	 * If this breaks your GX1 horribly, please e-mail
+	 * info-linux@ldcmail.amd.com to tell us.
+	 */
+
+	/* Handle the GX (Formally known as the GX2) */
+
+	if (c->x86 == 5 && c->x86_model == 5)
+		display_cacheinfo(c);
+	else
+		init_cyrix(c);
+}
+
+/*
  * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
  * by the fact that they preserve the flags across the division of 5/2.
  * PII and PPro exhibit this behavior too, but they have cpuid available.
@@ -422,7 +447,7 @@ int __init cyrix_init_cpu(void)
 static struct cpu_dev nsc_cpu_dev __initdata = {
 	.c_vendor	= "NSC",
 	.c_ident	= { "Geode by NSC" },
-	.c_init		= init_cyrix,
+	.c_init		= init_nsc,
 	.c_identify	= generic_identify,
 };
 
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 13bae799e626..006141d1c12a 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -117,14 +117,13 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
 {
 	char __user *tmp = buf;
 	u32 data[4];
-	size_t rv;
 	u32 reg = *ppos;
 	int cpu = iminor(file->f_dentry->d_inode);
 
 	if (count % 16)
 		return -EINVAL;	/* Invalid chunk size */
 
-	for (rv = 0; count; count -= 16) {
+	for (; count; count -= 16) {
 		do_cpuid(cpu, reg, data);
 		if (copy_to_user(tmp, &data, 16))
 			return -EFAULT;
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index e50b93155249..607c06007508 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -657,6 +657,7 @@ ENTRY(spurious_interrupt_bug)
 	pushl $do_spurious_interrupt_bug
 	jmp error_code
 
+.section .rodata,"a"
 #include "syscall_table.S"
 
 syscall_table_size=(.-sys_call_table)
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index e437fb367498..5884469f6bfe 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -504,19 +504,24 @@ ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* 0x80 TSS descriptor */
 	.quad 0x0000000000000000	/* 0x88 LDT descriptor */
 
-	/* Segments used for calling PnP BIOS */
-	.quad 0x00c09a0000000000	/* 0x90 32-bit code */
-	.quad 0x00809a0000000000	/* 0x98 16-bit code */
-	.quad 0x0080920000000000	/* 0xa0 16-bit data */
-	.quad 0x0080920000000000	/* 0xa8 16-bit data */
-	.quad 0x0080920000000000	/* 0xb0 16-bit data */
+	/*
+	 * Segments used for calling PnP BIOS have byte granularity.
+	 * They code segments and data segments have fixed 64k limits,
+	 * the transfer segment sizes are set at run time.
+	 */
+	.quad 0x00409a000000ffff	/* 0x90 32-bit code */
+	.quad 0x00009a000000ffff	/* 0x98 16-bit code */
+	.quad 0x000092000000ffff	/* 0xa0 16-bit data */
+	.quad 0x0000920000000000	/* 0xa8 16-bit data */
+	.quad 0x0000920000000000	/* 0xb0 16-bit data */
+
 	/*
 	 * The APM segments have byte granularity and their bases
-	 * and limits are set at run time.
+	 * are set at run time. All have 64k limits.
 	 */
-	.quad 0x00409a0000000000	/* 0xb8 APM CS    code */
-	.quad 0x00009a0000000000	/* 0xc0 APM CS 16 code (16 bit) */
-	.quad 0x0040920000000000	/* 0xc8 APM DS    data */
+	.quad 0x00409a000000ffff	/* 0xb8 APM CS    code */
+	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
+	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
 
 	.quad 0x0000920000000000	/* 0xd0 - ESPFIX 16-bit SS */
 	.quad 0x0000000000000000	/* 0xd8 - unused */
@@ -525,3 +530,5 @@ ENTRY(cpu_gdt_table)
 	.quad 0x0000000000000000	/* 0xf0 - unused */
 	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
 
+	/* Be sure this is zeroed to avoid false validations in Xen */
+	.fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
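The GDT changes above swap page-granular, zero-limit descriptors for byte-granular ones with an explicit 64k (0xffff) limit. A small, self-contained decoder makes the encoding visible; it is written for this document, not taken from the kernel:

	#include <stdio.h>
	#include <stdint.h>

	/* Decode an 8-byte x86 segment descriptor into base, limit, access, flags. */
	static void decode_desc(uint64_t d)
	{
		uint32_t limit = (uint32_t)(d & 0xffff) | (uint32_t)((d >> 32) & 0xf0000);
		uint32_t base  = (uint32_t)((d >> 16) & 0xffffff) | (uint32_t)((d >> 56) << 24);
		unsigned int access = (unsigned int)(d >> 40) & 0xff;
		unsigned int flags  = (unsigned int)(d >> 52) & 0xf;

		printf("base=0x%08x limit=0x%05x access=0x%02x %s, %s\n",
		       base, limit, access,
		       (flags & 0x8) ? "page-granular" : "byte-granular",
		       (flags & 0x4) ? "32-bit" : "16-bit");
	}

	int main(void)
	{
		decode_desc(0x00409a000000ffffULL);	/* new 0x90/0xb8 entries: 64k, 32-bit code */
		decode_desc(0x00809a0000000000ULL);	/* old 0x98 entry: page-granular, limit 0 */
		return 0;
	}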
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 180f070d03cb..3999bec50c33 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -3,8 +3,7 @@
 #include <asm/checksum.h>
 #include <asm/desc.h>
 
-/* This is definitely a GPL-only symbol */
-EXPORT_SYMBOL_GPL(cpu_gdt_table);
+EXPORT_SYMBOL_GPL(cpu_gdt_descr);
 
 EXPORT_SYMBOL(__down_failed);
 EXPORT_SYMBOL(__down_failed_interruptible);
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 22c8675c79f4..7554f8fd874a 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1722,8 +1722,8 @@ void disable_IO_APIC(void)
 		entry.dest_mode       = 0; /* Physical */
 		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
 		entry.vector          = 0;
-		entry.dest.physical.physical_dest = 0;
-
+		entry.dest.physical.physical_dest =
+			GET_APIC_ID(apic_read(APIC_ID));
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 1ca5269b1e86..91a64016956e 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -38,6 +38,12 @@
 int smp_found_config;
 unsigned int __initdata maxcpus = NR_CPUS;
 
+#ifdef CONFIG_HOTPLUG_CPU
+#define CPU_HOTPLUG_ENABLED	(1)
+#else
+#define CPU_HOTPLUG_ENABLED	(0)
+#endif
+
 /*
  * Various Linux-internal data structures created from the
  * MP-table.
@@ -219,14 +225,18 @@ static void __devinit MP_processor_info (struct mpc_config_processor *m)
 	cpu_set(num_processors, cpu_possible_map);
 	num_processors++;
 
-	if ((num_processors > 8) &&
-	    ((APIC_XAPIC(ver) &&
-	     (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) ||
-	     (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)))
-		def_to_bigsmp = 1;
-	else
-		def_to_bigsmp = 0;
-
+	if (CPU_HOTPLUG_ENABLED || (num_processors > 8)) {
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_INTEL:
+			if (!APIC_XAPIC(ver)) {
+				def_to_bigsmp = 0;
+				break;
+			}
+			/* If P4 and above fall through */
+		case X86_VENDOR_AMD:
+			def_to_bigsmp = 1;
+		}
+	}
 	bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
 }
 
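CPU_HOTPLUG_ENABLED is the usual trick for turning an #ifdef into an ordinary C expression: both branches are always parsed and type-checked, and the compiler folds the constant and discards the dead one. A generic, self-contained sketch of the idiom (CONFIG_FEATURE_X is a made-up symbol):

	#include <stdio.h>

	#ifdef CONFIG_FEATURE_X
	#define FEATURE_X_ENABLED	(1)
	#else
	#define FEATURE_X_ENABLED	(0)
	#endif

	int main(void)
	{
		int num_processors = 4;

		/* Reads like runtime code, but folds to a constant branch. */
		if (FEATURE_X_ENABLED || (num_processors > 8))
			printf("taking the big-SMP path\n");
		else
			printf("taking the default path\n");
		return 0;
	}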
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 44470fea4309..1d0a55e68760 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -172,7 +172,6 @@ static ssize_t msr_read(struct file *file, char __user * buf,
 {
 	u32 __user *tmp = (u32 __user *) buf;
 	u32 data[2];
-	size_t rv;
 	u32 reg = *ppos;
 	int cpu = iminor(file->f_dentry->d_inode);
 	int err;
@@ -180,7 +179,7 @@ static ssize_t msr_read(struct file *file, char __user * buf,
 	if (count % 8)
 		return -EINVAL;	/* Invalid chunk size */
 
-	for (rv = 0; count; count -= 8) {
+	for (; count; count -= 8) {
 		err = do_rdmsr(cpu, reg, &data[0], &data[1]);
 		if (err)
 			return err;
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 2333aead0563..45e7f0ac4b04 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -308,9 +308,7 @@ void show_regs(struct pt_regs * regs)
 	cr0 = read_cr0();
 	cr2 = read_cr2();
 	cr3 = read_cr3();
-	if (current_cpu_data.x86 > 4) {
-		cr4 = read_cr4();
-	}
+	cr4 = read_cr4_safe();
 	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
 	show_trace(NULL, &regs->esp);
 }
@@ -404,17 +402,7 @@ void flush_thread(void)
 
 void release_thread(struct task_struct *dead_task)
 {
-	if (dead_task->mm) {
-		// temporary debugging check
-		if (dead_task->mm->context.size) {
-			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
-					dead_task->comm,
-					dead_task->mm->context.ldt,
-					dead_task->mm->context.size);
-			BUG();
-		}
-	}
-
+	BUG_ON(dead_task->mm);
 	release_vm86_irqs(dead_task);
 }
 
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 5ffbb4b7ad05..5c1fb6aada5b 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -32,9 +32,12 @@
  * in exit.c or in signal.c.
  */
 
-/* determines which flags the user has access to. */
-/* 1 = access 0 = no access */
-#define FLAG_MASK 0x00044dd5
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
+ * Also masks reserved bits (31-22, 15, 5, 3, 1).
+ */
+#define FLAG_MASK 0x00054dd5
 
 /* set's the trap flag. */
 #define TRAP_FLAG 0x100
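The new FLAG_MASK can be checked by OR-ing together the EFLAGS bits the comment says remain user-writable (CF, PF, AF, ZF, SF, TF, DF, OF, NT, RF, AC); relative to the old 0x00044dd5, the delta is that RF (bit 16) is now permitted. A self-contained check:

	#include <stdio.h>

	int main(void)
	{
		unsigned long mask =
			(1ul <<  0) |	/* CF */
			(1ul <<  2) |	/* PF */
			(1ul <<  4) |	/* AF */
			(1ul <<  6) |	/* ZF */
			(1ul <<  7) |	/* SF */
			(1ul <<  8) |	/* TF */
			(1ul << 10) |	/* DF */
			(1ul << 11) |	/* OF */
			(1ul << 14) |	/* NT */
			(1ul << 16) |	/* RF */
			(1ul << 18);	/* AC */

		printf("FLAG_MASK = 0x%08lx\n", mask);	/* prints 0x00054dd5 */
		return 0;
	}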
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 2afe0f8d555a..2fa5803a759d 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -111,12 +111,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
 		},
 	},
-	{	/* Handle problems with rebooting on HP nc6120 */
+	{	/* Handle problems with rebooting on HP laptops */
 		.callback = set_bios_reboot,
-		.ident = "HP Compaq nc6120",
+		.ident = "HP Compaq Laptop",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nc6120"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
 		},
 	},
 	{ }
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index fdfcb0cba9b4..27c956db0461 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -954,6 +954,12 @@ efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
 	return 0;
 }
 
+static int __init
+efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
+{
+	memory_present(0, start, end);
+	return 0;
+}
 
 /*
  * Find the highest page frame number we have available
@@ -965,6 +971,7 @@ void __init find_max_pfn(void)
 	max_pfn = 0;
 	if (efi_enabled) {
 		efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+		efi_memmap_walk(efi_memory_present_wrapper, NULL);
 		return;
 	}
 
@@ -979,6 +986,7 @@ void __init find_max_pfn(void)
 			continue;
 		if (end > max_pfn)
 			max_pfn = end;
+		memory_present(0, start, end);
 	}
 }
 
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 9ed449af8e9f..b3c2e2c26743 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -903,6 +903,12 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
 	unsigned long start_eip;
 	unsigned short nmi_high = 0, nmi_low = 0;
 
+	if (!cpu_gdt_descr[cpu].address &&
+	    !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
+		printk("Failed to allocate GDT for CPU %d\n", cpu);
+		return 1;
+	}
+
 	++cpucount;
 
 	/*
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 9b21a31d4f4e..f7ba4acc20ec 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -1,4 +1,3 @@
-.data
 ENTRY(sys_call_table)
 	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */
 	.long sys_exit
diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
index d395e3b42485..47675bbbb316 100644
--- a/arch/i386/kernel/timers/timer_tsc.c
+++ b/arch/i386/kernel/timers/timer_tsc.c
@@ -330,7 +330,9 @@ int recalibrate_cpu_khz(void)
 	unsigned int cpu_khz_old = cpu_khz;
 
 	if (cpu_has_tsc) {
+		local_irq_disable();
 		init_cpu_khz();
+		local_irq_enable();
 		cpu_data[0].loops_per_jiffy =
 			cpufreq_scale(cpu_data[0].loops_per_jiffy,
 				      cpu_khz_old,
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index ab0e9430f775..53ad954e3ba4 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -306,14 +306,17 @@ void die(const char * str, struct pt_regs * regs, long err)
 		.lock_owner_depth =	0
 	};
 	static int die_counter;
+	unsigned long flags;
 
 	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
-		spin_lock_irq(&die.lock);
+		spin_lock_irqsave(&die.lock, flags);
 		die.lock_owner = smp_processor_id();
 		die.lock_owner_depth = 0;
 		bust_spinlocks(1);
 	}
+	else
+		local_save_flags(flags);
 
 	if (++die.lock_owner_depth < 3) {
 		int nl = 0;
@@ -340,7 +343,7 @@ void die(const char * str, struct pt_regs * regs, long err)
 
 	bust_spinlocks(0);
 	die.lock_owner = -1;
-	spin_unlock_irq(&die.lock);
+	spin_unlock_irqrestore(&die.lock, flags);
 
 	if (kexec_should_crash(current))
 		crash_kexec(regs);
@@ -1075,9 +1078,9 @@ void __init trap_init(void)
 	set_trap_gate(0,&divide_error);
 	set_intr_gate(1,&debug);
 	set_intr_gate(2,&nmi);
-	set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
+	set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
 	set_system_gate(4,&overflow);
-	set_system_gate(5,&bounds);
+	set_trap_gate(5,&bounds);
 	set_trap_gate(6,&invalid_op);
 	set_trap_gate(7,&device_not_available);
 	set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
@@ -1095,6 +1098,28 @@ void __init trap_init(void)
 #endif
 	set_trap_gate(19,&simd_coprocessor_error);
 
+	if (cpu_has_fxsr) {
+		/*
+		 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
+		 * Generates a compile-time "error: zero width for bit-field" if
+		 * the alignment is wrong.
+		 */
+		struct fxsrAlignAssert {
+			int _:!(offsetof(struct task_struct,
+					thread.i387.fxsave) & 15);
+		};
+
+		printk(KERN_INFO "Enabling fast FPU save and restore... ");
+		set_in_cr4(X86_CR4_OSFXSR);
+		printk("done.\n");
+	}
+	if (cpu_has_xmm) {
+		printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
+				"support... ");
+		set_in_cr4(X86_CR4_OSXMMEXCPT);
+		printk("done.\n");
+	}
+
 	set_system_gate(SYSCALL_VECTOR,&system_call);
 
 	/*
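The fxsrAlignAssert structure above is a generic compile-time assertion: the bit-field width is !(expr), which is 0 exactly when expr is non-zero, and a zero-width named bit-field is a hard compiler error. The same trick in a standalone form (GCC-style aligned attribute, written for illustration):

	#include <stddef.h>

	struct demo {
		char pad[3];
		int aligned16 __attribute__((aligned(16)));
	};

	/* Compiles only if the member offset is 16-byte aligned; otherwise
	 * GCC reports "zero width for bit-field", as in trap_init() above. */
	struct align_assert {
		int _:!(offsetof(struct demo, aligned16) & 15);
	};

	int main(void)
	{
		return 0;
	}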
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 06e26f006238..7df494b51a5b 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -735,6 +735,30 @@ void free_initmem(void)
 	printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10);
 }
 
+#ifdef CONFIG_DEBUG_RODATA
+
+extern char __start_rodata, __end_rodata;
+void mark_rodata_ro(void)
+{
+	unsigned long addr = (unsigned long)&__start_rodata;
+
+	for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
+		change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
+
+	printk ("Write protecting the kernel read-only data: %luk\n",
+			(unsigned long)(&__end_rodata - &__start_rodata) >> 10);
+
+	/*
+	 * change_page_attr() requires a global_flush_tlb() call after it.
+	 * We do this after the printk so that if something went wrong in the
+	 * change, the printk gets out at least to give a better debug hint
+	 * of who is the culprit.
+	 */
+	global_flush_tlb();
+}
+#endif
+
+
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
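mark_rodata_ro() retargets every page between __start_rodata and __end_rodata to PAGE_KERNEL_RO, so a stray store into const data faults at once instead of corrupting memory silently. A user-space analogue of the failure class it is meant to catch (behaviour typical of Linux toolchains, shown for illustration only):

	static const char banner[] = "read-only";	/* usually placed in .rodata */

	int main(void)
	{
		char *p = (char *)banner;

		p[0] = 'X';	/* faults (SIGSEGV), much as the kernel would oops */
		return 0;
	}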
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index f600fc244f02..c30a16df6440 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -13,6 +13,7 @@
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
+#include <asm/sections.h>
 
 static DEFINE_SPINLOCK(cpa_lock);
 static struct list_head df_list = LIST_HEAD_INIT(df_list);
@@ -36,7 +37,8 @@ pte_t *lookup_address(unsigned long address)
 	return pte_offset_kernel(pmd, address);
 }
 
-static struct page *split_large_page(unsigned long address, pgprot_t prot)
+static struct page *split_large_page(unsigned long address, pgprot_t prot,
+					pgprot_t ref_prot)
 {
 	int i;
 	unsigned long addr;
@@ -54,7 +56,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot)
 	pbase = (pte_t *)page_address(base);
 	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
 		set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
-					   addr == address ? prot : PAGE_KERNEL));
+					   addr == address ? prot : ref_prot));
 	}
 	return base;
 }
@@ -98,11 +100,18 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
  */
 static inline void revert_page(struct page *kpte_page, unsigned long address)
 {
-	pte_t *linear = (pte_t *)
+	pgprot_t ref_prot;
+	pte_t *linear;
+
+	ref_prot =
+	((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
+		? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
+
+	linear = (pte_t *)
 		pmd_offset(pud_offset(pgd_offset_k(address), address), address);
 	set_pmd_pte(linear, address,
 		    pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
-			    PAGE_KERNEL_LARGE));
+			    ref_prot));
 }
 
 static int
@@ -123,10 +132,16 @@ __change_page_attr(struct page *page, pgprot_t prot)
 	if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
 		set_pte_atomic(kpte, mk_pte(page, prot));
 	} else {
-		struct page *split = split_large_page(address, prot);
+		pgprot_t ref_prot;
+		struct page *split;
+
+		ref_prot =
+		((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
+			? PAGE_KERNEL_EXEC : PAGE_KERNEL;
+		split = split_large_page(address, prot, ref_prot);
 		if (!split)
 			return -ENOMEM;
-		set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
+		set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
 		kpte_page = split;
 	}
 	get_page(kpte_page);
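The ref_prot selection repeated in both hunks above encodes one rule: when a large kernel mapping is split or restored, anything below _etext is kernel text and must keep execute permission, while everything above can use the plain data protection. A compact restatement (sketch only; addr_in_kernel_text is a hypothetical helper, _etext is the kernel's linker-provided end-of-text symbol):

	/* Illustrative helper, not part of the patch. */
	static inline int addr_in_kernel_text(unsigned long address)
	{
		extern char _etext[];	/* end of kernel code, set by the linker */

		return (address & LARGE_PAGE_MASK) < (unsigned long)_etext;
	}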
diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index 19e6f4871d1e..ee8e01697d96 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -846,7 +846,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
 	 * reported by the device if possible.
 	 */
 	newirq = dev->irq;
-	if (!((1 << newirq) & mask)) {
+	if (newirq && !((1 << newirq) & mask)) {
 		if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
 		else printk(KERN_WARNING "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n", newirq, pci_name(dev));
 	}
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 4d100f3886e1..fae67bbb52f6 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -81,6 +81,12 @@ config PLAT_MAPPI2
 config PLAT_MAPPI3
 	bool "Mappi-III(M3A-2170)"
 
+config PLAT_M32104UT
+	bool "M32104UT"
+	help
+	  The M3T-M32104UT is an reference board based on uT-Engine
+	  specification. This board has a M32104 chip.
+
 endchoice
 
 choice
@@ -93,6 +99,10 @@ config CHIP_M32700
 config CHIP_M32102
 	bool "M32102"
 
+config CHIP_M32104
+	bool "M32104"
+	depends on PLAT_M32104UT
+
 config CHIP_VDEC2
 	bool "VDEC2"
 
@@ -115,7 +125,7 @@ config TLB_ENTRIES
 
 config ISA_M32R
 	bool
-	depends on CHIP_M32102
+	depends on CHIP_M32102 || CHIP_M32104
 	default y
 
 config ISA_M32R2
@@ -140,6 +150,7 @@ config BUS_CLOCK
 	default "50000000" if PLAT_MAPPI3
 	default "50000000" if PLAT_M32700UT
 	default "50000000" if PLAT_OPSPUT
+	default "54000000" if PLAT_M32104UT
 	default "33333333" if PLAT_OAKS32R
 	default "20000000" if PLAT_MAPPI2
 
@@ -157,6 +168,7 @@ config MEMORY_START
 	default "08000000" if PLAT_USRV
 	default "08000000" if PLAT_M32700UT
 	default "08000000" if PLAT_OPSPUT
+	default "04000000" if PLAT_M32104UT
 	default "01000000" if PLAT_OAKS32R
 
 config MEMORY_SIZE
@@ -166,6 +178,7 @@ config MEMORY_SIZE
 	default "02000000" if PLAT_USRV
 	default "01000000" if PLAT_M32700UT
 	default "01000000" if PLAT_OPSPUT
+	default "01000000" if PLAT_M32104UT
 	default "00800000" if PLAT_OAKS32R
 
 config NOHIGHMEM
@@ -174,21 +187,22 @@ config NOHIGHMEM
 
 config ARCH_DISCONTIGMEM_ENABLE
 	bool "Internal RAM Support"
-	depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP
+	depends on CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104
 	default y
 
 source "mm/Kconfig"
 
 config IRAM_START
 	hex "Internal memory start address (hex)"
-	default "00f00000"
-	depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP) && DISCONTIGMEM
+	default "00f00000" if !CHIP_M32104
+	default "00700000" if CHIP_M32104
+	depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM
 
 config IRAM_SIZE
 	hex "Internal memory size (hex)"
-	depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP) && DISCONTIGMEM
+	depends on (CHIP_M32700 || CHIP_M32102 || CHIP_VDEC2 || CHIP_OPSP || CHIP_M32104) && DISCONTIGMEM
 	default "00080000" if CHIP_M32700
-	default "00010000" if CHIP_M32102 || CHIP_OPSP
+	default "00010000" if CHIP_M32102 || CHIP_OPSP || CHIP_M32104
 	default "00008000" if CHIP_VDEC2
 
 #
diff --git a/arch/m32r/boot/compressed/head.S b/arch/m32r/boot/compressed/head.S
index 07cfd6ad1ae4..234d8b1e0ac1 100644
--- a/arch/m32r/boot/compressed/head.S
+++ b/arch/m32r/boot/compressed/head.S
@@ -143,6 +143,11 @@ startup:
 	ldi	r0, -2
 	ldi	r1, 0x0100	; invalidate
 	stb	r1, @r0
+#elif defined(CONFIG_CHIP_M32104)
+	/* Cache flush */
+	ldi	r0, -2
+	ldi	r1, 0x0700	; invalidate i-cache, copy back d-cache
+	sth	r1, @r0
 #else
 #error "put your cache flush function, please"
 #endif
diff --git a/arch/m32r/boot/setup.S b/arch/m32r/boot/setup.S
index 5d256434b4ad..398542507d84 100644
--- a/arch/m32r/boot/setup.S
+++ b/arch/m32r/boot/setup.S
@@ -1,11 +1,10 @@
 /*
  * linux/arch/m32r/boot/setup.S -- A setup code.
  *
- * Copyright (C) 2001, 2002  Hiroyuki Kondo, Hirokazu Takata,
- * and Hitoshi Yamamoto
+ * Copyright (C) 2001-2005  Hiroyuki Kondo, Hirokazu Takata,
+ * Hitoshi Yamamoto, Hayato Fujiwara
  *
  */
-/* $Id$ */
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
@@ -80,6 +79,20 @@ ENTRY(boot)
 	ldi	r1, #0x101		; cache on (with invalidation)
 ;	ldi	r1, #0x00		; cache off
 	st	r1, @r0
+#elif defined(CONFIG_CHIP_M32104)
+	ldi	r0, #-96		; DNCR0
+	seth	r1, #0x0060		; from 0x00600000
+	or3	r1, r1, #0x0005		; size 2MB
+	st	r1, @r0
+	seth	r1, #0x0100		; from 0x01000000
+	or3	r1, r1, #0x0003		; size 16MB
+	st	r1, @+r0
+	seth	r1, #0x0200		; from 0x02000000
+	or3	r1, r1, #0x0002		; size 32MB
+	st	r1, @+r0
+	ldi	r0, #-4			;LDIMM	(r0, M32R_MCCR)
+	ldi	r1, #0x703		; cache on (with invalidation)
+	st	r1, @r0
 #else
 #error unknown chip configuration
 #endif
@@ -115,10 +128,15 @@ mmu_on:
 	st	r1, @(MATM_offset,r0)		; Set MATM (T bit ON)
 	ld	r0, @(MATM_offset,r0)		; Check
 #else
+#if defined(CONFIG_CHIP_M32700)
 	seth	r0,#high(M32R_MCDCAR)
 	or3	r0,r0,#low(M32R_MCDCAR)
 	ld24	r1,#0x8080
 	st	r1,@r0
+#elif defined(CONFIG_CHIP_M32104)
+	LDIMM	(r2, eit_vector)		; set EVB(cr5)
+	mvtc	r2, cr5
+#endif
 #endif /* CONFIG_MMU */
 	jmp	r13
 	nop
diff --git a/arch/m32r/kernel/Makefile b/arch/m32r/kernel/Makefile
index 6c6b6c376638..5a2fa886906f 100644
--- a/arch/m32r/kernel/Makefile
+++ b/arch/m32r/kernel/Makefile
@@ -16,5 +16,6 @@ obj-$(CONFIG_PLAT_M32700UT) += setup_m32700ut.o io_m32700ut.o
 obj-$(CONFIG_PLAT_OPSPUT) += setup_opsput.o io_opsput.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_PLAT_OAKS32R) += setup_oaks32r.o io_oaks32r.o
+obj-$(CONFIG_PLAT_M32104UT) += setup_m32104ut.o io_m32104ut.o
 
 EXTRA_AFLAGS	:= -traditional
diff --git a/arch/m32r/kernel/entry.S b/arch/m32r/kernel/entry.S
index 396c94218cc2..3871b65f0c82 100644
--- a/arch/m32r/kernel/entry.S
+++ b/arch/m32r/kernel/entry.S
@@ -315,7 +315,7 @@ ENTRY(ei_handler)
 	mv	r1, sp			; arg1(regs)
 #if defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_M32102) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 
 ;    GET_ICU_STATUS;
 	seth	r0, #shigh(M32R_ICU_ISTS_ADDR)
@@ -541,7 +541,20 @@ check_int2:
 	bra	check_end
 	.fillinsn
 check_end:
-#endif  /* CONFIG_PLAT_OPSPUT */
+#elif defined(CONFIG_PLAT_M32104UT)
+	add3	r2, r0, #-(M32R_IRQ_INT1)	; INT1# interrupt
+	bnez	r2, check_end
+	; read ICU status register of PLD
+	seth	r0, #high(PLD_ICUISTS)
+	or3	r0, r0, #low(PLD_ICUISTS)
+	lduh	r0, @r0
+	slli	r0, #21
+	srli	r0, #27				; ISN
+	addi	r0, #(M32104UT_PLD_IRQ_BASE)
+	bra	check_end
+	.fillinsn
+check_end:
+#endif  /* CONFIG_PLAT_M32104UT */
 	bl	do_IRQ
 #endif  /* CONFIG_SMP */
 	ld	r14, @sp+
@@ -651,8 +664,6 @@ ENTRY(rie_handler)
 /* void rie_handler(int error_code) */
 	SWITCH_TO_KERNEL_STACK
 	SAVE_ALL
-	mvfc	r0, bpc
-	ld	r1, @r0
 	ldi	r1, #0x20			; error_code
 	mv	r0, sp				; pt_regs
 	bl	do_rie_handler
diff --git a/arch/m32r/kernel/io_m32104ut.c b/arch/m32r/kernel/io_m32104ut.c
new file mode 100644
index 000000000000..d26adab9586c
--- /dev/null
+++ b/arch/m32r/kernel/io_m32104ut.c
@@ -0,0 +1,298 @@
+/*
+ * linux/arch/m32r/kernel/io_m32104ut.c
+ *
+ * Typical I/O routines for M32104UT board.
+ *
+ * Copyright (c) 2001-2005  Hiroyuki Kondo, Hirokazu Takata,
+ *                          Hitoshi Yamamoto, Mamoru Sakugawa,
+ *                          Naoto Sugai, Hayato Fujiwara
+ */
+
+#include <linux/config.h>
+#include <asm/m32r.h>
+#include <asm/page.h>
+#include <asm/io.h>
+#include <asm/byteorder.h>
+
+#if defined(CONFIG_PCMCIA) && defined(CONFIG_M32R_CFC)
+#include <linux/types.h>
+
+#define M32R_PCC_IOMAP_SIZE 0x1000
+
+#define M32R_PCC_IOSTART0 0x1000
+#define M32R_PCC_IOEND0 (M32R_PCC_IOSTART0 + M32R_PCC_IOMAP_SIZE - 1)
+
+extern void pcc_ioread_byte(int, unsigned long, void *, size_t, size_t, int);
+extern void pcc_ioread_word(int, unsigned long, void *, size_t, size_t, int);
+extern void pcc_iowrite_byte(int, unsigned long, void *, size_t, size_t, int);
+extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
+#endif /* CONFIG_PCMCIA && CONFIG_M32R_CFC */
+
+#define PORT2ADDR(port) _port2addr(port)
+
+static inline void *_port2addr(unsigned long port)
+{
+	return (void *)(port | NONCACHE_OFFSET);
+}
+
+#if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
+static inline void *__port2addr_ata(unsigned long port)
+{
+	static int dummy_reg;
+
+	switch (port) {
+	case 0x1f0: return (void *)(0x0c002000 | NONCACHE_OFFSET);
+	case 0x1f1: return (void *)(0x0c012800 | NONCACHE_OFFSET);
+	case 0x1f2: return (void *)(0x0c012002 | NONCACHE_OFFSET);
+	case 0x1f3: return (void *)(0x0c012802 | NONCACHE_OFFSET);
+	case 0x1f4: return (void *)(0x0c012004 | NONCACHE_OFFSET);
+	case 0x1f5: return (void *)(0x0c012804 | NONCACHE_OFFSET);
+	case 0x1f6: return (void *)(0x0c012006 | NONCACHE_OFFSET);
+	case 0x1f7: return (void *)(0x0c012806 | NONCACHE_OFFSET);
+	case 0x3f6: return (void *)(0x0c01200e | NONCACHE_OFFSET);
+	default: return (void *)&dummy_reg;
+	}
+}
+#endif
+
+/*
+ * M32104T-LAN is located in the extended bus space
+ * from 0x01000000 to 0x01ffffff on physical address.
+ * The base address of LAN controller(LAN91C111) is 0x300.
+ */
+#define LAN_IOSTART (0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND (0x320 | NONCACHE_OFFSET)
+static inline void *_port2addr_ne(unsigned long port)
+{
+	return (void *)(port + NONCACHE_OFFSET + 0x01000000);
+}
+
+static inline void delay(void)
+{
+	__asm__ __volatile__ ("push r0; \n\t pop r0;" : : :"memory");
+}
+
+/*
+ * NIC I/O function
+ */
+
+#define PORT2ADDR_NE(port) _port2addr_ne(port)
+
+static inline unsigned char _ne_inb(void *portp)
+{
+	return *(volatile unsigned char *)portp;
+}
+
+static inline unsigned short _ne_inw(void *portp)
+{
+	return (unsigned short)le16_to_cpu(*(volatile unsigned short *)portp);
+}
+
+static inline void _ne_insb(void *portp, void *addr, unsigned long count)
+{
+	unsigned char *buf = (unsigned char *)addr;
+
+	while (count--)
+		*buf++ = _ne_inb(portp);
+}
+
+static inline void _ne_outb(unsigned char b, void *portp)
+{
+	*(volatile unsigned char *)portp = b;
+}
+
+static inline void _ne_outw(unsigned short w, void *portp)
+{
+	*(volatile unsigned short *)portp = cpu_to_le16(w);
+}
+
+unsigned char _inb(unsigned long port)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		return _ne_inb(PORT2ADDR_NE(port));
+
+	return *(volatile unsigned char *)PORT2ADDR(port);
+}
+
+unsigned short _inw(unsigned long port)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		return _ne_inw(PORT2ADDR_NE(port));
+
+	return *(volatile unsigned short *)PORT2ADDR(port);
+}
+
+unsigned long _inl(unsigned long port)
+{
+	return *(volatile unsigned long *)PORT2ADDR(port);
+}
+
+unsigned char _inb_p(unsigned long port)
+{
+	unsigned char v = _inb(port);
+	delay();
+	return (v);
+}
+
+unsigned short _inw_p(unsigned long port)
+{
+	unsigned short v = _inw(port);
+	delay();
+	return (v);
+}
+
+unsigned long _inl_p(unsigned long port)
+{
+	unsigned long v = _inl(port);
+	delay();
+	return (v);
+}
+
+void _outb(unsigned char b, unsigned long port)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		_ne_outb(b, PORT2ADDR_NE(port));
+	else
+		*(volatile unsigned char *)PORT2ADDR(port) = b;
+}
+
+void _outw(unsigned short w, unsigned long port)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		_ne_outw(w, PORT2ADDR_NE(port));
+	else
+		*(volatile unsigned short *)PORT2ADDR(port) = w;
+}
+
+void _outl(unsigned long l, unsigned long port)
+{
+	*(volatile unsigned long *)PORT2ADDR(port) = l;
+}
+
+void _outb_p(unsigned char b, unsigned long port)
+{
+	_outb(b, port);
+	delay();
+}
+
+void _outw_p(unsigned short w, unsigned long port)
+{
+	_outw(w, port);
+	delay();
+}
+
+void _outl_p(unsigned long l, unsigned long port)
+{
+	_outl(l, port);
+	delay();
+}
+
+void _insb(unsigned int port, void *addr, unsigned long count)
+{
+	if (port >= LAN_IOSTART && port < LAN_IOEND)
+		_ne_insb(PORT2ADDR_NE(port), addr, count);
+	else {
+		unsigned char *buf = addr;
+		unsigned char *portp = PORT2ADDR(port);
+		while (count--)
+			*buf++ = *(volatile unsigned char *)portp;
+	}
+}
+
+void _insw(unsigned int port, void *addr, unsigned long count)
+{
+	unsigned short *buf = addr;
+	unsigned short *portp;
+
+	if (port >= LAN_IOSTART && port < LAN_IOEND) {
+		/*
+		 * This portion is only used by smc91111.c to read data
+		 * from the DATA_REG. Do not swap the data.
+		 */
+		portp = PORT2ADDR_NE(port);
+		while (count--)
+			*buf++ = *(volatile unsigned short *)portp;
+#if defined(CONFIG_PCMCIA) && defined(CONFIG_M32R_CFC)
+	} else if (port >= M32R_PCC_IOSTART0 && port <= M32R_PCC_IOEND0) {
+		pcc_ioread_word(9, port, (void *)addr, sizeof(unsigned short),
+				count, 1);
+#endif
+#if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
+	} else if ((port >= 0x1f0 && port <=0x1f7) || port == 0x3f6) {
+		portp = __port2addr_ata(port);
+		while (count--)
+			*buf++ = *(volatile unsigned short *)portp;
+#endif
+	} else {
+		portp = PORT2ADDR(port);
+		while (count--)
+			*buf++ = *(volatile unsigned short *)portp;
+	}
+}
+
+void _insl(unsigned int port, void *addr, unsigned long count)
+{
+	unsigned long *buf = addr;
+	unsigned long *portp;
+
+	portp = PORT2ADDR(port);
+	while (count--)
+		*buf++ = *(volatile unsigned long *)portp;
+}
+
+void _outsb(unsigned int port, const void *addr, unsigned long count)
+{
+	const unsigned char *buf = addr;
+	unsigned char *portp;
+
+	if (port >= LAN_IOSTART && port < LAN_IOEND) {
+		portp = PORT2ADDR_NE(port);
+		while (count--)
+			_ne_outb(*buf++, portp);
+	} else {
+		portp = PORT2ADDR(port);
+		while (count--)
+			*(volatile unsigned char *)portp = *buf++;
+	}
+}
+
+void _outsw(unsigned int port, const void *addr, unsigned long count)
+{
+	const unsigned short *buf = addr;
+	unsigned short *portp;
+
+	if (port >= LAN_IOSTART && port < LAN_IOEND) {
+		/*
+		 * This portion is only used by smc91111.c to write data
+		 * into the DATA_REG. Do not swap the data.
+		 */
+		portp = PORT2ADDR_NE(port);
+		while (count--)
+			*(volatile unsigned short *)portp = *buf++;
+#if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
+	} else if ((port >= 0x1f0 && port <=0x1f7) || port == 0x3f6) {
+		portp = __port2addr_ata(port);
+		while (count--)
+			*(volatile unsigned short *)portp = *buf++;
+#endif
+#if defined(CONFIG_PCMCIA) && defined(CONFIG_M32R_CFC)
+	} else if (port >= M32R_PCC_IOSTART0 && port <= M32R_PCC_IOEND0) {
+		pcc_iowrite_word(9, port, (void *)addr, sizeof(unsigned short),
+				count, 1);
+#endif
+	} else {
+		portp = PORT2ADDR(port);
+		while (count--)
+			*(volatile unsigned short *)portp = *buf++;
+	}
+}
+
+void _outsl(unsigned int port, const void *addr, unsigned long count)
+{
+	const unsigned long *buf = addr;
+	unsigned char *portp;
+
+	portp = PORT2ADDR(port);
+	while (count--)
+		*(volatile unsigned long *)portp = *buf++;
+}
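Taken together, io_m32104ut.c implements x86-style port I/O by address translation: ports falling in the LAN window are redirected into the extended bus space, everything else is OR-ed with the noncache offset. A stand-alone sketch of that dispatch; NONCACHE_OFFSET is given an assumed value here purely for illustration (the real one comes from <asm/m32r.h>):

#include <stdio.h>

#define NONCACHE_OFFSET	0xa0000000UL	/* assumed value, for illustration */
#define LAN_IOSTART	(0x300UL | NONCACHE_OFFSET)
#define LAN_IOEND	(0x320UL | NONCACHE_OFFSET)

static unsigned long port2addr(unsigned long port)
{
	if (port >= LAN_IOSTART && port < LAN_IOEND)
		/* LAN91C111 window: extended bus space at +0x01000000 */
		return port + NONCACHE_OFFSET + 0x01000000UL;
	return port | NONCACHE_OFFSET;	/* uncached alias of the port */
}

int main(void)
{
	printf("LAN reg:  0x%lx\n", port2addr(LAN_IOSTART));
	printf("IDE data: 0x%lx\n", port2addr(0x1f0));
	return 0;
}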
diff --git a/arch/m32r/kernel/io_m32700ut.c b/arch/m32r/kernel/io_m32700ut.c
index eda9f963c1eb..939932d6cc00 100644
--- a/arch/m32r/kernel/io_m32700ut.c
+++ b/arch/m32r/kernel/io_m32700ut.c
@@ -36,7 +36,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port + NONCACHE_OFFSET);
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 #if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
@@ -45,15 +45,15 @@ static inline void *__port2addr_ata(unsigned long port)
 	static int dummy_reg;
 
 	switch (port) {
-	case 0x1f0: return (void *)0xac002000;
-	case 0x1f1: return (void *)0xac012800;
-	case 0x1f2: return (void *)0xac012002;
-	case 0x1f3: return (void *)0xac012802;
-	case 0x1f4: return (void *)0xac012004;
-	case 0x1f5: return (void *)0xac012804;
-	case 0x1f6: return (void *)0xac012006;
-	case 0x1f7: return (void *)0xac012806;
-	case 0x3f6: return (void *)0xac01200e;
+	case 0x1f0: return (void *)(0x0c002000 | NONCACHE_OFFSET);
+	case 0x1f1: return (void *)(0x0c012800 | NONCACHE_OFFSET);
+	case 0x1f2: return (void *)(0x0c012002 | NONCACHE_OFFSET);
+	case 0x1f3: return (void *)(0x0c012802 | NONCACHE_OFFSET);
+	case 0x1f4: return (void *)(0x0c012004 | NONCACHE_OFFSET);
+	case 0x1f5: return (void *)(0x0c012804 | NONCACHE_OFFSET);
+	case 0x1f6: return (void *)(0x0c012006 | NONCACHE_OFFSET);
+	case 0x1f7: return (void *)(0x0c012806 | NONCACHE_OFFSET);
+	case 0x3f6: return (void *)(0x0c01200e | NONCACHE_OFFSET);
 	default: return (void *)&dummy_reg;
 	}
 }
@@ -64,8 +64,8 @@ static inline void *__port2addr_ata(unsigned long port)
  * from 0x10000000 to 0x13ffffff on physical address.
  * The base address of LAN controller(LAN91C111) is 0x300.
  */
-#define LAN_IOSTART 0xa0000300
-#define LAN_IOEND 0xa0000320
+#define LAN_IOSTART (0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND (0x320 | NONCACHE_OFFSET)
 static inline void *_port2addr_ne(unsigned long port)
 {
 	return (void *)(port + 0x10000000);
diff --git a/arch/m32r/kernel/io_mappi.c b/arch/m32r/kernel/io_mappi.c
index 3c3da042fbd1..a662b537c5ba 100644
--- a/arch/m32r/kernel/io_mappi.c
+++ b/arch/m32r/kernel/io_mappi.c
@@ -31,7 +31,7 @@ extern void pcc_iowrite(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port | (NONCACHE_OFFSET));
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 static inline void *_port2addr_ne(unsigned long port)
diff --git a/arch/m32r/kernel/io_mappi2.c b/arch/m32r/kernel/io_mappi2.c
index df3c729cb3e0..e72d725606af 100644
--- a/arch/m32r/kernel/io_mappi2.c
+++ b/arch/m32r/kernel/io_mappi2.c
@@ -33,7 +33,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port | (NONCACHE_OFFSET));
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 #if defined(CONFIG_IDE) && !defined(CONFIG_M32R_CFC)
@@ -42,22 +42,22 @@ static inline void *__port2addr_ata(unsigned long port)
 	static int dummy_reg;
 
 	switch (port) {
-	case 0x1f0: return (void *)0xac002000;
-	case 0x1f1: return (void *)0xac012800;
-	case 0x1f2: return (void *)0xac012002;
-	case 0x1f3: return (void *)0xac012802;
-	case 0x1f4: return (void *)0xac012004;
-	case 0x1f5: return (void *)0xac012804;
-	case 0x1f6: return (void *)0xac012006;
-	case 0x1f7: return (void *)0xac012806;
-	case 0x3f6: return (void *)0xac01200e;
+	case 0x1f0: return (void *)(0x0c002000 | NONCACHE_OFFSET);
+	case 0x1f1: return (void *)(0x0c012800 | NONCACHE_OFFSET);
+	case 0x1f2: return (void *)(0x0c012002 | NONCACHE_OFFSET);
+	case 0x1f3: return (void *)(0x0c012802 | NONCACHE_OFFSET);
+	case 0x1f4: return (void *)(0x0c012004 | NONCACHE_OFFSET);
+	case 0x1f5: return (void *)(0x0c012804 | NONCACHE_OFFSET);
+	case 0x1f6: return (void *)(0x0c012006 | NONCACHE_OFFSET);
+	case 0x1f7: return (void *)(0x0c012806 | NONCACHE_OFFSET);
+	case 0x3f6: return (void *)(0x0c01200e | NONCACHE_OFFSET);
 	default: return (void *)&dummy_reg;
 	}
 }
 #endif
 
-#define LAN_IOSTART 0xa0000300
-#define LAN_IOEND 0xa0000320
+#define LAN_IOSTART (0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND (0x320 | NONCACHE_OFFSET)
 #ifdef CONFIG_CHIP_OPSP
 static inline void *_port2addr_ne(unsigned long port)
 {
diff --git a/arch/m32r/kernel/io_mappi3.c b/arch/m32r/kernel/io_mappi3.c
index f80321a58764..ed6da930bc64 100644
--- a/arch/m32r/kernel/io_mappi3.c
+++ b/arch/m32r/kernel/io_mappi3.c
@@ -33,7 +33,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port + NONCACHE_OFFSET);
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 #if defined(CONFIG_IDE)
@@ -43,33 +43,42 @@ static inline void *__port2addr_ata(unsigned long port)
 
 	switch (port) {
 	/* IDE0 CF */
-	case 0x1f0: return (void *)0xb4002000;
-	case 0x1f1: return (void *)0xb4012800;
-	case 0x1f2: return (void *)0xb4012002;
-	case 0x1f3: return (void *)0xb4012802;
-	case 0x1f4: return (void *)0xb4012004;
-	case 0x1f5: return (void *)0xb4012804;
-	case 0x1f6: return (void *)0xb4012006;
-	case 0x1f7: return (void *)0xb4012806;
-	case 0x3f6: return (void *)0xb401200e;
+	case 0x1f0: return (void *)(0x14002000 | NONCACHE_OFFSET);
+	case 0x1f1: return (void *)(0x14012800 | NONCACHE_OFFSET);
+	case 0x1f2: return (void *)(0x14012002 | NONCACHE_OFFSET);
+	case 0x1f3: return (void *)(0x14012802 | NONCACHE_OFFSET);
+	case 0x1f4: return (void *)(0x14012004 | NONCACHE_OFFSET);
+	case 0x1f5: return (void *)(0x14012804 | NONCACHE_OFFSET);
+	case 0x1f6: return (void *)(0x14012006 | NONCACHE_OFFSET);
+	case 0x1f7: return (void *)(0x14012806 | NONCACHE_OFFSET);
+	case 0x3f6: return (void *)(0x1401200e | NONCACHE_OFFSET);
 	/* IDE1 IDE */
-	case 0x170: return (void *)0xb4810000;	/* Data 16bit */
-	case 0x171: return (void *)0xb4810002;	/* Features / Error */
-	case 0x172: return (void *)0xb4810004;	/* Sector count */
-	case 0x173: return (void *)0xb4810006;	/* Sector number */
-	case 0x174: return (void *)0xb4810008;	/* Cylinder low */
-	case 0x175: return (void *)0xb481000a;	/* Cylinder high */
-	case 0x176: return (void *)0xb481000c;	/* Device head */
-	case 0x177: return (void *)0xb481000e;	/* Command */
-	case 0x376: return (void *)0xb480800c;	/* Device control / Alt status */
+	case 0x170: /* Data 16bit */
+		return (void *)(0x14810000 | NONCACHE_OFFSET);
+	case 0x171: /* Features / Error */
+		return (void *)(0x14810002 | NONCACHE_OFFSET);
+	case 0x172: /* Sector count */
+		return (void *)(0x14810004 | NONCACHE_OFFSET);
+	case 0x173: /* Sector number */
+		return (void *)(0x14810006 | NONCACHE_OFFSET);
+	case 0x174: /* Cylinder low */
+		return (void *)(0x14810008 | NONCACHE_OFFSET);
+	case 0x175: /* Cylinder high */
+		return (void *)(0x1481000a | NONCACHE_OFFSET);
+	case 0x176: /* Device head */
+		return (void *)(0x1481000c | NONCACHE_OFFSET);
+	case 0x177: /* Command */
+		return (void *)(0x1481000e | NONCACHE_OFFSET);
+	case 0x376: /* Device control / Alt status */
+		return (void *)(0x1480800c | NONCACHE_OFFSET);
 
 	default: return (void *)&dummy_reg;
 	}
 }
 #endif
 
-#define LAN_IOSTART 0xa0000300
-#define LAN_IOEND 0xa0000320
+#define LAN_IOSTART (0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND (0x320 | NONCACHE_OFFSET)
 static inline void *_port2addr_ne(unsigned long port)
 {
 	return (void *)(port + 0x10000000);
diff --git a/arch/m32r/kernel/io_oaks32r.c b/arch/m32r/kernel/io_oaks32r.c
index 8be323931e4a..910dd131c227 100644
--- a/arch/m32r/kernel/io_oaks32r.c
+++ b/arch/m32r/kernel/io_oaks32r.c
@@ -16,7 +16,7 @@
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port | (NONCACHE_OFFSET));
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 static inline void *_port2addr_ne(unsigned long port)
diff --git a/arch/m32r/kernel/io_opsput.c b/arch/m32r/kernel/io_opsput.c
index 4793bd18e115..bec69297db3c 100644
--- a/arch/m32r/kernel/io_opsput.c
+++ b/arch/m32r/kernel/io_opsput.c
@@ -36,7 +36,7 @@ extern void pcc_iowrite_word(int, unsigned long, void *, size_t, size_t, int);
 
 static inline void *_port2addr(unsigned long port)
 {
-	return (void *)(port | (NONCACHE_OFFSET));
+	return (void *)(port | NONCACHE_OFFSET);
 }
 
 /*
@@ -44,8 +44,8 @@ static inline void *_port2addr(unsigned long port)
  * from 0x10000000 to 0x13ffffff on physical address.
  * The base address of LAN controller(LAN91C111) is 0x300.
  */
-#define LAN_IOSTART 0xa0000300
-#define LAN_IOEND 0xa0000320
+#define LAN_IOSTART (0x300 | NONCACHE_OFFSET)
+#define LAN_IOEND (0x320 | NONCACHE_OFFSET)
 static inline void *_port2addr_ne(unsigned long port)
 {
 	return (void *)(port + 0x10000000);
diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c
index f722ec8eb021..c2e4dccf0112 100644
--- a/arch/m32r/kernel/setup.c
+++ b/arch/m32r/kernel/setup.c
@@ -320,6 +320,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 #elif defined(CONFIG_CHIP_MP)
 	seq_printf(m, "cpu family\t: M32R-MP\n"
 		"cache size\t: I-xxKB/D-xxKB\n");
+#elif defined(CONFIG_CHIP_M32104)
+	seq_printf(m,"cpu family\t: M32104\n"
+		"cache size\t: I-8KB/D-8KB\n");
 #else
 	seq_printf(m, "cpu family\t: Unknown\n");
 #endif
@@ -340,6 +343,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	seq_printf(m, "Machine\t\t: uServer\n");
 #elif defined(CONFIG_PLAT_OAKS32R)
 	seq_printf(m, "Machine\t\t: OAKS32R\n");
+#elif defined(CONFIG_PLAT_M32104UT)
+	seq_printf(m, "Machine\t\t: M3T-M32104UT uT Engine board\n");
 #else
 	seq_printf(m, "Machine\t\t: Unknown\n");
 #endif
@@ -389,7 +394,7 @@ unsigned long cpu_initialized __initdata = 0;
  */
 #if defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_M32102) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 void __init cpu_init (void)
 {
 	int cpu_id = smp_processor_id();
diff --git a/arch/m32r/kernel/setup_m32104ut.c b/arch/m32r/kernel/setup_m32104ut.c
new file mode 100644
index 000000000000..6328e1357a80
--- /dev/null
+++ b/arch/m32r/kernel/setup_m32104ut.c
@@ -0,0 +1,156 @@
+/*
+ * linux/arch/m32r/kernel/setup_m32104ut.c
+ *
+ * Setup routines for M32104UT Board
+ *
+ * Copyright (c) 2002-2005  Hiroyuki Kondo, Hirokazu Takata,
+ *                          Hitoshi Yamamoto, Mamoru Sakugawa,
+ *                          Naoto Sugai, Hayato Fujiwara
+ */
+
+#include <linux/config.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/device.h>
+
+#include <asm/system.h>
+#include <asm/m32r.h>
+#include <asm/io.h>
+
+#define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
+
+icu_data_t icu_data[NR_IRQS];
+
+static void disable_m32104ut_irq(unsigned int irq)
+{
+	unsigned long port, data;
+
+	port = irq2port(irq);
+	data = icu_data[irq].icucr|M32R_ICUCR_ILEVEL7;
+	outl(data, port);
+}
+
+static void enable_m32104ut_irq(unsigned int irq)
+{
+	unsigned long port, data;
+
+	port = irq2port(irq);
+	data = icu_data[irq].icucr|M32R_ICUCR_IEN|M32R_ICUCR_ILEVEL6;
+	outl(data, port);
+}
+
+static void mask_and_ack_m32104ut(unsigned int irq)
+{
+	disable_m32104ut_irq(irq);
+}
+
+static void end_m32104ut_irq(unsigned int irq)
+{
+	enable_m32104ut_irq(irq);
+}
+
+static unsigned int startup_m32104ut_irq(unsigned int irq)
+{
+	enable_m32104ut_irq(irq);
+	return (0);
+}
+
+static void shutdown_m32104ut_irq(unsigned int irq)
+{
+	unsigned long port;
+
+	port = irq2port(irq);
+	outl(M32R_ICUCR_ILEVEL7, port);
+}
+
+static struct hw_interrupt_type m32104ut_irq_type =
+{
+	.typename = "M32104UT-IRQ",
+	.startup = startup_m32104ut_irq,
+	.shutdown = shutdown_m32104ut_irq,
+	.enable = enable_m32104ut_irq,
+	.disable = disable_m32104ut_irq,
+	.ack = mask_and_ack_m32104ut,
+	.end = end_m32104ut_irq
+};
+
+void __init init_IRQ(void)
+{
+	static int once = 0;
+
+	if (once)
+		return;
+	else
+		once++;
+
+#if defined(CONFIG_SMC91X)
+	/* INT#0: LAN controller on M32104UT-LAN (SMC91C111)*/
+	irq_desc[M32R_IRQ_INT0].status = IRQ_DISABLED;
+	irq_desc[M32R_IRQ_INT0].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_INT0].action = 0;
+	irq_desc[M32R_IRQ_INT0].depth = 1;
+	icu_data[M32R_IRQ_INT0].icucr = M32R_ICUCR_IEN | M32R_ICUCR_ISMOD11; /* "H" level sense */
+	disable_m32104ut_irq(M32R_IRQ_INT0);
+#endif /* CONFIG_SMC91X */
+
+	/* MFT2 : system timer */
+	irq_desc[M32R_IRQ_MFT2].status = IRQ_DISABLED;
+	irq_desc[M32R_IRQ_MFT2].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_MFT2].action = 0;
+	irq_desc[M32R_IRQ_MFT2].depth = 1;
+	icu_data[M32R_IRQ_MFT2].icucr = M32R_ICUCR_IEN;
+	disable_m32104ut_irq(M32R_IRQ_MFT2);
+
+#ifdef CONFIG_SERIAL_M32R_SIO
+	/* SIO0_R : uart receive data */
+	irq_desc[M32R_IRQ_SIO0_R].status = IRQ_DISABLED;
+	irq_desc[M32R_IRQ_SIO0_R].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_SIO0_R].action = 0;
+	irq_desc[M32R_IRQ_SIO0_R].depth = 1;
+	icu_data[M32R_IRQ_SIO0_R].icucr = M32R_ICUCR_IEN;
+	disable_m32104ut_irq(M32R_IRQ_SIO0_R);
+
+	/* SIO0_S : uart send data */
+	irq_desc[M32R_IRQ_SIO0_S].status = IRQ_DISABLED;
+	irq_desc[M32R_IRQ_SIO0_S].handler = &m32104ut_irq_type;
+	irq_desc[M32R_IRQ_SIO0_S].action = 0;
+	irq_desc[M32R_IRQ_SIO0_S].depth = 1;
+	icu_data[M32R_IRQ_SIO0_S].icucr = M32R_ICUCR_IEN;
+	disable_m32104ut_irq(M32R_IRQ_SIO0_S);
+#endif /* CONFIG_SERIAL_M32R_SIO */
+}
+
+#if defined(CONFIG_SMC91X)
+
+#define LAN_IOSTART 0x300
+#define LAN_IOEND 0x320
+static struct resource smc91x_resources[] = {
+	[0] = {
+		.start = (LAN_IOSTART),
+		.end = (LAN_IOEND),
+		.flags = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start = M32R_IRQ_INT0,
+		.end = M32R_IRQ_INT0,
+		.flags = IORESOURCE_IRQ,
+	}
+};
+
+static struct platform_device smc91x_device = {
+	.name = "smc91x",
+	.id = 0,
+	.num_resources = ARRAY_SIZE(smc91x_resources),
+	.resource = smc91x_resources,
+};
+#endif
+
+static int __init platform_init(void)
+{
+#if defined(CONFIG_SMC91X)
+	platform_device_register(&smc91x_device);
+#endif
+	return 0;
+}
+arch_initcall(platform_init);
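The irq2port() macro used throughout these board files maps IRQ n (n >= 1) to the (n-1)'th consecutive 32-bit ICU control word. A quick sketch of the arithmetic with a placeholder base address (the real M32R_ICU_CR1_PORTL comes from <asm/m32r.h>):

#include <stdio.h>

#define M32R_ICU_CR1_PORTL	0x00a40200UL	/* placeholder, not the real base */

static unsigned long irq2port(unsigned int irq)
{
	return M32R_ICU_CR1_PORTL + (irq - 1) * sizeof(unsigned long);
}

int main(void)
{
	/* IRQ 1 owns the first word, IRQ 16 sits 60 bytes further up */
	printf("0x%lx 0x%lx\n", irq2port(1), irq2port(16));
	return 0;
}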
diff --git a/arch/m32r/kernel/setup_m32700ut.c b/arch/m32r/kernel/setup_m32700ut.c
index cb76916b014d..fad1fc99bb27 100644
--- a/arch/m32r/kernel/setup_m32700ut.c
+++ b/arch/m32r/kernel/setup_m32700ut.c
@@ -26,15 +26,7 @@
  */
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;	/* ICU Control Register */
-} icu_data_t;
-static icu_data_t icu_data[M32700UT_NUM_CPU_IRQ];
-#else
 icu_data_t icu_data[M32700UT_NUM_CPU_IRQ];
-#endif /* CONFIG_SMP */
-
 
 static void disable_m32700ut_irq(unsigned int irq)
 {
diff --git a/arch/m32r/kernel/setup_mappi.c b/arch/m32r/kernel/setup_mappi.c
index 501d798cf050..00f253209cb3 100644
--- a/arch/m32r/kernel/setup_mappi.c
+++ b/arch/m32r/kernel/setup_mappi.c
@@ -19,12 +19,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;	/* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_mappi_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_mappi2.c b/arch/m32r/kernel/setup_mappi2.c
index 7f2db5bfd626..eebc9d8b4e72 100644
--- a/arch/m32r/kernel/setup_mappi2.c
+++ b/arch/m32r/kernel/setup_mappi2.c
@@ -19,12 +19,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;	/* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_mappi2_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_mappi3.c b/arch/m32r/kernel/setup_mappi3.c
index f6ecdf7f555c..d2ff021e2d3d 100644
--- a/arch/m32r/kernel/setup_mappi3.c
+++ b/arch/m32r/kernel/setup_mappi3.c
@@ -19,12 +19,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;	/* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_mappi3_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_oaks32r.c b/arch/m32r/kernel/setup_oaks32r.c
index 45add5b76f19..0e9e63538c0f 100644
--- a/arch/m32r/kernel/setup_oaks32r.c
+++ b/arch/m32r/kernel/setup_oaks32r.c
@@ -18,12 +18,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;	/* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[NR_IRQS];
 
 static void disable_oaks32r_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/setup_opsput.c b/arch/m32r/kernel/setup_opsput.c
index 1fbb140854e7..548e8fc7949b 100644
--- a/arch/m32r/kernel/setup_opsput.c
+++ b/arch/m32r/kernel/setup_opsput.c
@@ -27,15 +27,7 @@
  */
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#ifndef CONFIG_SMP
-typedef struct {
-	unsigned long icucr;	/* ICU Control Register */
-} icu_data_t;
-static icu_data_t icu_data[OPSPUT_NUM_CPU_IRQ];
-#else
 icu_data_t icu_data[OPSPUT_NUM_CPU_IRQ];
-#endif /* CONFIG_SMP */
-
 
 static void disable_opsput_irq(unsigned int irq)
 {
diff --git a/arch/m32r/kernel/setup_usrv.c b/arch/m32r/kernel/setup_usrv.c
index 634741bf9d35..64be659a23e7 100644
--- a/arch/m32r/kernel/setup_usrv.c
+++ b/arch/m32r/kernel/setup_usrv.c
@@ -18,12 +18,6 @@
 
 #define irq2port(x) (M32R_ICU_CR1_PORTL + ((x - 1) * sizeof(unsigned long)))
 
-#if !defined(CONFIG_SMP)
-typedef struct {
-	unsigned long icucr;	/* ICU Control Register */
-} icu_data_t;
-#endif /* CONFIG_SMP */
-
 icu_data_t icu_data[M32700UT_NUM_CPU_IRQ];
 
 static void disable_mappi_irq(unsigned int irq)
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index 2ebce2063fea..b8e68b542302 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -57,7 +57,7 @@ static unsigned long do_gettimeoffset(void)
 
 #if defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_M32700) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 #ifndef CONFIG_SMP
 
 	unsigned long count;
@@ -268,7 +268,7 @@ void __init time_init(void)
 
 #if defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_M32700) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 
 	/* M32102 MFT setup */
 	setup_irq(M32R_IRQ_MFT2, &irq0);
diff --git a/arch/m32r/m32104ut/defconfig.m32104ut b/arch/m32r/m32104ut/defconfig.m32104ut
new file mode 100644
index 000000000000..454de336803a
--- /dev/null
+++ b/arch/m32r/m32104ut/defconfig.m32104ut
@@ -0,0 +1,657 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.14
+# Wed Nov 9 16:04:51 2005
+#
+CONFIG_M32R=y
+# CONFIG_UID16 is not set
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_IRQ_PROBE=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION=""
+CONFIG_LOCALVERSION_AUTO=y
+# CONFIG_POSIX_MQUEUE is not set
+# CONFIG_BSD_PROCESS_ACCT is not set
+CONFIG_SYSCTL=y
+# CONFIG_AUDIT is not set
+CONFIG_HOTPLUG=y
+# CONFIG_KOBJECT_UEVENT is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_EMBEDDED=y
+# CONFIG_KALLSYMS is not set
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_BASE_FULL=y
+# CONFIG_FUTEX is not set
+# CONFIG_EPOLL is not set
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+CONFIG_CC_ALIGN_FUNCTIONS=0
+CONFIG_CC_ALIGN_LABELS=0
+CONFIG_CC_ALIGN_LOOPS=0
+CONFIG_CC_ALIGN_JUMPS=0
+CONFIG_TINY_SHMEM=y
+CONFIG_BASE_SMALL=0
+
+#
+# Loadable module support
+#
+# CONFIG_MODULES is not set
+
+#
+# Processor type and features
+#
+# CONFIG_PLAT_MAPPI is not set
+# CONFIG_PLAT_USRV is not set
+# CONFIG_PLAT_M32700UT is not set
+# CONFIG_PLAT_OPSPUT is not set
+# CONFIG_PLAT_OAKS32R is not set
+# CONFIG_PLAT_MAPPI2 is not set
+# CONFIG_PLAT_MAPPI3 is not set
+CONFIG_PLAT_M32104UT=y
+# CONFIG_CHIP_M32700 is not set
+# CONFIG_CHIP_M32102 is not set
+CONFIG_CHIP_M32104=y
+# CONFIG_CHIP_VDEC2 is not set
+# CONFIG_CHIP_OPSP is not set
+CONFIG_ISA_M32R=y
+CONFIG_BUS_CLOCK=54000000
+CONFIG_TIMER_DIVIDE=128
+# CONFIG_CPU_LITTLE_ENDIAN is not set
+CONFIG_MEMORY_START=04000000
+CONFIG_MEMORY_SIZE=01000000
+CONFIG_NOHIGHMEM=y
+# CONFIG_ARCH_DISCONTIGMEM_ENABLE is not set
+CONFIG_SELECT_MEMORY_MODEL=y
+CONFIG_FLATMEM_MANUAL=y
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+# CONFIG_SPARSEMEM_MANUAL is not set
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
+# CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+# CONFIG_PREEMPT is not set
+# CONFIG_SMP is not set
+
+#
+# Bus options (PCI, PCMCIA, EISA, MCA, ISA)
+#
+# CONFIG_ISA is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+CONFIG_PCCARD=y
+# CONFIG_PCMCIA_DEBUG is not set
+CONFIG_PCMCIA=y
+CONFIG_PCMCIA_LOAD_CIS=y
+CONFIG_PCMCIA_IOCTL=y
+
+#
+# PC-card bridges
+#
+
+#
+# PCI Hotplug Support
+#
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_FLAT=y
+# CONFIG_BINFMT_ZFLAT is not set
+# CONFIG_BINFMT_SHARED_FLAT is not set
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Networking
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+# CONFIG_PACKET is not set
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+# CONFIG_IP_ADVANCED_ROUTER is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+# CONFIG_IP_PNP_BOOTP is not set
+# CONFIG_IP_PNP_RARP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_TUNNEL is not set
+CONFIG_INET_DIAG=y
+CONFIG_INET_TCP_DIAG=y
+# CONFIG_TCP_CONG_ADVANCED is not set
+CONFIG_TCP_CONG_BIC=y
+# CONFIG_IPV6 is not set
+# CONFIG_NETFILTER is not set
+
+#
+# DCCP Configuration (EXPERIMENTAL)
+#
+# CONFIG_IP_DCCP is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+# CONFIG_IP_SCTP is not set
+# CONFIG_ATM is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_DECNET is not set
+# CONFIG_LLC2 is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_NET_SCHED is not set
+# CONFIG_NET_CLS_ROUTE is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+# CONFIG_HAMRADIO is not set
+# CONFIG_IRDA is not set
+# CONFIG_BT is not set
+# CONFIG_IEEE80211 is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=y
+# CONFIG_DEBUG_DRIVER is not set
+
+#
+# Connector - unified userspace <-> kernelspace linker
+#
+# CONFIG_CONNECTOR is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+# CONFIG_PARPORT is not set
+
+#
+# Plug and Play support
+#
+
+#
+# Block devices
+#
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=y
+# CONFIG_BLK_DEV_CRYPTOLOOP is not set
+CONFIG_BLK_DEV_NBD=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_CDROM_PKTCDVD is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+# CONFIG_IOSCHED_AS is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_IOSCHED_CFQ is not set
+# CONFIG_ATA_OVER_ETH is not set
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+# CONFIG_MD is not set
+
+#
+# Fusion MPT device support
+#
+# CONFIG_FUSION is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+
+#
+# I2O device support
+#
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+CONFIG_DUMMY=y
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_TUN is not set
+
+#
+# PHY device support
+#
+# CONFIG_PHYLIB is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+CONFIG_SMC91X=y
+# CONFIG_NE2000 is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+
+#
+# Ethernet (10000 Mbit)
+#
+
+#
+# Token Ring devices
+#
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# PCMCIA network device support
+#
+# CONFIG_NET_PCMCIA is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+# CONFIG_SHAPER is not set
+# CONFIG_NETCONSOLE is not set
+# CONFIG_NETPOLL is not set
+# CONFIG_NET_POLL_CONTROLLER is not set
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+# CONFIG_INPUT is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+# CONFIG_VT is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_SERIAL_M32R_SIO=y
+CONFIG_SERIAL_M32R_SIO_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=y
+# CONFIG_RTC is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+
+#
+# PCMCIA character devices
+#
+# CONFIG_SYNCLINK_CS is not set
+# CONFIG_RAW_DRIVER is not set
+
+#
+# TPM devices
+#
+
+#
+# I2C support
+#
+# CONFIG_I2C is not set
+
+#
+# Dallas's 1-wire bus
+#
+# CONFIG_W1 is not set
+
+#
+# Hardware Monitoring support
+#
+# CONFIG_HWMON is not set
+# CONFIG_HWMON_VID is not set
+
+#
+# Misc devices
+#
+
+#
+# Multimedia Capabilities Port drivers
+#
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+# CONFIG_FB is not set
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+
+#
+# USB support
+#
+# CONFIG_USB_ARCH_HAS_HCD is not set
+# CONFIG_USB_ARCH_HAS_OHCI is not set
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+# CONFIG_MMC is not set
+
+#
+# InfiniBand support
+#
+
+#
+# SN Devices
+#
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+# CONFIG_EXT2_FS_XATTR is not set
+# CONFIG_EXT2_FS_XIP is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+# CONFIG_EXT3_FS_SECURITY is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+CONFIG_FS_POSIX_ACL=y
+# CONFIG_XFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_ROMFS_FS is not set
+# CONFIG_INOTIFY is not set
+# CONFIG_QUOTA is not set
+CONFIG_DNOTIFY=y
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=932
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+# CONFIG_HUGETLB_PAGE is not set
+CONFIG_RAMFS=y
+# CONFIG_RELAYFS_FS is not set
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=y
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V3_ACL is not set
+# CONFIG_NFS_V4 is not set
+# CONFIG_NFS_DIRECTIO is not set
+# CONFIG_NFSD is not set
+CONFIG_ROOT_NFS=y
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=y
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+# CONFIG_9P_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=y
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+# CONFIG_NLS_CODEPAGE_850 is not set
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+CONFIG_NLS_CODEPAGE_932=y
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+# CONFIG_NLS_ASCII is not set
+# CONFIG_NLS_ISO8859_1 is not set
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+# CONFIG_NLS_ISO8859_15 is not set
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+CONFIG_NLS_UTF8=y
+
+#
+# Profiling support
+#
+# CONFIG_PROFILING is not set
+
+#
+# Kernel hacking
+#
+# CONFIG_PRINTK_TIME is not set
+CONFIG_DEBUG_KERNEL=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_DETECT_SOFTLOCKUP=y
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_KOBJECT is not set
+# CONFIG_DEBUG_BUGVERBOSE is not set
+CONFIG_DEBUG_INFO=y
+# CONFIG_DEBUG_FS is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_DEBUG_STACKOVERFLOW is not set
+# CONFIG_DEBUG_STACK_USAGE is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITY is not set
+
+#
+# Cryptographic options
+#
+# CONFIG_CRYPTO is not set
+
+#
+# Hardware crypto devices
+#
+
+#
+# Library routines
+#
+# CONFIG_CRC_CCITT is not set
+# CONFIG_CRC16 is not set
+CONFIG_CRC32=y
+CONFIG_LIBCRC32C=y
+CONFIG_ZLIB_INFLATE=y
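Two of the values fixed by this defconfig, CONFIG_BUS_CLOCK=54000000 and CONFIG_TIMER_DIVIDE=128, determine the system-timer granularity. Assuming the usual latch = bus_clock / (divide * HZ) relation with HZ=100 (both assumptions; the real computation lives in arch/m32r/kernel/time.c), the numbers work out as:

#include <stdio.h>

int main(void)
{
	unsigned long bus_clock = 54000000;	/* CONFIG_BUS_CLOCK */
	unsigned long divide    = 128;		/* CONFIG_TIMER_DIVIDE */
	unsigned long hz        = 100;		/* assumed tick rate */

	unsigned long mft_freq = bus_clock / divide;	/* 421875 counts/s */
	unsigned long latch    = mft_freq / hz;		/* 4218 counts/tick */

	printf("MFT clock %lu Hz, latch %lu\n", mft_freq, latch);
	return 0;
}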
diff --git a/arch/m32r/mm/cache.c b/arch/m32r/mm/cache.c
index 31b0789c1992..9f54dd937013 100644
--- a/arch/m32r/mm/cache.c
+++ b/arch/m32r/mm/cache.c
@@ -1,7 +1,7 @@
 /*
  * linux/arch/m32r/mm/cache.c
  *
- * Copyright (C) 2002  Hirokazu Takata
+ * Copyright (C) 2002-2005  Hirokazu Takata, Hayato Fujiwara
  */
 
 #include <linux/config.h>
@@ -9,7 +9,8 @@
 
 #undef MCCR
 
-#if defined(CONFIG_CHIP_XNUX2) || defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_OPSP)
+#if defined(CONFIG_CHIP_XNUX2) || defined(CONFIG_CHIP_M32700) \
+	|| defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_OPSP)
 /* Cache Control Register */
 #define MCCR ((volatile unsigned long*)0xfffffffc)
 #define MCCR_CC (1UL << 7)	/* Cache mode modify bit */
@@ -26,7 +27,17 @@
 #define MCCR ((volatile unsigned char*)0xfffffffe)
 #define MCCR_IIV (1UL << 0)	/* I-cache invalidate */
 #define MCCR_ICACHE_INV MCCR_IIV
-#endif /* CONFIG_CHIP_XNUX2 || CONFIG_CHIP_M32700 */
+#elif defined(CONFIG_CHIP_M32104)
+#define MCCR ((volatile unsigned short*)0xfffffffe)
+#define MCCR_IIV (1UL << 8)	/* I-cache invalidate */
+#define MCCR_DIV (1UL << 9)	/* D-cache invalidate */
+#define MCCR_DCB (1UL << 10)	/* D-cache copy back */
+#define MCCR_ICM (1UL << 0)	/* I-cache mode [0:off,1:on] */
+#define MCCR_DCM (1UL << 1)	/* D-cache mode [0:off,1:on] */
+#define MCCR_ICACHE_INV MCCR_IIV
+#define MCCR_DCACHE_CB MCCR_DCB
+#define MCCR_DCACHE_CBINV (MCCR_DIV|MCCR_DCB)
+#endif
 
 #ifndef MCCR
 #error Unknown cache type.
@@ -37,29 +48,42 @@
 void _flush_cache_all(void)
 {
 #if defined(CONFIG_CHIP_M32102)
+	unsigned char mccr;
 	*MCCR = MCCR_ICACHE_INV;
+#elif defined(CONFIG_CHIP_M32104)
+	unsigned short mccr;
+
+	/* Copyback and invalidate D-cache */
+	/* Invalidate I-cache */
+	*MCCR |= (MCCR_ICACHE_INV | MCCR_DCACHE_CBINV);
 #else
 	unsigned long mccr;
 
 	/* Copyback and invalidate D-cache */
 	/* Invalidate I-cache */
 	*MCCR = MCCR_ICACHE_INV | MCCR_DCACHE_CBINV;
-	while ((mccr = *MCCR) & MCCR_IIV); /* loop while invalidating... */
 #endif
+	while ((mccr = *MCCR) & MCCR_IIV); /* loop while invalidating... */
 }
 
 /* Copy back D-cache and invalidate I-cache all */
 void _flush_cache_copyback_all(void)
 {
 #if defined(CONFIG_CHIP_M32102)
+	unsigned char mccr;
 	*MCCR = MCCR_ICACHE_INV;
+#elif defined(CONFIG_CHIP_M32104)
+	unsigned short mccr;
+
+	/* Copyback and invalidate D-cache */
+	/* Invalidate I-cache */
+	*MCCR |= (MCCR_ICACHE_INV | MCCR_DCACHE_CB);
 #else
 	unsigned long mccr;
 
 	/* Copyback D-cache */
 	/* Invalidate I-cache */
 	*MCCR = MCCR_ICACHE_INV | MCCR_DCACHE_CB;
-	while ((mccr = *MCCR) & MCCR_IIV); /* loop while invalidating... */
-
 #endif
+	while ((mccr = *MCCR) & MCCR_IIV); /* loop while invalidating... */
 }
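Note that the new M32104 branches use a read-modify-write ("*MCCR |= ...") where the older chips do a plain store, plausibly because on the M32104 MCCR also carries the cache-enable mode bits. A plain-C model of why the OR matters; the register is modeled as an ordinary variable, with bit values copied from the #defines above:

#include <stdio.h>

#define MCCR_IIV	(1u << 8)	/* I-cache invalidate */
#define MCCR_DIV	(1u << 9)	/* D-cache invalidate */
#define MCCR_DCB	(1u << 10)	/* D-cache copy back */
#define MCCR_ICM	(1u << 0)	/* I-cache mode */
#define MCCR_DCM	(1u << 1)	/* D-cache mode */

int main(void)
{
	unsigned short mccr = MCCR_ICM | MCCR_DCM;	/* caches enabled */

	/* OR-ing in the flush requests preserves the mode bits */
	mccr |= MCCR_IIV | MCCR_DIV | MCCR_DCB;
	printf("mccr = 0x%x, mode bits = 0x%x\n",
	       mccr, mccr & (MCCR_ICM | MCCR_DCM));
	return 0;
}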
diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c
index e93a5ad56496..b2c62eeb3bab 100644
--- a/arch/m68knommu/kernel/m68k_ksyms.c
+++ b/arch/m68knommu/kernel/m68k_ksyms.c
@@ -38,8 +38,6 @@ EXPORT_SYMBOL(strncmp);
 
 EXPORT_SYMBOL(ip_fast_csum);
 
-EXPORT_SYMBOL(mach_enable_irq);
-EXPORT_SYMBOL(mach_disable_irq);
 EXPORT_SYMBOL(kernel_thread);
 
 /* Networking helper routines. */
diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c
index abb80fa2b940..93120b9bfff1 100644
--- a/arch/m68knommu/kernel/setup.c
+++ b/arch/m68knommu/kernel/setup.c
@@ -65,8 +65,6 @@ void (*mach_kbd_leds) (unsigned int) = NULL;
 /* machine dependent irq functions */
 void (*mach_init_IRQ) (void) = NULL;
 irqreturn_t (*(*mach_default_handler)[]) (int, void *, struct pt_regs *) = NULL;
-void (*mach_enable_irq) (unsigned int) = NULL;
-void (*mach_disable_irq) (unsigned int) = NULL;
 int (*mach_get_irq_list) (struct seq_file *, void *) = NULL;
 void (*mach_process_int) (int irq, struct pt_regs *fp) = NULL;
 void (*mach_trap_init) (void);
diff --git a/arch/ppc/boot/simple/Makefile b/arch/ppc/boot/simple/Makefile
index f3e9c534aa82..9533f8de238f 100644
--- a/arch/ppc/boot/simple/Makefile
+++ b/arch/ppc/boot/simple/Makefile
@@ -190,6 +190,8 @@ boot-$(CONFIG_REDWOOD_5) += embed_config.o
 boot-$(CONFIG_REDWOOD_6) += embed_config.o
 boot-$(CONFIG_8xx) += embed_config.o
 boot-$(CONFIG_8260) += embed_config.o
+boot-$(CONFIG_EP405) += embed_config.o
+boot-$(CONFIG_XILINX_ML300) += embed_config.o
 boot-$(CONFIG_BSEIP) += iic.o
 boot-$(CONFIG_MBX) += iic.o pci.o qspan_pci.o
 boot-$(CONFIG_MV64X60) += misc-mv64x60.o
diff --git a/arch/ppc/kernel/idle.c b/arch/ppc/kernel/idle.c
index 821a75e45602..1be3ca5bae40 100644
--- a/arch/ppc/kernel/idle.c
+++ b/arch/ppc/kernel/idle.c
@@ -37,7 +37,6 @@
 void default_idle(void)
 {
 	void (*powersave)(void);
-	int cpu = smp_processor_id();
 
 	powersave = ppc_md.power_save;
 
@@ -47,7 +46,8 @@ void default_idle(void)
 #ifdef CONFIG_SMP
 	else {
 		set_thread_flag(TIF_POLLING_NRFLAG);
-		while (!need_resched() && !cpu_is_offline(cpu))
+		while (!need_resched() &&
+				!cpu_is_offline(smp_processor_id()))
 			barrier();
 		clear_thread_flag(TIF_POLLING_NRFLAG);
 	}
diff --git a/arch/ppc/platforms/4xx/ibm440gx.c b/arch/ppc/platforms/4xx/ibm440gx.c
index 956f45e4ef97..d24c09ee7b18 100644
--- a/arch/ppc/platforms/4xx/ibm440gx.c
+++ b/arch/ppc/platforms/4xx/ibm440gx.c
@@ -58,7 +58,6 @@ static struct ocp_func_emac_data ibm440gx_emac2_def = {
 	.wol_irq	= 65,		/* WOL interrupt number */
 	.mdio_idx	= -1,		/* No shared MDIO */
 	.tah_idx	= 0,		/* TAH device index */
-	.jumbo		= 1,		/* Jumbo frames supported */
 };
 
 static struct ocp_func_emac_data ibm440gx_emac3_def = {
@@ -72,7 +71,6 @@ static struct ocp_func_emac_data ibm440gx_emac3_def = {
 	.wol_irq	= 67,		/* WOL interrupt number */
 	.mdio_idx	= -1,		/* No shared MDIO */
 	.tah_idx	= 1,		/* TAH device index */
-	.jumbo		= 1,		/* Jumbo frames supported */
 };
 OCP_SYSFS_EMAC_DATA()
 
diff --git a/arch/ppc/platforms/4xx/ibm440sp.c b/arch/ppc/platforms/4xx/ibm440sp.c
index feb17e41ef69..71a0117d3597 100644
--- a/arch/ppc/platforms/4xx/ibm440sp.c
+++ b/arch/ppc/platforms/4xx/ibm440sp.c
@@ -31,7 +31,6 @@ static struct ocp_func_emac_data ibm440sp_emac0_def = {
 	.wol_irq	= 61,		/* WOL interrupt number */
 	.mdio_idx	= -1,		/* No shared MDIO */
 	.tah_idx	= -1,		/* No TAH */
-	.jumbo		= 1,		/* Jumbo frames supported */
 };
 OCP_SYSFS_EMAC_DATA()
 
diff --git a/arch/ppc/platforms/lite5200.c b/arch/ppc/platforms/lite5200.c
index d44cc991179f..7ed52dc340c9 100644
--- a/arch/ppc/platforms/lite5200.c
+++ b/arch/ppc/platforms/lite5200.c
@@ -196,8 +196,10 @@ platform_init(unsigned long r3, unsigned long r4, unsigned long r5,
 	mpc52xx_set_bat();
 
 	/* No ISA bus by default */
+#ifdef CONFIG_PCI
 	isa_io_base = 0;
 	isa_mem_base = 0;
+#endif
 
 	/* Powersave */
 	/* This is provided as an example on how to do it. But you
diff --git a/arch/ppc/platforms/mpc5200.c b/arch/ppc/platforms/mpc5200.c
deleted file mode 100644
index a58db438c162..000000000000
--- a/arch/ppc/platforms/mpc5200.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * arch/ppc/platforms/mpc5200.c
- *
- * OCP Definitions for the boards based on MPC5200 processor. Contains
- * definitions for every common peripherals. (Mostly all but PSCs)
- *
- * Maintainer : Sylvain Munaut <tnt@246tNt.com>
- *
- * Copyright 2004 Sylvain Munaut <tnt@246tNt.com>
- *
- * This file is licensed under the terms of the GNU General Public License
- * version 2. This program is licensed "as is" without any warranty of any
- * kind, whether express or implied.
- */
-
-#include <asm/ocp.h>
-#include <asm/mpc52xx.h>
-
-
-static struct ocp_fs_i2c_data mpc5200_i2c_def = {
-	.flags = FS_I2C_CLOCK_5200,
-};
-
-
-/* Here is the core_ocp struct.
- * With all the devices common to all board. Even if port multiplexing is
- * not setup for them (if the user don't want them, just don't select the
- * config option). The potentially conflicting devices (like PSCs) goes in
- * board specific file.
- */
-struct ocp_def core_ocp[] = {
-	{
-		.vendor = OCP_VENDOR_FREESCALE,
-		.function = OCP_FUNC_IIC,
-		.index = 0,
-		.paddr = MPC52xx_I2C1,
-		.irq = OCP_IRQ_NA, /* MPC52xx_IRQ_I2C1 - Buggy */
-		.pm = OCP_CPM_NA,
-		.additions = &mpc5200_i2c_def,
-	},
-	{
-		.vendor = OCP_VENDOR_FREESCALE,
-		.function = OCP_FUNC_IIC,
-		.index = 1,
-		.paddr = MPC52xx_I2C2,
-		.irq = OCP_IRQ_NA, /* MPC52xx_IRQ_I2C2 - Buggy */
-		.pm = OCP_CPM_NA,
-		.additions = &mpc5200_i2c_def,
-	},
-	{	/* Terminating entry */
-		.vendor = OCP_VENDOR_INVALID
-	}
-};
diff --git a/arch/ppc/syslib/mpc52xx_pci.c b/arch/ppc/syslib/mpc52xx_pci.c
index 4ac19080eb85..313c96ec7eb1 100644
--- a/arch/ppc/syslib/mpc52xx_pci.c
+++ b/arch/ppc/syslib/mpc52xx_pci.c
@@ -24,6 +24,12 @@
 #include <asm/machdep.h>
 
 
+/* This macro is defined to activate the workaround for the bug
+   435 of the MPC5200 (L25R). With it activated, we don't do any
+   32 bits configuration access during type-1 cycles */
+#define MPC5200_BUG_435_WORKAROUND
+
+
 static int
 mpc52xx_pci_read_config(struct pci_bus *bus, unsigned int devfn,
 		int offset, int len, u32 *val)
@@ -40,17 +46,39 @@ mpc52xx_pci_read_config(struct pci_bus *bus, unsigned int devfn,
40 ((bus->number - hose->bus_offset) << 16) | 46 ((bus->number - hose->bus_offset) << 16) |
41 (devfn << 8) | 47 (devfn << 8) |
42 (offset & 0xfc)); 48 (offset & 0xfc));
49 mb();
50
51#ifdef MPC5200_BUG_435_WORKAROUND
52 if (bus->number != hose->bus_offset) {
53 switch (len) {
54 case 1:
55 value = in_8(((u8 __iomem *)hose->cfg_data) + (offset & 3));
56 break;
57 case 2:
58 value = in_le16(((u16 __iomem *)hose->cfg_data) + ((offset>>1) & 1));
59 break;
60
61 default:
62 value = in_le16((u16 __iomem *)hose->cfg_data) |
63 (in_le16(((u16 __iomem *)hose->cfg_data) + 1) << 16);
64 break;
65 }
66 }
67 else
68#endif
69 {
70 value = in_le32(hose->cfg_data);
43 71
44 value = in_le32(hose->cfg_data); 72 if (len != 4) {
45 73 value >>= ((offset & 0x3) << 3);
46 if (len != 4) { 74 value &= 0xffffffff >> (32 - (len << 3));
47 value >>= ((offset & 0x3) << 3); 75 }
48 value &= 0xffffffff >> (32 - (len << 3));
49 } 76 }
50 77
51 *val = value; 78 *val = value;
52 79
53 out_be32(hose->cfg_addr, 0); 80 out_be32(hose->cfg_addr, 0);
81 mb();
54 82
55 return PCIBIOS_SUCCESSFUL; 83 return PCIBIOS_SUCCESSFUL;
56} 84}
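Both branches of the read path reduce to the same arithmetic: a 32-bit little-endian config dword is shifted so the requested byte offset lands at bit 0, then masked to the requested width. That shift-and-mask in isolation, as plain user-space C mirroring the len != 4 branch above (purely illustrative):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Extract a len-byte field (len = 1, 2 or 4) at byte offset
     * (offset & 3) from a 32-bit config dword, as the read path does. */
    static uint32_t extract_config(uint32_t dword, int offset, int len)
    {
        uint32_t value = dword;

        if (len != 4) {
            value >>= ((offset & 0x3) << 3);          /* shift field to bit 0 */
            value &= 0xffffffff >> (32 - (len << 3)); /* keep len bytes */
        }
        return value;
    }

    int main(void)
    {
        assert(extract_config(0xaabbccdd, 0, 1) == 0xdd);
        assert(extract_config(0xaabbccdd, 2, 2) == 0xaabb);
        assert(extract_config(0xaabbccdd, 0, 4) == 0xaabbccdd);
        puts("ok");
        return 0;
    }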
@@ -71,21 +99,48 @@ mpc52xx_pci_write_config(struct pci_bus *bus, unsigned int devfn,
71 ((bus->number - hose->bus_offset) << 16) | 99 ((bus->number - hose->bus_offset) << 16) |
72 (devfn << 8) | 100 (devfn << 8) |
73 (offset & 0xfc)); 101 (offset & 0xfc));
102 mb();
103
104#ifdef MPC5200_BUG_435_WORKAROUND
105 if (bus->number != hose->bus_offset) {
106 switch (len) {
107 case 1:
108 out_8(((u8 __iomem *)hose->cfg_data) +
109 (offset & 3), val);
110 break;
111 case 2:
112 out_le16(((u16 __iomem *)hose->cfg_data) +
113 ((offset>>1) & 1), val);
114 break;
115
116 default:
117 out_le16((u16 __iomem *)hose->cfg_data,
118 (u16)val);
119 out_le16(((u16 __iomem *)hose->cfg_data) + 1,
120 (u16)(val>>16));
121 break;
122 }
123 }
124 else
125#endif
126 {
127 if (len != 4) {
128 value = in_le32(hose->cfg_data);
74 129
75 if (len != 4) { 130 offset = (offset & 0x3) << 3;
76 value = in_le32(hose->cfg_data); 131 mask = (0xffffffff >> (32 - (len << 3)));
132 mask <<= offset;
77 133
78 offset = (offset & 0x3) << 3; 134 value &= ~mask;
79 mask = (0xffffffff >> (32 - (len << 3))); 135 val = value | ((val << offset) & mask);
80 mask <<= offset; 136 }
81 137
82 value &= ~mask; 138 out_le32(hose->cfg_data, val);
83 val = value | ((val << offset) & mask);
84 } 139 }
85 140 mb();
86 out_le32(hose->cfg_data, val);
87 141
88 out_be32(hose->cfg_addr, 0); 142 out_be32(hose->cfg_addr, 0);
143 mb();
89 144
90 return PCIBIOS_SUCCESSFUL; 145 return PCIBIOS_SUCCESSFUL;
91} 146}
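The non-workaround write branch is the inverse operation: read the full dword, clear the target bytes with a shifted mask, and merge in the new value. The same read-modify-write arithmetic in isolation (again user-space C, for illustration only):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Merge a len-byte write (len = 1 or 2) at byte offset (offset & 3)
     * into an existing 32-bit config dword, as the write path does. */
    static uint32_t merge_config(uint32_t old, uint32_t val, int offset, int len)
    {
        uint32_t mask;

        offset = (offset & 0x3) << 3;            /* byte offset to bit offset */
        mask = 0xffffffff >> (32 - (len << 3));  /* len-byte mask */
        mask <<= offset;

        return (old & ~mask) | ((val << offset) & mask);
    }

    int main(void)
    {
        assert(merge_config(0xaabbccdd, 0x11, 0, 1) == 0xaabbcc11);
        assert(merge_config(0xaabbccdd, 0x1122, 2, 2) == 0x1122ccdd);
        puts("ok");
        return 0;
    }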
@@ -99,9 +154,12 @@ static struct pci_ops mpc52xx_pci_ops = {
99static void __init 154static void __init
100mpc52xx_pci_setup(struct mpc52xx_pci __iomem *pci_regs) 155mpc52xx_pci_setup(struct mpc52xx_pci __iomem *pci_regs)
101{ 156{
157 u32 tmp;
102 158
103 /* Setup control regs */ 159 /* Setup control regs */
104 /* Nothing to do afaik */ 160 tmp = in_be32(&pci_regs->scr);
161 tmp |= PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY;
162 out_be32(&pci_regs->scr, tmp);
105 163
106 /* Setup windows */ 164 /* Setup windows */
107 out_be32(&pci_regs->iw0btar, MPC52xx_PCI_IWBTAR_TRANSLATION( 165 out_be32(&pci_regs->iw0btar, MPC52xx_PCI_IWBTAR_TRANSLATION(
@@ -142,16 +200,15 @@ mpc52xx_pci_setup(struct mpc52xx_pci __iomem *pci_regs)
142 /* Not necessary and can be a bad thing if for example the bootloader 200 /* Not necessary and can be a bad thing if for example the bootloader
143 is displaying a splash screen or ... Just left here for 201 is displaying a splash screen or ... Just left here for
144 documentation purposes if anyone needs it */ 202 documentation purposes if anyone needs it */
145#if 0
146 u32 tmp;
147 tmp = in_be32(&pci_regs->gscr); 203 tmp = in_be32(&pci_regs->gscr);
204#if 0
148 out_be32(&pci_regs->gscr, tmp | MPC52xx_PCI_GSCR_PR); 205 out_be32(&pci_regs->gscr, tmp | MPC52xx_PCI_GSCR_PR);
149 udelay(50); 206 udelay(50);
150 out_be32(&pci_regs->gscr, tmp);
151#endif 207#endif
208 out_be32(&pci_regs->gscr, tmp & ~MPC52xx_PCI_GSCR_PR);
152} 209}
153 210
154static void __init 211static void
155mpc52xx_pci_fixup_resources(struct pci_dev *dev) 212mpc52xx_pci_fixup_resources(struct pci_dev *dev)
156{ 213{
157 int i; 214 int i;
diff --git a/arch/ppc/syslib/mpc52xx_setup.c b/arch/ppc/syslib/mpc52xx_setup.c
index bb2374585a7b..a4a4b02227df 100644
--- a/arch/ppc/syslib/mpc52xx_setup.c
+++ b/arch/ppc/syslib/mpc52xx_setup.c
@@ -84,9 +84,11 @@ mpc52xx_set_bat(void)
84void __init 84void __init
85mpc52xx_map_io(void) 85mpc52xx_map_io(void)
86{ 86{
87 /* Here we only map the MBAR */ 87 /* Here we map the MBAR and the whole upper zone. MBAR is only
 88 64k but we can't map only 64k with BATs. Mapping the whole
 89 0xf0000000 range is ok and helps any LPB devices placed there */
88 io_block_mapping( 90 io_block_mapping(
89 MPC52xx_MBAR_VIRT, MPC52xx_MBAR, MPC52xx_MBAR_SIZE, _PAGE_IO); 91 MPC52xx_MBAR_VIRT, MPC52xx_MBAR, 0x10000000, _PAGE_IO);
90} 92}
91 93
92 94
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 477ac2758bd5..6fe532d82417 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -23,14 +23,14 @@ config GENERIC_BUST_SPINLOCK
23 23
24mainmenu "Linux Kernel Configuration" 24mainmenu "Linux Kernel Configuration"
25 25
26config ARCH_S390 26config S390
27 bool 27 bool
28 default y 28 default y
29 29
30config UID16 30config UID16
31 bool 31 bool
32 default y 32 default y
33 depends on ARCH_S390X = 'n' 33 depends on !64BIT
34 34
35source "init/Kconfig" 35source "init/Kconfig"
36 36
@@ -38,20 +38,12 @@ menu "Base setup"
38 38
39comment "Processor type and features" 39comment "Processor type and features"
40 40
41config ARCH_S390X 41config 64BIT
42 bool "64 bit kernel" 42 bool "64 bit kernel"
43 help 43 help
44 Select this option if you have a 64 bit IBM zSeries machine 44 Select this option if you have a 64 bit IBM zSeries machine
45 and want to use the 64 bit addressing mode. 45 and want to use the 64 bit addressing mode.
46 46
47config 64BIT
48 def_bool ARCH_S390X
49
50config ARCH_S390_31
51 bool
52 depends on ARCH_S390X = 'n'
53 default y
54
55config SMP 47config SMP
56 bool "Symmetric multi-processing support" 48 bool "Symmetric multi-processing support"
57 ---help--- 49 ---help---
@@ -101,20 +93,15 @@ config MATHEMU
101 on older S/390 machines. Say Y unless you know your machine doesn't 93 on older S/390 machines. Say Y unless you know your machine doesn't
102 need this. 94 need this.
103 95
104config S390_SUPPORT 96config COMPAT
105 bool "Kernel support for 31 bit emulation" 97 bool "Kernel support for 31 bit emulation"
106 depends on ARCH_S390X 98 depends on 64BIT
107 help 99 help
108 Select this option if you want to enable your system kernel to 100 Select this option if you want to enable your system kernel to
109 handle system-calls from ELF binaries for 31 bit ESA. This option 101 handle system-calls from ELF binaries for 31 bit ESA. This option
110 (and some other stuff like libraries and such) is needed for 102 (and some other stuff like libraries and such) is needed for
111 executing 31 bit applications. It is safe to say "Y". 103 executing 31 bit applications. It is safe to say "Y".
112 104
113config COMPAT
114 bool
115 depends on S390_SUPPORT
116 default y
117
118config SYSVIPC_COMPAT 105config SYSVIPC_COMPAT
119 bool 106 bool
120 depends on COMPAT && SYSVIPC 107 depends on COMPAT && SYSVIPC
@@ -122,7 +109,7 @@ config SYSVIPC_COMPAT
122 109
123config BINFMT_ELF32 110config BINFMT_ELF32
124 tristate "Kernel support for 31 bit ELF binaries" 111 tristate "Kernel support for 31 bit ELF binaries"
125 depends on S390_SUPPORT 112 depends on COMPAT
126 help 113 help
127 This allows you to run 32-bit Linux/ELF binaries on your zSeries 114 This allows you to run 32-bit Linux/ELF binaries on your zSeries
128 in 64 bit mode. Everybody wants this; say Y. 115 in 64 bit mode. Everybody wants this; say Y.
@@ -135,7 +122,7 @@ choice
135 122
136config MARCH_G5 123config MARCH_G5
137 bool "S/390 model G5 and G6" 124 bool "S/390 model G5 and G6"
138 depends on ARCH_S390_31 125 depends on !64BIT
139 help 126 help
140 Select this to build a 31 bit kernel that works 127 Select this to build a 31 bit kernel that works
141 on all S/390 and zSeries machines. 128 on all S/390 and zSeries machines.
@@ -240,8 +227,8 @@ config MACHCHK_WARNING
240config QDIO 227config QDIO
241 tristate "QDIO support" 228 tristate "QDIO support"
242 ---help--- 229 ---help---
243 This driver provides the Queued Direct I/O base support for the 230 This driver provides the Queued Direct I/O base support for
244 IBM S/390 (G5 and G6) and eServer zSeries (z800, z890, z900 and z990). 231 IBM mainframes.
245 232
246 For details please refer to the documentation provided by IBM at 233 For details please refer to the documentation provided by IBM at
247 <http://www10.software.ibm.com/developerworks/opensource/linux390> 234 <http://www10.software.ibm.com/developerworks/opensource/linux390>
@@ -263,7 +250,8 @@ config QDIO_DEBUG
263 bool "Extended debugging information" 250 bool "Extended debugging information"
264 depends on QDIO 251 depends on QDIO
265 help 252 help
266 Say Y here to get extended debugging output in /proc/s390dbf/qdio... 253 Say Y here to get extended debugging output in
254 /sys/kernel/debug/s390dbf/qdio...
267 Warning: this option reduces the performance of the QDIO module. 255 Warning: this option reduces the performance of the QDIO module.
268 256
269 If unsure, say N. 257 If unsure, say N.
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 73a09a6ee6c8..6c6b197898d0 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -13,16 +13,14 @@
13# Copyright (C) 1994 by Linus Torvalds 13# Copyright (C) 1994 by Linus Torvalds
14# 14#
15 15
16ifdef CONFIG_ARCH_S390_31 16ifndef CONFIG_64BIT
17LDFLAGS := -m elf_s390 17LDFLAGS := -m elf_s390
18CFLAGS += -m31 18CFLAGS += -m31
19AFLAGS += -m31 19AFLAGS += -m31
20UTS_MACHINE := s390 20UTS_MACHINE := s390
21STACK_SIZE := 8192 21STACK_SIZE := 8192
22CHECKFLAGS += -D__s390__ 22CHECKFLAGS += -D__s390__
23endif 23else
24
25ifdef CONFIG_ARCH_S390X
26LDFLAGS := -m elf64_s390 24LDFLAGS := -m elf64_s390
27MODFLAGS += -fpic -D__PIC__ 25MODFLAGS += -fpic -D__PIC__
28CFLAGS += -m64 26CFLAGS += -m64
diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index dee6ab54984d..d06a8d71c71d 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -40,7 +40,7 @@
40 40
41#define TOD_MICRO 0x01000 /* nr. of TOD clock units 41#define TOD_MICRO 0x01000 /* nr. of TOD clock units
42 for 1 microsecond */ 42 for 1 microsecond */
43#ifndef CONFIG_ARCH_S390X 43#ifndef CONFIG_64BIT
44 44
45#define APPLDATA_START_INTERVAL_REC 0x00 /* Function codes for */ 45#define APPLDATA_START_INTERVAL_REC 0x00 /* Function codes for */
46#define APPLDATA_STOP_REC 0x01 /* DIAG 0xDC */ 46#define APPLDATA_STOP_REC 0x01 /* DIAG 0xDC */
@@ -54,13 +54,13 @@
54#define APPLDATA_GEN_EVENT_RECORD 0x82 54#define APPLDATA_GEN_EVENT_RECORD 0x82
55#define APPLDATA_START_CONFIG_REC 0x83 55#define APPLDATA_START_CONFIG_REC 0x83
56 56
57#endif /* CONFIG_ARCH_S390X */ 57#endif /* CONFIG_64BIT */
58 58
59 59
60/* 60/*
61 * Parameter list for DIAGNOSE X'DC' 61 * Parameter list for DIAGNOSE X'DC'
62 */ 62 */
63#ifndef CONFIG_ARCH_S390X 63#ifndef CONFIG_64BIT
64struct appldata_parameter_list { 64struct appldata_parameter_list {
65 u16 diag; /* The DIAGNOSE code X'00DC' */ 65 u16 diag; /* The DIAGNOSE code X'00DC' */
66 u8 function; /* The function code for the DIAGNOSE */ 66 u8 function; /* The function code for the DIAGNOSE */
@@ -82,7 +82,7 @@ struct appldata_parameter_list {
82 u64 product_id_addr; 82 u64 product_id_addr;
83 u64 buffer_addr; 83 u64 buffer_addr;
84}; 84};
85#endif /* CONFIG_ARCH_S390X */ 85#endif /* CONFIG_64BIT */
86 86
87/* 87/*
88 * /proc entries (sysctl) 88 * /proc entries (sysctl)
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index e0a476bf4fd6..99ddd3bf2fba 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -141,19 +141,19 @@ static void appldata_get_os_data(void *data)
141 j = 0; 141 j = 0;
142 for_each_online_cpu(i) { 142 for_each_online_cpu(i) {
143 os_data->os_cpu[j].per_cpu_user = 143 os_data->os_cpu[j].per_cpu_user =
144 kstat_cpu(i).cpustat.user; 144 cputime_to_jiffies(kstat_cpu(i).cpustat.user);
145 os_data->os_cpu[j].per_cpu_nice = 145 os_data->os_cpu[j].per_cpu_nice =
146 kstat_cpu(i).cpustat.nice; 146 cputime_to_jiffies(kstat_cpu(i).cpustat.nice);
147 os_data->os_cpu[j].per_cpu_system = 147 os_data->os_cpu[j].per_cpu_system =
148 kstat_cpu(i).cpustat.system; 148 cputime_to_jiffies(kstat_cpu(i).cpustat.system);
149 os_data->os_cpu[j].per_cpu_idle = 149 os_data->os_cpu[j].per_cpu_idle =
150 kstat_cpu(i).cpustat.idle; 150 cputime_to_jiffies(kstat_cpu(i).cpustat.idle);
151 os_data->os_cpu[j].per_cpu_irq = 151 os_data->os_cpu[j].per_cpu_irq =
152 kstat_cpu(i).cpustat.irq; 152 cputime_to_jiffies(kstat_cpu(i).cpustat.irq);
153 os_data->os_cpu[j].per_cpu_softirq = 153 os_data->os_cpu[j].per_cpu_softirq =
154 kstat_cpu(i).cpustat.softirq; 154 cputime_to_jiffies(kstat_cpu(i).cpustat.softirq);
155 os_data->os_cpu[j].per_cpu_iowait = 155 os_data->os_cpu[j].per_cpu_iowait =
156 kstat_cpu(i).cpustat.iowait; 156 cputime_to_jiffies(kstat_cpu(i).cpustat.iowait);
157 j++; 157 j++;
158 } 158 }
159 159
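The per-cpu accounting fields are cputime_t values, whose unit is architecture-defined, while the appldata record expects jiffies; hence every field is now wrapped in cputime_to_jiffies(). A rough stand-alone illustration of the conversion; the scale factor assumes the s390 cputime granularity of this era (1/4096 microsecond) and is an assumption here, not taken from this diff:

    #include <stdint.h>
    #include <stdio.h>

    #define HZ 100
    /* Assumed scale: cputime ticks of 1/4096 us, so 4096 * 1000000 / HZ
     * cputime units per jiffy. Treat as an opaque illustrative constant. */
    #define CPUTIME_PER_JIFFY (4096ULL * 1000000 / HZ)

    static uint64_t cputime_to_jiffies(uint64_t ct)
    {
        return ct / CPUTIME_PER_JIFFY;
    }

    int main(void)
    {
        uint64_t user = 3 * CPUTIME_PER_JIFFY;  /* fake raw cputime sample */

        printf("%llu jiffies\n",
               (unsigned long long)cputime_to_jiffies(user));
        return 0;
    }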
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 96a05e6b51e0..bfe2541dc5cf 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -2,7 +2,9 @@
2# Cryptographic API 2# Cryptographic API
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_SHA1_Z990) += sha1_z990.o 5obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o
6obj-$(CONFIG_CRYPTO_DES_Z990) += des_z990.o des_check_key.o 6obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o
7obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o des_check_key.o
8obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
7 9
8obj-$(CONFIG_CRYPTO_TEST) += crypt_z990_query.o 10obj-$(CONFIG_CRYPTO_TEST) += crypt_s390_query.o
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
new file mode 100644
index 000000000000..7a1033d8e00f
--- /dev/null
+++ b/arch/s390/crypto/aes_s390.c
@@ -0,0 +1,248 @@
1/*
2 * Cryptographic API.
3 *
4 * s390 implementation of the AES Cipher Algorithm.
5 *
6 * s390 Version:
7 * Copyright (C) 2005 IBM Deutschland GmbH, IBM Corporation
8 * Author(s): Jan Glauber (jang@de.ibm.com)
9 *
10 * Derived from "crypto/aes.c"
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 */
18
19#include <linux/module.h>
20#include <linux/init.h>
21#include <linux/crypto.h>
22#include "crypt_s390.h"
23
24#define AES_MIN_KEY_SIZE 16
25#define AES_MAX_KEY_SIZE 32
26
27/* data block size for all key lengths */
28#define AES_BLOCK_SIZE 16
29
30int has_aes_128 = 0;
31int has_aes_192 = 0;
32int has_aes_256 = 0;
33
34struct s390_aes_ctx {
35 u8 iv[AES_BLOCK_SIZE];
36 u8 key[AES_MAX_KEY_SIZE];
37 int key_len;
38};
39
40static int aes_set_key(void *ctx, const u8 *in_key, unsigned int key_len,
41 u32 *flags)
42{
43 struct s390_aes_ctx *sctx = ctx;
44
45 switch (key_len) {
46 case 16:
47 if (!has_aes_128)
48 goto fail;
49 break;
50 case 24:
51 if (!has_aes_192)
52 goto fail;
53
54 break;
55 case 32:
56 if (!has_aes_256)
57 goto fail;
58 break;
59 default:
60 /* invalid key length */
61 goto fail;
62 break;
63 }
64
65 sctx->key_len = key_len;
66 memcpy(sctx->key, in_key, key_len);
67 return 0;
68fail:
69 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
70 return -EINVAL;
71}
72
73static void aes_encrypt(void *ctx, u8 *out, const u8 *in)
74{
75 const struct s390_aes_ctx *sctx = ctx;
76
77 switch (sctx->key_len) {
78 case 16:
79 crypt_s390_km(KM_AES_128_ENCRYPT, &sctx->key, out, in,
80 AES_BLOCK_SIZE);
81 break;
82 case 24:
83 crypt_s390_km(KM_AES_192_ENCRYPT, &sctx->key, out, in,
84 AES_BLOCK_SIZE);
85 break;
86 case 32:
87 crypt_s390_km(KM_AES_256_ENCRYPT, &sctx->key, out, in,
88 AES_BLOCK_SIZE);
89 break;
90 }
91}
92
93static void aes_decrypt(void *ctx, u8 *out, const u8 *in)
94{
95 const struct s390_aes_ctx *sctx = ctx;
96
97 switch (sctx->key_len) {
98 case 16:
99 crypt_s390_km(KM_AES_128_DECRYPT, &sctx->key, out, in,
100 AES_BLOCK_SIZE);
101 break;
102 case 24:
103 crypt_s390_km(KM_AES_192_DECRYPT, &sctx->key, out, in,
104 AES_BLOCK_SIZE);
105 break;
106 case 32:
107 crypt_s390_km(KM_AES_256_DECRYPT, &sctx->key, out, in,
108 AES_BLOCK_SIZE);
109 break;
110 }
111}
112
113static unsigned int aes_encrypt_ecb(const struct cipher_desc *desc, u8 *out,
114 const u8 *in, unsigned int nbytes)
115{
116 struct s390_aes_ctx *sctx = crypto_tfm_ctx(desc->tfm);
117
118 switch (sctx->key_len) {
119 case 16:
120 crypt_s390_km(KM_AES_128_ENCRYPT, &sctx->key, out, in, nbytes);
121 break;
122 case 24:
123 crypt_s390_km(KM_AES_192_ENCRYPT, &sctx->key, out, in, nbytes);
124 break;
125 case 32:
126 crypt_s390_km(KM_AES_256_ENCRYPT, &sctx->key, out, in, nbytes);
127 break;
128 }
129 return nbytes & ~(AES_BLOCK_SIZE - 1);
130}
131
132static unsigned int aes_decrypt_ecb(const struct cipher_desc *desc, u8 *out,
133 const u8 *in, unsigned int nbytes)
134{
135 struct s390_aes_ctx *sctx = crypto_tfm_ctx(desc->tfm);
136
137 switch (sctx->key_len) {
138 case 16:
139 crypt_s390_km(KM_AES_128_DECRYPT, &sctx->key, out, in, nbytes);
140 break;
141 case 24:
142 crypt_s390_km(KM_AES_192_DECRYPT, &sctx->key, out, in, nbytes);
143 break;
144 case 32:
145 crypt_s390_km(KM_AES_256_DECRYPT, &sctx->key, out, in, nbytes);
146 break;
147 }
148 return nbytes & ~(AES_BLOCK_SIZE - 1);
149}
150
151static unsigned int aes_encrypt_cbc(const struct cipher_desc *desc, u8 *out,
152 const u8 *in, unsigned int nbytes)
153{
154 struct s390_aes_ctx *sctx = crypto_tfm_ctx(desc->tfm);
155
156 memcpy(&sctx->iv, desc->info, AES_BLOCK_SIZE);
157 switch (sctx->key_len) {
158 case 16:
159 crypt_s390_kmc(KMC_AES_128_ENCRYPT, &sctx->iv, out, in, nbytes);
160 break;
161 case 24:
162 crypt_s390_kmc(KMC_AES_192_ENCRYPT, &sctx->iv, out, in, nbytes);
163 break;
164 case 32:
165 crypt_s390_kmc(KMC_AES_256_ENCRYPT, &sctx->iv, out, in, nbytes);
166 break;
167 }
168 memcpy(desc->info, &sctx->iv, AES_BLOCK_SIZE);
169
170 return nbytes & ~(AES_BLOCK_SIZE - 1);
171}
172
173static unsigned int aes_decrypt_cbc(const struct cipher_desc *desc, u8 *out,
174 const u8 *in, unsigned int nbytes)
175{
176 struct s390_aes_ctx *sctx = crypto_tfm_ctx(desc->tfm);
177
178 memcpy(&sctx->iv, desc->info, AES_BLOCK_SIZE);
179 switch (sctx->key_len) {
180 case 16:
181 crypt_s390_kmc(KMC_AES_128_DECRYPT, &sctx->iv, out, in, nbytes);
182 break;
183 case 24:
184 crypt_s390_kmc(KMC_AES_192_DECRYPT, &sctx->iv, out, in, nbytes);
185 break;
186 case 32:
187 crypt_s390_kmc(KMC_AES_256_DECRYPT, &sctx->iv, out, in, nbytes);
188 break;
189 }
190 return nbytes & ~(AES_BLOCK_SIZE - 1);
191}
192
193
194static struct crypto_alg aes_alg = {
195 .cra_name = "aes",
196 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
197 .cra_blocksize = AES_BLOCK_SIZE,
198 .cra_ctxsize = sizeof(struct s390_aes_ctx),
199 .cra_module = THIS_MODULE,
200 .cra_list = LIST_HEAD_INIT(aes_alg.cra_list),
201 .cra_u = {
202 .cipher = {
203 .cia_min_keysize = AES_MIN_KEY_SIZE,
204 .cia_max_keysize = AES_MAX_KEY_SIZE,
205 .cia_setkey = aes_set_key,
206 .cia_encrypt = aes_encrypt,
207 .cia_decrypt = aes_decrypt,
208 .cia_encrypt_ecb = aes_encrypt_ecb,
209 .cia_decrypt_ecb = aes_decrypt_ecb,
210 .cia_encrypt_cbc = aes_encrypt_cbc,
211 .cia_decrypt_cbc = aes_decrypt_cbc,
212 }
213 }
214};
215
216static int __init aes_init(void)
217{
218 int ret;
219
220 if (crypt_s390_func_available(KM_AES_128_ENCRYPT))
221 has_aes_128 = 1;
222 if (crypt_s390_func_available(KM_AES_192_ENCRYPT))
223 has_aes_192 = 1;
224 if (crypt_s390_func_available(KM_AES_256_ENCRYPT))
225 has_aes_256 = 1;
226
227 if (!has_aes_128 && !has_aes_192 && !has_aes_256)
228 return -ENOSYS;
229
230 ret = crypto_register_alg(&aes_alg);
231 if (ret != 0)
232 printk(KERN_INFO "crypt_s390: aes_s390 couldn't be loaded.\n");
233 return ret;
234}
235
236static void __exit aes_fini(void)
237{
238 crypto_unregister_alg(&aes_alg);
239}
240
241module_init(aes_init);
242module_exit(aes_fini);
243
244MODULE_ALIAS("aes");
245
246MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
247MODULE_LICENSE("GPL");
248
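Note that every ecb/cbc helper above returns nbytes rounded down to a whole number of AES blocks: the KM/KMC instructions only consume complete blocks, and the return value tells the crypto layer how many input bytes were actually processed. The rounding in isolation:

    #include <assert.h>
    #include <stdio.h>

    #define AES_BLOCK_SIZE 16

    /* Round a byte count down to a whole number of AES blocks, as the
     * ecb/cbc helpers do when reporting how much input they consumed. */
    static unsigned int blocks_processed(unsigned int nbytes)
    {
        return nbytes & ~(AES_BLOCK_SIZE - 1);
    }

    int main(void)
    {
        assert(blocks_processed(16) == 16);
        assert(blocks_processed(31) == 16);
        assert(blocks_processed(15) == 0);
        puts("ok");
        return 0;
    }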
diff --git a/arch/s390/crypto/crypt_z990.h b/arch/s390/crypto/crypt_s390.h
index 4df660b99e5a..d1c259a7fe33 100644
--- a/arch/s390/crypto/crypt_z990.h
+++ b/arch/s390/crypto/crypt_s390.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Cryptographic API. 2 * Cryptographic API.
3 * 3 *
4 * Support for z990 cryptographic instructions. 4 * Support for s390 cryptographic instructions.
5 * 5 *
6 * Copyright (C) 2003 IBM Deutschland GmbH, IBM Corporation 6 * Copyright (C) 2003 IBM Deutschland GmbH, IBM Corporation
7 * Author(s): Thomas Spatzier (tspat@de.ibm.com) 7 * Author(s): Thomas Spatzier (tspat@de.ibm.com)
@@ -12,84 +12,108 @@
12 * any later version. 12 * any later version.
13 * 13 *
14 */ 14 */
15#ifndef _CRYPTO_ARCH_S390_CRYPT_Z990_H 15#ifndef _CRYPTO_ARCH_S390_CRYPT_S390_H
16#define _CRYPTO_ARCH_S390_CRYPT_Z990_H 16#define _CRYPTO_ARCH_S390_CRYPT_S390_H
17 17
18#include <asm/errno.h> 18#include <asm/errno.h>
19 19
20#define CRYPT_Z990_OP_MASK 0xFF00 20#define CRYPT_S390_OP_MASK 0xFF00
21#define CRYPT_Z990_FUNC_MASK 0x00FF 21#define CRYPT_S390_FUNC_MASK 0x00FF
22 22
 23 23/* s390 cryptographic operations */
24/*z990 cryptographic operations*/ 24enum crypt_s390_operations {
25enum crypt_z990_operations { 25 CRYPT_S390_KM = 0x0100,
26 CRYPT_Z990_KM = 0x0100, 26 CRYPT_S390_KMC = 0x0200,
27 CRYPT_Z990_KMC = 0x0200, 27 CRYPT_S390_KIMD = 0x0300,
28 CRYPT_Z990_KIMD = 0x0300, 28 CRYPT_S390_KLMD = 0x0400,
29 CRYPT_Z990_KLMD = 0x0400, 29 CRYPT_S390_KMAC = 0x0500
30 CRYPT_Z990_KMAC = 0x0500
31}; 30};
32 31
33/*function codes for KM (CIPHER MESSAGE) instruction*/ 32/* function codes for KM (CIPHER MESSAGE) instruction
34enum crypt_z990_km_func { 33 * 0x80 is the decipher modifier bit
35 KM_QUERY = CRYPT_Z990_KM | 0, 34 */
36 KM_DEA_ENCRYPT = CRYPT_Z990_KM | 1, 35enum crypt_s390_km_func {
37 KM_DEA_DECRYPT = CRYPT_Z990_KM | 1 | 0x80, //modifier bit->decipher 36 KM_QUERY = CRYPT_S390_KM | 0x0,
38 KM_TDEA_128_ENCRYPT = CRYPT_Z990_KM | 2, 37 KM_DEA_ENCRYPT = CRYPT_S390_KM | 0x1,
39 KM_TDEA_128_DECRYPT = CRYPT_Z990_KM | 2 | 0x80, 38 KM_DEA_DECRYPT = CRYPT_S390_KM | 0x1 | 0x80,
40 KM_TDEA_192_ENCRYPT = CRYPT_Z990_KM | 3, 39 KM_TDEA_128_ENCRYPT = CRYPT_S390_KM | 0x2,
41 KM_TDEA_192_DECRYPT = CRYPT_Z990_KM | 3 | 0x80, 40 KM_TDEA_128_DECRYPT = CRYPT_S390_KM | 0x2 | 0x80,
41 KM_TDEA_192_ENCRYPT = CRYPT_S390_KM | 0x3,
42 KM_TDEA_192_DECRYPT = CRYPT_S390_KM | 0x3 | 0x80,
43 KM_AES_128_ENCRYPT = CRYPT_S390_KM | 0x12,
44 KM_AES_128_DECRYPT = CRYPT_S390_KM | 0x12 | 0x80,
45 KM_AES_192_ENCRYPT = CRYPT_S390_KM | 0x13,
46 KM_AES_192_DECRYPT = CRYPT_S390_KM | 0x13 | 0x80,
47 KM_AES_256_ENCRYPT = CRYPT_S390_KM | 0x14,
48 KM_AES_256_DECRYPT = CRYPT_S390_KM | 0x14 | 0x80,
42}; 49};
43 50
44/*function codes for KMC (CIPHER MESSAGE WITH CHAINING) instruction*/ 51/* function codes for KMC (CIPHER MESSAGE WITH CHAINING)
45enum crypt_z990_kmc_func { 52 * instruction
46 KMC_QUERY = CRYPT_Z990_KMC | 0, 53 */
47 KMC_DEA_ENCRYPT = CRYPT_Z990_KMC | 1, 54enum crypt_s390_kmc_func {
48 KMC_DEA_DECRYPT = CRYPT_Z990_KMC | 1 | 0x80, //modifier bit->decipher 55 KMC_QUERY = CRYPT_S390_KMC | 0x0,
49 KMC_TDEA_128_ENCRYPT = CRYPT_Z990_KMC | 2, 56 KMC_DEA_ENCRYPT = CRYPT_S390_KMC | 0x1,
50 KMC_TDEA_128_DECRYPT = CRYPT_Z990_KMC | 2 | 0x80, 57 KMC_DEA_DECRYPT = CRYPT_S390_KMC | 0x1 | 0x80,
51 KMC_TDEA_192_ENCRYPT = CRYPT_Z990_KMC | 3, 58 KMC_TDEA_128_ENCRYPT = CRYPT_S390_KMC | 0x2,
52 KMC_TDEA_192_DECRYPT = CRYPT_Z990_KMC | 3 | 0x80, 59 KMC_TDEA_128_DECRYPT = CRYPT_S390_KMC | 0x2 | 0x80,
60 KMC_TDEA_192_ENCRYPT = CRYPT_S390_KMC | 0x3,
61 KMC_TDEA_192_DECRYPT = CRYPT_S390_KMC | 0x3 | 0x80,
62 KMC_AES_128_ENCRYPT = CRYPT_S390_KMC | 0x12,
63 KMC_AES_128_DECRYPT = CRYPT_S390_KMC | 0x12 | 0x80,
64 KMC_AES_192_ENCRYPT = CRYPT_S390_KMC | 0x13,
65 KMC_AES_192_DECRYPT = CRYPT_S390_KMC | 0x13 | 0x80,
66 KMC_AES_256_ENCRYPT = CRYPT_S390_KMC | 0x14,
67 KMC_AES_256_DECRYPT = CRYPT_S390_KMC | 0x14 | 0x80,
53}; 68};
54 69
55/*function codes for KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) instruction*/ 70/* function codes for KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST)
56enum crypt_z990_kimd_func { 71 * instruction
57 KIMD_QUERY = CRYPT_Z990_KIMD | 0, 72 */
58 KIMD_SHA_1 = CRYPT_Z990_KIMD | 1, 73enum crypt_s390_kimd_func {
74 KIMD_QUERY = CRYPT_S390_KIMD | 0,
75 KIMD_SHA_1 = CRYPT_S390_KIMD | 1,
76 KIMD_SHA_256 = CRYPT_S390_KIMD | 2,
59}; 77};
60 78
61/*function codes for KLMD (COMPUTE LAST MESSAGE DIGEST) instruction*/ 79/* function codes for KLMD (COMPUTE LAST MESSAGE DIGEST)
62enum crypt_z990_klmd_func { 80 * instruction
63 KLMD_QUERY = CRYPT_Z990_KLMD | 0, 81 */
64 KLMD_SHA_1 = CRYPT_Z990_KLMD | 1, 82enum crypt_s390_klmd_func {
83 KLMD_QUERY = CRYPT_S390_KLMD | 0,
84 KLMD_SHA_1 = CRYPT_S390_KLMD | 1,
85 KLMD_SHA_256 = CRYPT_S390_KLMD | 2,
65}; 86};
66 87
67/*function codes for KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) instruction*/ 88/* function codes for KMAC (COMPUTE MESSAGE AUTHENTICATION CODE)
68enum crypt_z990_kmac_func { 89 * instruction
69 KMAC_QUERY = CRYPT_Z990_KMAC | 0, 90 */
70 KMAC_DEA = CRYPT_Z990_KMAC | 1, 91enum crypt_s390_kmac_func {
71 KMAC_TDEA_128 = CRYPT_Z990_KMAC | 2, 92 KMAC_QUERY = CRYPT_S390_KMAC | 0,
72 KMAC_TDEA_192 = CRYPT_Z990_KMAC | 3 93 KMAC_DEA = CRYPT_S390_KMAC | 1,
94 KMAC_TDEA_128 = CRYPT_S390_KMAC | 2,
95 KMAC_TDEA_192 = CRYPT_S390_KMAC | 3
73}; 96};
74 97
75/*status word for z990 crypto instructions' QUERY functions*/ 98/* status word for s390 crypto instructions' QUERY functions */
76struct crypt_z990_query_status { 99struct crypt_s390_query_status {
77 u64 high; 100 u64 high;
78 u64 low; 101 u64 low;
79}; 102};
80 103
81/* 104/*
82 * Standard fixup and ex_table sections for crypt_z990 inline functions. 105 * Standard fixup and ex_table sections for crypt_s390 inline functions.
83 * label 0: the z990 crypto operation 106 * label 0: the s390 crypto operation
84 * label 1: just after 1 to catch illegal operation exception on non-z990 107 * label 1: just after 1 to catch illegal operation exception
108 * (unsupported model)
85 * label 6: the return point after fixup 109 * label 6: the return point after fixup
86 * label 7: set error value if exception _in_ crypto operation 110 * label 7: set error value if exception _in_ crypto operation
87 * label 8: set error value if illegal operation exception 111 * label 8: set error value if illegal operation exception
88 * [ret] is the variable to receive the error code 112 * [ret] is the variable to receive the error code
89 * [ERR] is the error code value 113 * [ERR] is the error code value
90 */ 114 */
91#ifndef __s390x__ 115#ifndef CONFIG_64BIT
92#define __crypt_z990_fixup \ 116#define __crypt_s390_fixup \
93 ".section .fixup,\"ax\" \n" \ 117 ".section .fixup,\"ax\" \n" \
94 "7: lhi %0,%h[e1] \n" \ 118 "7: lhi %0,%h[e1] \n" \
95 " bras 1,9f \n" \ 119 " bras 1,9f \n" \
@@ -105,8 +129,8 @@ struct crypt_z990_query_status {
105 " .long 0b,7b \n" \ 129 " .long 0b,7b \n" \
106 " .long 1b,8b \n" \ 130 " .long 1b,8b \n" \
107 ".previous" 131 ".previous"
108#else /* __s390x__ */ 132#else /* CONFIG_64BIT */
109#define __crypt_z990_fixup \ 133#define __crypt_s390_fixup \
110 ".section .fixup,\"ax\" \n" \ 134 ".section .fixup,\"ax\" \n" \
111 "7: lhi %0,%h[e1] \n" \ 135 "7: lhi %0,%h[e1] \n" \
112 " jg 6b \n" \ 136 " jg 6b \n" \
@@ -118,25 +142,25 @@ struct crypt_z990_query_status {
118 " .quad 0b,7b \n" \ 142 " .quad 0b,7b \n" \
119 " .quad 1b,8b \n" \ 143 " .quad 1b,8b \n" \
120 ".previous" 144 ".previous"
121#endif /* __s390x__ */ 145#endif /* CONFIG_64BIT */
122 146
123/* 147/*
124 * Standard code for setting the result of z990 crypto instructions. 148 * Standard code for setting the result of s390 crypto instructions.
125 * %0: the register which will receive the result 149 * %0: the register which will receive the result
126 * [result]: the register containing the result (e.g. second operand length 150 * [result]: the register containing the result (e.g. second operand length
127 * to compute number of processed bytes). 151 * to compute number of processed bytes).
128 */ 152 */
129#ifndef __s390x__ 153#ifndef CONFIG_64BIT
130#define __crypt_z990_set_result \ 154#define __crypt_s390_set_result \
131 " lr %0,%[result] \n" 155 " lr %0,%[result] \n"
132#else /* __s390x__ */ 156#else /* CONFIG_64BIT */
133#define __crypt_z990_set_result \ 157#define __crypt_s390_set_result \
134 " lgr %0,%[result] \n" 158 " lgr %0,%[result] \n"
135#endif 159#endif
136 160
137/* 161/*
138 * Executes the KM (CIPHER MESSAGE) operation of the z990 CPU. 162 * Executes the KM (CIPHER MESSAGE) operation of the CPU.
139 * @param func: the function code passed to KM; see crypt_z990_km_func 163 * @param func: the function code passed to KM; see crypt_s390_km_func
140 * @param param: address of parameter block; see POP for details on each func 164 * @param param: address of parameter block; see POP for details on each func
141 * @param dest: address of destination memory area 165 * @param dest: address of destination memory area
142 * @param src: address of source memory area 166 * @param src: address of source memory area
@@ -145,9 +169,9 @@ struct crypt_z990_query_status {
145 * for encryption/decryption funcs 169 * for encryption/decryption funcs
146 */ 170 */
147static inline int 171static inline int
148crypt_z990_km(long func, void* param, u8* dest, const u8* src, long src_len) 172crypt_s390_km(long func, void* param, u8* dest, const u8* src, long src_len)
149{ 173{
150 register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK; 174 register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
151 register void* __param asm("1") = param; 175 register void* __param asm("1") = param;
152 register u8* __dest asm("4") = dest; 176 register u8* __dest asm("4") = dest;
153 register const u8* __src asm("2") = src; 177 register const u8* __src asm("2") = src;
@@ -156,26 +180,26 @@ crypt_z990_km(long func, void* param, u8* dest, const u8* src, long src_len)
156 180
157 ret = 0; 181 ret = 0;
158 __asm__ __volatile__ ( 182 __asm__ __volatile__ (
159 "0: .insn rre,0xB92E0000,%1,%2 \n" //KM opcode 183 "0: .insn rre,0xB92E0000,%1,%2 \n" /* KM opcode */
160 "1: brc 1,0b \n" //handle partial completion 184 "1: brc 1,0b \n" /* handle partial completion */
161 __crypt_z990_set_result 185 __crypt_s390_set_result
162 "6: \n" 186 "6: \n"
163 __crypt_z990_fixup 187 __crypt_s390_fixup
164 : "+d" (ret), "+a" (__dest), "+a" (__src), 188 : "+d" (ret), "+a" (__dest), "+a" (__src),
165 [result] "+d" (__src_len) 189 [result] "+d" (__src_len)
166 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func), 190 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
167 "a" (__param) 191 "a" (__param)
168 : "cc", "memory" 192 : "cc", "memory"
169 ); 193 );
170 if (ret >= 0 && func & CRYPT_Z990_FUNC_MASK){ 194 if (ret >= 0 && func & CRYPT_S390_FUNC_MASK){
171 ret = src_len - ret; 195 ret = src_len - ret;
172 } 196 }
173 return ret; 197 return ret;
174} 198}
175 199
176/* 200/*
177 * Executes the KMC (CIPHER MESSAGE WITH CHAINING) operation of the z990 CPU. 201 * Executes the KMC (CIPHER MESSAGE WITH CHAINING) operation of the CPU.
178 * @param func: the function code passed to KM; see crypt_z990_kmc_func 202 * @param func: the function code passed to KM; see crypt_s390_kmc_func
179 * @param param: address of parameter block; see POP for details on each func 203 * @param param: address of parameter block; see POP for details on each func
180 * @param dest: address of destination memory area 204 * @param dest: address of destination memory area
181 * @param src: address of source memory area 205 * @param src: address of source memory area
@@ -184,9 +208,9 @@ crypt_z990_km(long func, void* param, u8* dest, const u8* src, long src_len)
184 * for encryption/decryption funcs 208 * for encryption/decryption funcs
185 */ 209 */
186static inline int 210static inline int
187crypt_z990_kmc(long func, void* param, u8* dest, const u8* src, long src_len) 211crypt_s390_kmc(long func, void* param, u8* dest, const u8* src, long src_len)
188{ 212{
189 register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK; 213 register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
190 register void* __param asm("1") = param; 214 register void* __param asm("1") = param;
191 register u8* __dest asm("4") = dest; 215 register u8* __dest asm("4") = dest;
192 register const u8* __src asm("2") = src; 216 register const u8* __src asm("2") = src;
@@ -195,18 +219,18 @@ crypt_z990_kmc(long func, void* param, u8* dest, const u8* src, long src_len)
195 219
196 ret = 0; 220 ret = 0;
197 __asm__ __volatile__ ( 221 __asm__ __volatile__ (
198 "0: .insn rre,0xB92F0000,%1,%2 \n" //KMC opcode 222 "0: .insn rre,0xB92F0000,%1,%2 \n" /* KMC opcode */
199 "1: brc 1,0b \n" //handle partial completion 223 "1: brc 1,0b \n" /* handle partial completion */
200 __crypt_z990_set_result 224 __crypt_s390_set_result
201 "6: \n" 225 "6: \n"
202 __crypt_z990_fixup 226 __crypt_s390_fixup
203 : "+d" (ret), "+a" (__dest), "+a" (__src), 227 : "+d" (ret), "+a" (__dest), "+a" (__src),
204 [result] "+d" (__src_len) 228 [result] "+d" (__src_len)
205 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func), 229 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
206 "a" (__param) 230 "a" (__param)
207 : "cc", "memory" 231 : "cc", "memory"
208 ); 232 );
209 if (ret >= 0 && func & CRYPT_Z990_FUNC_MASK){ 233 if (ret >= 0 && func & CRYPT_S390_FUNC_MASK){
210 ret = src_len - ret; 234 ret = src_len - ret;
211 } 235 }
212 return ret; 236 return ret;
@@ -214,8 +238,8 @@ crypt_z990_kmc(long func, void* param, u8* dest, const u8* src, long src_len)
214 238
215/* 239/*
216 * Executes the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) operation 240 * Executes the KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) operation
217 * of the z990 CPU. 241 * of the CPU.
218 * @param func: the function code passed to KM; see crypt_z990_kimd_func 242 * @param func: the function code passed to KM; see crypt_s390_kimd_func
219 * @param param: address of parameter block; see POP for details on each func 243 * @param param: address of parameter block; see POP for details on each func
220 * @param src: address of source memory area 244 * @param src: address of source memory area
221 * @param src_len: length of src operand in bytes 245 * @param src_len: length of src operand in bytes
@@ -223,9 +247,9 @@ crypt_z990_kmc(long func, void* param, u8* dest, const u8* src, long src_len)
223 * for digest funcs 247 * for digest funcs
224 */ 248 */
225static inline int 249static inline int
226crypt_z990_kimd(long func, void* param, const u8* src, long src_len) 250crypt_s390_kimd(long func, void* param, const u8* src, long src_len)
227{ 251{
228 register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK; 252 register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
229 register void* __param asm("1") = param; 253 register void* __param asm("1") = param;
230 register const u8* __src asm("2") = src; 254 register const u8* __src asm("2") = src;
231 register long __src_len asm("3") = src_len; 255 register long __src_len asm("3") = src_len;
@@ -233,25 +257,25 @@ crypt_z990_kimd(long func, void* param, const u8* src, long src_len)
233 257
234 ret = 0; 258 ret = 0;
235 __asm__ __volatile__ ( 259 __asm__ __volatile__ (
236 "0: .insn rre,0xB93E0000,%1,%1 \n" //KIMD opcode 260 "0: .insn rre,0xB93E0000,%1,%1 \n" /* KIMD opcode */
237 "1: brc 1,0b \n" /*handle partical completion of kimd*/ 261 "1: brc 1,0b \n" /* handle partical completion */
238 __crypt_z990_set_result 262 __crypt_s390_set_result
239 "6: \n" 263 "6: \n"
240 __crypt_z990_fixup 264 __crypt_s390_fixup
241 : "+d" (ret), "+a" (__src), [result] "+d" (__src_len) 265 : "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
242 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func), 266 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
243 "a" (__param) 267 "a" (__param)
244 : "cc", "memory" 268 : "cc", "memory"
245 ); 269 );
246 if (ret >= 0 && (func & CRYPT_Z990_FUNC_MASK)){ 270 if (ret >= 0 && (func & CRYPT_S390_FUNC_MASK)){
247 ret = src_len - ret; 271 ret = src_len - ret;
248 } 272 }
249 return ret; 273 return ret;
250} 274}
251 275
252/* 276/*
253 * Executes the KLMD (COMPUTE LAST MESSAGE DIGEST) operation of the z990 CPU. 277 * Executes the KLMD (COMPUTE LAST MESSAGE DIGEST) operation of the CPU.
254 * @param func: the function code passed to KM; see crypt_z990_klmd_func 278 * @param func: the function code passed to KM; see crypt_s390_klmd_func
255 * @param param: address of parameter block; see POP for details on each func 279 * @param param: address of parameter block; see POP for details on each func
256 * @param src: address of source memory area 280 * @param src: address of source memory area
257 * @param src_len: length of src operand in bytes 281 * @param src_len: length of src operand in bytes
@@ -259,9 +283,9 @@ crypt_z990_kimd(long func, void* param, const u8* src, long src_len)
259 * for digest funcs 283 * for digest funcs
260 */ 284 */
261static inline int 285static inline int
262crypt_z990_klmd(long func, void* param, const u8* src, long src_len) 286crypt_s390_klmd(long func, void* param, const u8* src, long src_len)
263{ 287{
264 register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK; 288 register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
265 register void* __param asm("1") = param; 289 register void* __param asm("1") = param;
266 register const u8* __src asm("2") = src; 290 register const u8* __src asm("2") = src;
267 register long __src_len asm("3") = src_len; 291 register long __src_len asm("3") = src_len;
@@ -269,17 +293,17 @@ crypt_z990_klmd(long func, void* param, const u8* src, long src_len)
269 293
270 ret = 0; 294 ret = 0;
271 __asm__ __volatile__ ( 295 __asm__ __volatile__ (
272 "0: .insn rre,0xB93F0000,%1,%1 \n" //KLMD opcode 296 "0: .insn rre,0xB93F0000,%1,%1 \n" /* KLMD opcode */
273 "1: brc 1,0b \n" /*handle partical completion of klmd*/ 297 "1: brc 1,0b \n" /* handle partical completion */
274 __crypt_z990_set_result 298 __crypt_s390_set_result
275 "6: \n" 299 "6: \n"
276 __crypt_z990_fixup 300 __crypt_s390_fixup
277 : "+d" (ret), "+a" (__src), [result] "+d" (__src_len) 301 : "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
278 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func), 302 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
279 "a" (__param) 303 "a" (__param)
280 : "cc", "memory" 304 : "cc", "memory"
281 ); 305 );
282 if (ret >= 0 && func & CRYPT_Z990_FUNC_MASK){ 306 if (ret >= 0 && func & CRYPT_S390_FUNC_MASK){
283 ret = src_len - ret; 307 ret = src_len - ret;
284 } 308 }
285 return ret; 309 return ret;
@@ -287,8 +311,8 @@ crypt_z990_klmd(long func, void* param, const u8* src, long src_len)
287 311
288/* 312/*
289 * Executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) operation 313 * Executes the KMAC (COMPUTE MESSAGE AUTHENTICATION CODE) operation
290 * of the z990 CPU. 314 * of the CPU.
291 * @param func: the function code passed to KM; see crypt_z990_klmd_func 315 * @param func: the function code passed to KM; see crypt_s390_klmd_func
292 * @param param: address of parameter block; see POP for details on each func 316 * @param param: address of parameter block; see POP for details on each func
293 * @param src: address of source memory area 317 * @param src: address of source memory area
294 * @param src_len: length of src operand in bytes 318 * @param src_len: length of src operand in bytes
@@ -296,9 +320,9 @@ crypt_z990_klmd(long func, void* param, const u8* src, long src_len)
296 * for digest funcs 320 * for digest funcs
297 */ 321 */
298static inline int 322static inline int
299crypt_z990_kmac(long func, void* param, const u8* src, long src_len) 323crypt_s390_kmac(long func, void* param, const u8* src, long src_len)
300{ 324{
301 register long __func asm("0") = func & CRYPT_Z990_FUNC_MASK; 325 register long __func asm("0") = func & CRYPT_S390_FUNC_MASK;
302 register void* __param asm("1") = param; 326 register void* __param asm("1") = param;
303 register const u8* __src asm("2") = src; 327 register const u8* __src asm("2") = src;
304 register long __src_len asm("3") = src_len; 328 register long __src_len asm("3") = src_len;
@@ -306,58 +330,58 @@ crypt_z990_kmac(long func, void* param, const u8* src, long src_len)
306 330
307 ret = 0; 331 ret = 0;
308 __asm__ __volatile__ ( 332 __asm__ __volatile__ (
309 "0: .insn rre,0xB91E0000,%5,%5 \n" //KMAC opcode 333 "0: .insn rre,0xB91E0000,%5,%5 \n" /* KMAC opcode */
310 "1: brc 1,0b \n" /*handle partical completion of klmd*/ 334 "1: brc 1,0b \n" /* handle partical completion */
311 __crypt_z990_set_result 335 __crypt_s390_set_result
312 "6: \n" 336 "6: \n"
313 __crypt_z990_fixup 337 __crypt_s390_fixup
314 : "+d" (ret), "+a" (__src), [result] "+d" (__src_len) 338 : "+d" (ret), "+a" (__src), [result] "+d" (__src_len)
315 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func), 339 : [e1] "K" (-EFAULT), [e2] "K" (-ENOSYS), "d" (__func),
316 "a" (__param) 340 "a" (__param)
317 : "cc", "memory" 341 : "cc", "memory"
318 ); 342 );
319 if (ret >= 0 && func & CRYPT_Z990_FUNC_MASK){ 343 if (ret >= 0 && func & CRYPT_S390_FUNC_MASK){
320 ret = src_len - ret; 344 ret = src_len - ret;
321 } 345 }
322 return ret; 346 return ret;
323} 347}
324 348
325/** 349/**
326 * Tests if a specific z990 crypto function is implemented on the machine. 350 * Tests if a specific crypto function is implemented on the machine.
327 * @param func: the function code of the specific function; 0 if op in general 351 * @param func: the function code of the specific function; 0 if op in general
328 * @return 1 if func available; 0 if func or op in general not available 352 * @return 1 if func available; 0 if func or op in general not available
329 */ 353 */
330static inline int 354static inline int
331crypt_z990_func_available(int func) 355crypt_s390_func_available(int func)
332{ 356{
333 int ret; 357 int ret;
334 358
335 struct crypt_z990_query_status status = { 359 struct crypt_s390_query_status status = {
336 .high = 0, 360 .high = 0,
337 .low = 0 361 .low = 0
338 }; 362 };
339 switch (func & CRYPT_Z990_OP_MASK){ 363 switch (func & CRYPT_S390_OP_MASK){
340 case CRYPT_Z990_KM: 364 case CRYPT_S390_KM:
341 ret = crypt_z990_km(KM_QUERY, &status, NULL, NULL, 0); 365 ret = crypt_s390_km(KM_QUERY, &status, NULL, NULL, 0);
342 break; 366 break;
343 case CRYPT_Z990_KMC: 367 case CRYPT_S390_KMC:
344 ret = crypt_z990_kmc(KMC_QUERY, &status, NULL, NULL, 0); 368 ret = crypt_s390_kmc(KMC_QUERY, &status, NULL, NULL, 0);
345 break; 369 break;
346 case CRYPT_Z990_KIMD: 370 case CRYPT_S390_KIMD:
347 ret = crypt_z990_kimd(KIMD_QUERY, &status, NULL, 0); 371 ret = crypt_s390_kimd(KIMD_QUERY, &status, NULL, 0);
348 break; 372 break;
349 case CRYPT_Z990_KLMD: 373 case CRYPT_S390_KLMD:
350 ret = crypt_z990_klmd(KLMD_QUERY, &status, NULL, 0); 374 ret = crypt_s390_klmd(KLMD_QUERY, &status, NULL, 0);
351 break; 375 break;
352 case CRYPT_Z990_KMAC: 376 case CRYPT_S390_KMAC:
353 ret = crypt_z990_kmac(KMAC_QUERY, &status, NULL, 0); 377 ret = crypt_s390_kmac(KMAC_QUERY, &status, NULL, 0);
354 break; 378 break;
355 default: 379 default:
356 ret = 0; 380 ret = 0;
357 return ret; 381 return ret;
358 } 382 }
359 if (ret >= 0){ 383 if (ret >= 0){
360 func &= CRYPT_Z990_FUNC_MASK; 384 func &= CRYPT_S390_FUNC_MASK;
361 func &= 0x7f; //mask modifier bit 385 func &= 0x7f; //mask modifier bit
362 if (func < 64){ 386 if (func < 64){
363 ret = (status.high >> (64 - func - 1)) & 0x1; 387 ret = (status.high >> (64 - func - 1)) & 0x1;
@@ -370,5 +394,4 @@ crypt_z990_func_available(int func)
370 return ret; 394 return ret;
371} 395}
372 396
373 397#endif // _CRYPTO_ARCH_S390_CRYPT_S390_H
374#endif // _CRYPTO_ARCH_S390_CRYPT_Z990_H
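crypt_s390_func_available() interprets the 16-byte QUERY status as a 128-bit facility map whose bit 0 is the most significant bit of .high. The visible hunk only shows the func < 64 branch; the low-word branch in this sketch is reconstructed under the same numbering assumption and is not quoted from the diff:

    #include <stdint.h>
    #include <stdio.h>

    struct query_status {
        uint64_t high;  /* facility bits 0..63, bit 0 = MSB */
        uint64_t low;   /* facility bits 64..127 */
    };

    static int func_available(const struct query_status *st, int func)
    {
        func &= 0x7f;   /* mask off the 0x80 decipher modifier bit */
        if (func < 64)
            return (int)((st->high >> (64 - func - 1)) & 0x1);
        return (int)((st->low >> (128 - func - 1)) & 0x1);  /* assumption */
    }

    int main(void)
    {
        /* Purely illustrative status with only KM_AES_128 (function
         * code 0x12) flagged as available. */
        struct query_status st = { .high = 1ULL << (64 - 0x12 - 1), .low = 0 };

        printf("AES-128 available: %d\n", func_available(&st, 0x12));
        return 0;
    }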
diff --git a/arch/s390/crypto/crypt_s390_query.c b/arch/s390/crypto/crypt_s390_query.c
new file mode 100644
index 000000000000..def02bdc44a4
--- /dev/null
+++ b/arch/s390/crypto/crypt_s390_query.c
@@ -0,0 +1,129 @@
1/*
2 * Cryptographic API.
3 *
4 * Support for s390 cryptographic instructions.
5 * Testing module for querying processor crypto capabilities.
6 *
7 * Copyright (c) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
8 * Author(s): Thomas Spatzier (tspat@de.ibm.com)
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 2 of the License, or (at your option)
13 * any later version.
14 *
15 */
16#include <linux/module.h>
17#include <linux/init.h>
18#include <linux/kernel.h>
19#include <asm/errno.h>
20#include "crypt_s390.h"
21
22static void query_available_functions(void)
23{
24 printk(KERN_INFO "#####################\n");
25
26 /* query available KM functions */
27 printk(KERN_INFO "KM_QUERY: %d\n",
28 crypt_s390_func_available(KM_QUERY));
29 printk(KERN_INFO "KM_DEA: %d\n",
30 crypt_s390_func_available(KM_DEA_ENCRYPT));
31 printk(KERN_INFO "KM_TDEA_128: %d\n",
32 crypt_s390_func_available(KM_TDEA_128_ENCRYPT));
33 printk(KERN_INFO "KM_TDEA_192: %d\n",
34 crypt_s390_func_available(KM_TDEA_192_ENCRYPT));
35 printk(KERN_INFO "KM_AES_128: %d\n",
36 crypt_s390_func_available(KM_AES_128_ENCRYPT));
37 printk(KERN_INFO "KM_AES_192: %d\n",
38 crypt_s390_func_available(KM_AES_192_ENCRYPT));
39 printk(KERN_INFO "KM_AES_256: %d\n",
40 crypt_s390_func_available(KM_AES_256_ENCRYPT));
41
42 /* query available KMC functions */
43 printk(KERN_INFO "KMC_QUERY: %d\n",
44 crypt_s390_func_available(KMC_QUERY));
45 printk(KERN_INFO "KMC_DEA: %d\n",
46 crypt_s390_func_available(KMC_DEA_ENCRYPT));
47 printk(KERN_INFO "KMC_TDEA_128: %d\n",
48 crypt_s390_func_available(KMC_TDEA_128_ENCRYPT));
49 printk(KERN_INFO "KMC_TDEA_192: %d\n",
50 crypt_s390_func_available(KMC_TDEA_192_ENCRYPT));
51 printk(KERN_INFO "KMC_AES_128: %d\n",
52 crypt_s390_func_available(KMC_AES_128_ENCRYPT));
53 printk(KERN_INFO "KMC_AES_192: %d\n",
54 crypt_s390_func_available(KMC_AES_192_ENCRYPT));
55 printk(KERN_INFO "KMC_AES_256: %d\n",
56 crypt_s390_func_available(KMC_AES_256_ENCRYPT));
57
 58 /* query available KIMD functions */
59 printk(KERN_INFO "KIMD_QUERY: %d\n",
60 crypt_s390_func_available(KIMD_QUERY));
61 printk(KERN_INFO "KIMD_SHA_1: %d\n",
62 crypt_s390_func_available(KIMD_SHA_1));
63 printk(KERN_INFO "KIMD_SHA_256: %d\n",
64 crypt_s390_func_available(KIMD_SHA_256));
65
66 /* query available KLMD functions */
67 printk(KERN_INFO "KLMD_QUERY: %d\n",
68 crypt_s390_func_available(KLMD_QUERY));
69 printk(KERN_INFO "KLMD_SHA_1: %d\n",
70 crypt_s390_func_available(KLMD_SHA_1));
71 printk(KERN_INFO "KLMD_SHA_256: %d\n",
72 crypt_s390_func_available(KLMD_SHA_256));
73
74 /* query available KMAC functions */
75 printk(KERN_INFO "KMAC_QUERY: %d\n",
76 crypt_s390_func_available(KMAC_QUERY));
77 printk(KERN_INFO "KMAC_DEA: %d\n",
78 crypt_s390_func_available(KMAC_DEA));
79 printk(KERN_INFO "KMAC_TDEA_128: %d\n",
80 crypt_s390_func_available(KMAC_TDEA_128));
81 printk(KERN_INFO "KMAC_TDEA_192: %d\n",
82 crypt_s390_func_available(KMAC_TDEA_192));
83}
84
85static int init(void)
86{
87 struct crypt_s390_query_status status = {
88 .high = 0,
89 .low = 0
90 };
91
92 printk(KERN_INFO "crypt_s390: querying available crypto functions\n");
93 crypt_s390_km(KM_QUERY, &status, NULL, NULL, 0);
94 printk(KERN_INFO "KM:\t%016llx %016llx\n",
95 (unsigned long long) status.high,
96 (unsigned long long) status.low);
97 status.high = status.low = 0;
98 crypt_s390_kmc(KMC_QUERY, &status, NULL, NULL, 0);
99 printk(KERN_INFO "KMC:\t%016llx %016llx\n",
100 (unsigned long long) status.high,
101 (unsigned long long) status.low);
102 status.high = status.low = 0;
103 crypt_s390_kimd(KIMD_QUERY, &status, NULL, 0);
104 printk(KERN_INFO "KIMD:\t%016llx %016llx\n",
105 (unsigned long long) status.high,
106 (unsigned long long) status.low);
107 status.high = status.low = 0;
108 crypt_s390_klmd(KLMD_QUERY, &status, NULL, 0);
109 printk(KERN_INFO "KLMD:\t%016llx %016llx\n",
110 (unsigned long long) status.high,
111 (unsigned long long) status.low);
112 status.high = status.low = 0;
113 crypt_s390_kmac(KMAC_QUERY, &status, NULL, 0);
114 printk(KERN_INFO "KMAC:\t%016llx %016llx\n",
115 (unsigned long long) status.high,
116 (unsigned long long) status.low);
117
118 query_available_functions();
119 return -ECANCELED;
120}
121
122static void __exit cleanup(void)
123{
124}
125
126module_init(init);
127module_exit(cleanup);
128
129MODULE_LICENSE("GPL");
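The module above is a pure probe: all the work happens in module_init(), which then deliberately returns an error (now -ECANCELED instead of the old bare -1) so the module is unloaded again rather than lingering. A minimal, hypothetical imitation of that probe-and-bail pattern:

    #include <linux/errno.h>
    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/module.h>

    static int __init probe_only_init(void)
    {
        printk(KERN_INFO "probe-only: work done in init, bailing out\n");
        return -ECANCELED;  /* "failing" init unloads the module again */
    }

    module_init(probe_only_init);
    MODULE_LICENSE("GPL");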
diff --git a/arch/s390/crypto/crypt_z990_query.c b/arch/s390/crypto/crypt_z990_query.c
deleted file mode 100644
index 7133983d1384..000000000000
--- a/arch/s390/crypto/crypt_z990_query.c
+++ /dev/null
@@ -1,111 +0,0 @@
1/*
2 * Cryptographic API.
3 *
4 * Support for z990 cryptographic instructions.
5 * Testing module for querying processor crypto capabilities.
6 *
7 * Copyright (c) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
8 * Author(s): Thomas Spatzier (tspat@de.ibm.com)
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 2 of the License, or (at your option)
13 * any later version.
14 *
15 */
16#include <linux/module.h>
17#include <linux/init.h>
18#include <linux/kernel.h>
19#include <asm/errno.h>
20#include "crypt_z990.h"
21
22static void
23query_available_functions(void)
24{
25 printk(KERN_INFO "#####################\n");
26 //query available KM functions
27 printk(KERN_INFO "KM_QUERY: %d\n",
28 crypt_z990_func_available(KM_QUERY));
29 printk(KERN_INFO "KM_DEA: %d\n",
30 crypt_z990_func_available(KM_DEA_ENCRYPT));
31 printk(KERN_INFO "KM_TDEA_128: %d\n",
32 crypt_z990_func_available(KM_TDEA_128_ENCRYPT));
33 printk(KERN_INFO "KM_TDEA_192: %d\n",
34 crypt_z990_func_available(KM_TDEA_192_ENCRYPT));
35 //query available KMC functions
36 printk(KERN_INFO "KMC_QUERY: %d\n",
37 crypt_z990_func_available(KMC_QUERY));
38 printk(KERN_INFO "KMC_DEA: %d\n",
39 crypt_z990_func_available(KMC_DEA_ENCRYPT));
40 printk(KERN_INFO "KMC_TDEA_128: %d\n",
41 crypt_z990_func_available(KMC_TDEA_128_ENCRYPT));
42 printk(KERN_INFO "KMC_TDEA_192: %d\n",
43 crypt_z990_func_available(KMC_TDEA_192_ENCRYPT));
44 //query available KIMD fucntions
45 printk(KERN_INFO "KIMD_QUERY: %d\n",
46 crypt_z990_func_available(KIMD_QUERY));
47 printk(KERN_INFO "KIMD_SHA_1: %d\n",
48 crypt_z990_func_available(KIMD_SHA_1));
49 //query available KLMD functions
50 printk(KERN_INFO "KLMD_QUERY: %d\n",
51 crypt_z990_func_available(KLMD_QUERY));
52 printk(KERN_INFO "KLMD_SHA_1: %d\n",
53 crypt_z990_func_available(KLMD_SHA_1));
54 //query available KMAC functions
55 printk(KERN_INFO "KMAC_QUERY: %d\n",
56 crypt_z990_func_available(KMAC_QUERY));
57 printk(KERN_INFO "KMAC_DEA: %d\n",
58 crypt_z990_func_available(KMAC_DEA));
59 printk(KERN_INFO "KMAC_TDEA_128: %d\n",
60 crypt_z990_func_available(KMAC_TDEA_128));
61 printk(KERN_INFO "KMAC_TDEA_192: %d\n",
62 crypt_z990_func_available(KMAC_TDEA_192));
63}
64
65static int
66init(void)
67{
68 struct crypt_z990_query_status status = {
69 .high = 0,
70 .low = 0
71 };
72
73 printk(KERN_INFO "crypt_z990: querying available crypto functions\n");
74 crypt_z990_km(KM_QUERY, &status, NULL, NULL, 0);
75 printk(KERN_INFO "KM: %016llx %016llx\n",
76 (unsigned long long) status.high,
77 (unsigned long long) status.low);
78 status.high = status.low = 0;
79 crypt_z990_kmc(KMC_QUERY, &status, NULL, NULL, 0);
80 printk(KERN_INFO "KMC: %016llx %016llx\n",
81 (unsigned long long) status.high,
82 (unsigned long long) status.low);
83 status.high = status.low = 0;
84 crypt_z990_kimd(KIMD_QUERY, &status, NULL, 0);
85 printk(KERN_INFO "KIMD: %016llx %016llx\n",
86 (unsigned long long) status.high,
87 (unsigned long long) status.low);
88 status.high = status.low = 0;
89 crypt_z990_klmd(KLMD_QUERY, &status, NULL, 0);
90 printk(KERN_INFO "KLMD: %016llx %016llx\n",
91 (unsigned long long) status.high,
92 (unsigned long long) status.low);
93 status.high = status.low = 0;
94 crypt_z990_kmac(KMAC_QUERY, &status, NULL, 0);
95 printk(KERN_INFO "KMAC: %016llx %016llx\n",
96 (unsigned long long) status.high,
97 (unsigned long long) status.low);
98
99 query_available_functions();
100 return -1;
101}
102
103static void __exit
104cleanup(void)
105{
106}
107
108module_init(init);
109module_exit(cleanup);
110
111MODULE_LICENSE("GPL");
diff --git a/arch/s390/crypto/des_z990.c b/arch/s390/crypto/des_s390.c
index 813cf37b1177..a38bb2a3eef6 100644
--- a/arch/s390/crypto/des_z990.c
+++ b/arch/s390/crypto/des_s390.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Cryptographic API. 2 * Cryptographic API.
3 * 3 *
4 * z990 implementation of the DES Cipher Algorithm. 4 * s390 implementation of the DES Cipher Algorithm.
5 * 5 *
6 * Copyright (c) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation 6 * Copyright (c) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
7 * Author(s): Thomas Spatzier (tspat@de.ibm.com) 7 * Author(s): Thomas Spatzier (tspat@de.ibm.com)
@@ -19,7 +19,7 @@
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <asm/scatterlist.h> 20#include <asm/scatterlist.h>
21#include <linux/crypto.h> 21#include <linux/crypto.h>
22#include "crypt_z990.h" 22#include "crypt_s390.h"
23#include "crypto_des.h" 23#include "crypto_des.h"
24 24
25#define DES_BLOCK_SIZE 8 25#define DES_BLOCK_SIZE 8
@@ -31,17 +31,17 @@
31#define DES3_192_KEY_SIZE (3 * DES_KEY_SIZE) 31#define DES3_192_KEY_SIZE (3 * DES_KEY_SIZE)
32#define DES3_192_BLOCK_SIZE DES_BLOCK_SIZE 32#define DES3_192_BLOCK_SIZE DES_BLOCK_SIZE
33 33
34struct crypt_z990_des_ctx { 34struct crypt_s390_des_ctx {
35 u8 iv[DES_BLOCK_SIZE]; 35 u8 iv[DES_BLOCK_SIZE];
36 u8 key[DES_KEY_SIZE]; 36 u8 key[DES_KEY_SIZE];
37}; 37};
38 38
39struct crypt_z990_des3_128_ctx { 39struct crypt_s390_des3_128_ctx {
40 u8 iv[DES_BLOCK_SIZE]; 40 u8 iv[DES_BLOCK_SIZE];
41 u8 key[DES3_128_KEY_SIZE]; 41 u8 key[DES3_128_KEY_SIZE];
42}; 42};
43 43
44struct crypt_z990_des3_192_ctx { 44struct crypt_s390_des3_192_ctx {
45 u8 iv[DES_BLOCK_SIZE]; 45 u8 iv[DES_BLOCK_SIZE];
46 u8 key[DES3_192_KEY_SIZE]; 46 u8 key[DES3_192_KEY_SIZE];
47}; 47};
@@ -49,7 +49,7 @@ struct crypt_z990_des3_192_ctx {
49static int 49static int
50des_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags) 50des_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
51{ 51{
52 struct crypt_z990_des_ctx *dctx; 52 struct crypt_s390_des_ctx *dctx;
53 int ret; 53 int ret;
54 54
55 dctx = ctx; 55 dctx = ctx;
@@ -65,26 +65,26 @@ des_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
65static void 65static void
66des_encrypt(void *ctx, u8 *dst, const u8 *src) 66des_encrypt(void *ctx, u8 *dst, const u8 *src)
67{ 67{
68 struct crypt_z990_des_ctx *dctx; 68 struct crypt_s390_des_ctx *dctx;
69 69
70 dctx = ctx; 70 dctx = ctx;
71 crypt_z990_km(KM_DEA_ENCRYPT, dctx->key, dst, src, DES_BLOCK_SIZE); 71 crypt_s390_km(KM_DEA_ENCRYPT, dctx->key, dst, src, DES_BLOCK_SIZE);
72} 72}
73 73
74static void 74static void
75des_decrypt(void *ctx, u8 *dst, const u8 *src) 75des_decrypt(void *ctx, u8 *dst, const u8 *src)
76{ 76{
77 struct crypt_z990_des_ctx *dctx; 77 struct crypt_s390_des_ctx *dctx;
78 78
79 dctx = ctx; 79 dctx = ctx;
80 crypt_z990_km(KM_DEA_DECRYPT, dctx->key, dst, src, DES_BLOCK_SIZE); 80 crypt_s390_km(KM_DEA_DECRYPT, dctx->key, dst, src, DES_BLOCK_SIZE);
81} 81}
82 82
83static struct crypto_alg des_alg = { 83static struct crypto_alg des_alg = {
84 .cra_name = "des", 84 .cra_name = "des",
85 .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 85 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
86 .cra_blocksize = DES_BLOCK_SIZE, 86 .cra_blocksize = DES_BLOCK_SIZE,
87 .cra_ctxsize = sizeof(struct crypt_z990_des_ctx), 87 .cra_ctxsize = sizeof(struct crypt_s390_des_ctx),
88 .cra_module = THIS_MODULE, 88 .cra_module = THIS_MODULE,
89 .cra_list = LIST_HEAD_INIT(des_alg.cra_list), 89 .cra_list = LIST_HEAD_INIT(des_alg.cra_list),
90 .cra_u = { .cipher = { 90 .cra_u = { .cipher = {
@@ -111,7 +111,7 @@ static int
111des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags) 111des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
112{ 112{
113 int i, ret; 113 int i, ret;
114 struct crypt_z990_des3_128_ctx *dctx; 114 struct crypt_s390_des3_128_ctx *dctx;
115 const u8* temp_key = key; 115 const u8* temp_key = key;
116 116
117 dctx = ctx; 117 dctx = ctx;
@@ -132,20 +132,20 @@ des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
132static void 132static void
133des3_128_encrypt(void *ctx, u8 *dst, const u8 *src) 133des3_128_encrypt(void *ctx, u8 *dst, const u8 *src)
134{ 134{
135 struct crypt_z990_des3_128_ctx *dctx; 135 struct crypt_s390_des3_128_ctx *dctx;
136 136
137 dctx = ctx; 137 dctx = ctx;
138 crypt_z990_km(KM_TDEA_128_ENCRYPT, dctx->key, dst, (void*)src, 138 crypt_s390_km(KM_TDEA_128_ENCRYPT, dctx->key, dst, (void*)src,
139 DES3_128_BLOCK_SIZE); 139 DES3_128_BLOCK_SIZE);
140} 140}
141 141
142static void 142static void
143des3_128_decrypt(void *ctx, u8 *dst, const u8 *src) 143des3_128_decrypt(void *ctx, u8 *dst, const u8 *src)
144{ 144{
145 struct crypt_z990_des3_128_ctx *dctx; 145 struct crypt_s390_des3_128_ctx *dctx;
146 146
147 dctx = ctx; 147 dctx = ctx;
148 crypt_z990_km(KM_TDEA_128_DECRYPT, dctx->key, dst, (void*)src, 148 crypt_s390_km(KM_TDEA_128_DECRYPT, dctx->key, dst, (void*)src,
149 DES3_128_BLOCK_SIZE); 149 DES3_128_BLOCK_SIZE);
150} 150}
151 151
@@ -153,7 +153,7 @@ static struct crypto_alg des3_128_alg = {
153 .cra_name = "des3_ede128", 153 .cra_name = "des3_ede128",
154 .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 154 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
155 .cra_blocksize = DES3_128_BLOCK_SIZE, 155 .cra_blocksize = DES3_128_BLOCK_SIZE,
156 .cra_ctxsize = sizeof(struct crypt_z990_des3_128_ctx), 156 .cra_ctxsize = sizeof(struct crypt_s390_des3_128_ctx),
157 .cra_module = THIS_MODULE, 157 .cra_module = THIS_MODULE,
158 .cra_list = LIST_HEAD_INIT(des3_128_alg.cra_list), 158 .cra_list = LIST_HEAD_INIT(des3_128_alg.cra_list),
159 .cra_u = { .cipher = { 159 .cra_u = { .cipher = {
@@ -181,7 +181,7 @@ static int
181des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags) 181des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
182{ 182{
183 int i, ret; 183 int i, ret;
184 struct crypt_z990_des3_192_ctx *dctx; 184 struct crypt_s390_des3_192_ctx *dctx;
185 const u8* temp_key; 185 const u8* temp_key;
186 186
187 dctx = ctx; 187 dctx = ctx;
@@ -206,20 +206,20 @@ des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen, u32 *flags)
206static void 206static void
207des3_192_encrypt(void *ctx, u8 *dst, const u8 *src) 207des3_192_encrypt(void *ctx, u8 *dst, const u8 *src)
208{ 208{
209 struct crypt_z990_des3_192_ctx *dctx; 209 struct crypt_s390_des3_192_ctx *dctx;
210 210
211 dctx = ctx; 211 dctx = ctx;
212 crypt_z990_km(KM_TDEA_192_ENCRYPT, dctx->key, dst, (void*)src, 212 crypt_s390_km(KM_TDEA_192_ENCRYPT, dctx->key, dst, (void*)src,
213 DES3_192_BLOCK_SIZE); 213 DES3_192_BLOCK_SIZE);
214} 214}
215 215
216static void 216static void
217des3_192_decrypt(void *ctx, u8 *dst, const u8 *src) 217des3_192_decrypt(void *ctx, u8 *dst, const u8 *src)
218{ 218{
219 struct crypt_z990_des3_192_ctx *dctx; 219 struct crypt_s390_des3_192_ctx *dctx;
220 220
221 dctx = ctx; 221 dctx = ctx;
222 crypt_z990_km(KM_TDEA_192_DECRYPT, dctx->key, dst, (void*)src, 222 crypt_s390_km(KM_TDEA_192_DECRYPT, dctx->key, dst, (void*)src,
223 DES3_192_BLOCK_SIZE); 223 DES3_192_BLOCK_SIZE);
224} 224}
225 225
@@ -227,7 +227,7 @@ static struct crypto_alg des3_192_alg = {
227 .cra_name = "des3_ede", 227 .cra_name = "des3_ede",
228 .cra_flags = CRYPTO_ALG_TYPE_CIPHER, 228 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
229 .cra_blocksize = DES3_192_BLOCK_SIZE, 229 .cra_blocksize = DES3_192_BLOCK_SIZE,
230 .cra_ctxsize = sizeof(struct crypt_z990_des3_192_ctx), 230 .cra_ctxsize = sizeof(struct crypt_s390_des3_192_ctx),
231 .cra_module = THIS_MODULE, 231 .cra_module = THIS_MODULE,
232 .cra_list = LIST_HEAD_INIT(des3_192_alg.cra_list), 232 .cra_list = LIST_HEAD_INIT(des3_192_alg.cra_list),
233 .cra_u = { .cipher = { 233 .cra_u = { .cipher = {
@@ -245,9 +245,9 @@ init(void)
245{ 245{
246 int ret; 246 int ret;
247 247
248 if (!crypt_z990_func_available(KM_DEA_ENCRYPT) || 248 if (!crypt_s390_func_available(KM_DEA_ENCRYPT) ||
249 !crypt_z990_func_available(KM_TDEA_128_ENCRYPT) || 249 !crypt_s390_func_available(KM_TDEA_128_ENCRYPT) ||
250 !crypt_z990_func_available(KM_TDEA_192_ENCRYPT)){ 250 !crypt_s390_func_available(KM_TDEA_192_ENCRYPT)){
251 return -ENOSYS; 251 return -ENOSYS;
252 } 252 }
253 253
@@ -262,7 +262,7 @@ init(void)
262 return -EEXIST; 262 return -EEXIST;
263 } 263 }
264 264
265 printk(KERN_INFO "crypt_z990: des_z990 loaded.\n"); 265 printk(KERN_INFO "crypt_s390: des_s390 loaded.\n");
266 return 0; 266 return 0;
267} 267}
268 268
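
The z990 -> s390 renames above are mechanical, but the init() hunks also show the gating pattern these modules share: probe each required CP Assist function once, return -ENOSYS if any is missing, and only then register the algorithms, unwinding the earlier registrations when a later one fails (the unwind lines fall outside the hunk). Below is a standalone user-space sketch of that register-with-rollback shape; every name in it is illustrative, not kernel API.

/* Standalone sketch (not kernel code): register a list of providers,
 * rolling back the ones already registered if a later one fails.
 * register_alg()/unregister_alg() are illustrative stand-ins. */
#include <stdio.h>

struct alg { const char *name; int registered; };

static int register_alg(struct alg *a)    { a->registered = 1; return 0; }
static void unregister_alg(struct alg *a) { a->registered = 0; }

int main(void)
{
	struct alg algs[] = { { "des" }, { "des3_ede128" }, { "des3_ede" } };
	int i, n = sizeof(algs) / sizeof(algs[0]);

	for (i = 0; i < n; i++) {
		if (register_alg(&algs[i])) {
			while (i-- > 0)		/* unwind earlier successes */
				unregister_alg(&algs[i]);
			return 1;	/* init() above returns -EEXIST here */
		}
	}
	for (i = 0; i < n; i++)
		printf("%s registered\n", algs[i].name);
	return 0;
}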
diff --git a/arch/s390/crypto/sha1_z990.c b/arch/s390/crypto/sha1_s390.c
index 298174ddf5b1..98c896b86dcd 100644
--- a/arch/s390/crypto/sha1_z990.c
+++ b/arch/s390/crypto/sha1_s390.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Cryptographic API. 2 * Cryptographic API.
3 * 3 *
4 * z990 implementation of the SHA1 Secure Hash Algorithm. 4 * s390 implementation of the SHA1 Secure Hash Algorithm.
5 * 5 *
6 * Derived from cryptoapi implementation, adapted for in-place 6 * Derived from cryptoapi implementation, adapted for in-place
7 * scatterlist interface. Originally based on the public domain 7 * scatterlist interface. Originally based on the public domain
@@ -28,22 +28,22 @@
28#include <linux/crypto.h> 28#include <linux/crypto.h>
29#include <asm/scatterlist.h> 29#include <asm/scatterlist.h>
30#include <asm/byteorder.h> 30#include <asm/byteorder.h>
31#include "crypt_z990.h" 31#include "crypt_s390.h"
32 32
33#define SHA1_DIGEST_SIZE 20 33#define SHA1_DIGEST_SIZE 20
34#define SHA1_BLOCK_SIZE 64 34#define SHA1_BLOCK_SIZE 64
35 35
36struct crypt_z990_sha1_ctx { 36struct crypt_s390_sha1_ctx {
37 u64 count; 37 u64 count;
38 u32 state[5]; 38 u32 state[5];
39 u32 buf_len; 39 u32 buf_len;
40 u8 buffer[2 * SHA1_BLOCK_SIZE]; 40 u8 buffer[2 * SHA1_BLOCK_SIZE];
41}; 41};
42 42
43static void 43static void
44sha1_init(void *ctx) 44sha1_init(void *ctx)
45{ 45{
46 static const struct crypt_z990_sha1_ctx initstate = { 46 static const struct crypt_s390_sha1_ctx initstate = {
47 .state = { 47 .state = {
48 0x67452301, 48 0x67452301,
49 0xEFCDAB89, 49 0xEFCDAB89,
@@ -58,7 +58,7 @@ sha1_init(void *ctx)
58static void 58static void
59sha1_update(void *ctx, const u8 *data, unsigned int len) 59sha1_update(void *ctx, const u8 *data, unsigned int len)
60{ 60{
61 struct crypt_z990_sha1_ctx *sctx; 61 struct crypt_s390_sha1_ctx *sctx;
62 long imd_len; 62 long imd_len;
63 63
64 sctx = ctx; 64 sctx = ctx;
@@ -69,7 +69,7 @@ sha1_update(void *ctx, const u8 *data, unsigned int len)
69 //complete full block and hash 69 //complete full block and hash
70 memcpy(sctx->buffer + sctx->buf_len, data, 70 memcpy(sctx->buffer + sctx->buf_len, data,
71 SHA1_BLOCK_SIZE - sctx->buf_len); 71 SHA1_BLOCK_SIZE - sctx->buf_len);
72 crypt_z990_kimd(KIMD_SHA_1, sctx->state, sctx->buffer, 72 crypt_s390_kimd(KIMD_SHA_1, sctx->state, sctx->buffer,
73 SHA1_BLOCK_SIZE); 73 SHA1_BLOCK_SIZE);
74 data += SHA1_BLOCK_SIZE - sctx->buf_len; 74 data += SHA1_BLOCK_SIZE - sctx->buf_len;
75 len -= SHA1_BLOCK_SIZE - sctx->buf_len; 75 len -= SHA1_BLOCK_SIZE - sctx->buf_len;
@@ -79,7 +79,7 @@ sha1_update(void *ctx, const u8 *data, unsigned int len)
79 //rest of data contains full blocks? 79 //rest of data contains full blocks?
80 imd_len = len & ~0x3ful; 80 imd_len = len & ~0x3ful;
81 if (imd_len){ 81 if (imd_len){
82 crypt_z990_kimd(KIMD_SHA_1, sctx->state, data, imd_len); 82 crypt_s390_kimd(KIMD_SHA_1, sctx->state, data, imd_len);
83 data += imd_len; 83 data += imd_len;
84 len -= imd_len; 84 len -= imd_len;
85 } 85 }
@@ -92,7 +92,7 @@ sha1_update(void *ctx, const u8 *data, unsigned int len)
92 92
93 93
94static void 94static void
95pad_message(struct crypt_z990_sha1_ctx* sctx) 95pad_message(struct crypt_s390_sha1_ctx* sctx)
96{ 96{
97 int index; 97 int index;
98 98
@@ -113,11 +113,11 @@ pad_message(struct crypt_z990_sha1_ctx* sctx)
113static void 113static void
114sha1_final(void* ctx, u8 *out) 114sha1_final(void* ctx, u8 *out)
115{ 115{
116 struct crypt_z990_sha1_ctx *sctx = ctx; 116 struct crypt_s390_sha1_ctx *sctx = ctx;
117 117
118 //must perform manual padding 118 //must perform manual padding
119 pad_message(sctx); 119 pad_message(sctx);
120 crypt_z990_kimd(KIMD_SHA_1, sctx->state, sctx->buffer, sctx->buf_len); 120 crypt_s390_kimd(KIMD_SHA_1, sctx->state, sctx->buffer, sctx->buf_len);
121 //copy digest to out 121 //copy digest to out
122 memcpy(out, sctx->state, SHA1_DIGEST_SIZE); 122 memcpy(out, sctx->state, SHA1_DIGEST_SIZE);
123 /* Wipe context */ 123 /* Wipe context */
@@ -128,7 +128,7 @@ static struct crypto_alg alg = {
128 .cra_name = "sha1", 128 .cra_name = "sha1",
129 .cra_flags = CRYPTO_ALG_TYPE_DIGEST, 129 .cra_flags = CRYPTO_ALG_TYPE_DIGEST,
130 .cra_blocksize = SHA1_BLOCK_SIZE, 130 .cra_blocksize = SHA1_BLOCK_SIZE,
131 .cra_ctxsize = sizeof(struct crypt_z990_sha1_ctx), 131 .cra_ctxsize = sizeof(struct crypt_s390_sha1_ctx),
132 .cra_module = THIS_MODULE, 132 .cra_module = THIS_MODULE,
133 .cra_list = LIST_HEAD_INIT(alg.cra_list), 133 .cra_list = LIST_HEAD_INIT(alg.cra_list),
134 .cra_u = { .digest = { 134 .cra_u = { .digest = {
@@ -143,10 +143,10 @@ init(void)
143{ 143{
144 int ret = -ENOSYS; 144 int ret = -ENOSYS;
145 145
146 if (crypt_z990_func_available(KIMD_SHA_1)){ 146 if (crypt_s390_func_available(KIMD_SHA_1)){
147 ret = crypto_register_alg(&alg); 147 ret = crypto_register_alg(&alg);
148 if (ret == 0){ 148 if (ret == 0){
149 printk(KERN_INFO "crypt_z990: sha1_z990 loaded.\n"); 149 printk(KERN_INFO "crypt_s390: sha1_s390 loaded.\n");
150 } 150 }
151 } 151 }
152 return ret; 152 return ret;
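
sha1_update() above is the standard streaming-hash bookkeeping: top up a buffered partial 64-byte block from the incoming data, pass the largest 64-byte-aligned middle chunk to the hardware in one KIMD call (imd_len = len & ~0x3ful), and stash the tail for next time. A standalone sketch of just that arithmetic, with fake_kimd() standing in for crypt_s390_kimd():

/* Standalone sketch (not kernel code) of the buffering arithmetic in
 * sha1_update(): complete a partial block, hash the aligned middle in
 * a single call, buffer the remainder. */
#include <stdio.h>
#include <string.h>

#define BLK 64

static unsigned char buf[BLK];
static unsigned long buf_len;

static void fake_kimd(const unsigned char *p, unsigned long len)
{
	printf("hash %lu bytes (%lu block(s))\n", len, len / BLK);
}

static void update(const unsigned char *data, unsigned long len)
{
	if (buf_len + len >= BLK) {	/* buffered partial block completes */
		unsigned long fill = BLK - buf_len;
		memcpy(buf + buf_len, data, fill);
		fake_kimd(buf, BLK);
		data += fill;
		len -= fill;
		buf_len = 0;
	}
	unsigned long imd_len = len & ~0x3ful;	/* whole blocks in one call */
	if (imd_len) {
		fake_kimd(data, imd_len);
		data += imd_len;
		len -= imd_len;
	}
	if (len) {			/* keep the tail for the next call */
		memcpy(buf + buf_len, data, len);
		buf_len += len;
	}
}

int main(void)
{
	unsigned char msg[200] = { 0 };
	update(msg, 30);	/* only buffered */
	update(msg, 170);	/* completes 1 block, hashes 2 more, buffers 8 */
	return 0;
}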
diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c
new file mode 100644
index 000000000000..b75bdbd476c7
--- /dev/null
+++ b/arch/s390/crypto/sha256_s390.c
@@ -0,0 +1,151 @@
1/*
2 * Cryptographic API.
3 *
4 * s390 implementation of the SHA256 Secure Hash Algorithm.
5 *
6 * s390 Version:
7 * Copyright (C) 2005 IBM Deutschland GmbH, IBM Corporation
8 * Author(s): Jan Glauber (jang@de.ibm.com)
9 *
10 * Derived from "crypto/sha256.c"
11 * and "arch/s390/crypto/sha1_s390.c"
12 *
13 * This program is free software; you can redistribute it and/or modify it
14 * under the terms of the GNU General Public License as published by the Free
15 * Software Foundation; either version 2 of the License, or (at your option)
16 * any later version.
17 *
18 */
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/crypto.h>
22
23#include "crypt_s390.h"
24
25#define SHA256_DIGEST_SIZE 32
26#define SHA256_BLOCK_SIZE 64
27
28struct s390_sha256_ctx {
29 u64 count;
30 u32 state[8];
31 u8 buf[2 * SHA256_BLOCK_SIZE];
32};
33
34static void sha256_init(void *ctx)
35{
36 struct s390_sha256_ctx *sctx = ctx;
37
38 sctx->state[0] = 0x6a09e667;
39 sctx->state[1] = 0xbb67ae85;
40 sctx->state[2] = 0x3c6ef372;
41 sctx->state[3] = 0xa54ff53a;
42 sctx->state[4] = 0x510e527f;
43 sctx->state[5] = 0x9b05688c;
44 sctx->state[6] = 0x1f83d9ab;
45 sctx->state[7] = 0x5be0cd19;
46 sctx->count = 0;
47 memset(sctx->buf, 0, sizeof(sctx->buf));
48}
49
50static void sha256_update(void *ctx, const u8 *data, unsigned int len)
51{
52 struct s390_sha256_ctx *sctx = ctx;
53 unsigned int index;
54
55 /* how much is already in the buffer? */
56 index = sctx->count / 8 & 0x3f;
57
58 /* update message bit length */
59 sctx->count += len * 8;
60
61 /* process one block */
62 if ((index + len) >= SHA256_BLOCK_SIZE) {
63 memcpy(sctx->buf + index, data, SHA256_BLOCK_SIZE - index);
64 crypt_s390_kimd(KIMD_SHA_256, sctx->state, sctx->buf,
65 SHA256_BLOCK_SIZE);
66 data += SHA256_BLOCK_SIZE - index;
67 len -= SHA256_BLOCK_SIZE - index;
68 }
69
70 /* anything left? */
71 if (len)
72		memcpy(sctx->buf + index, data, len);
73}
74
75static void pad_message(struct s390_sha256_ctx* sctx)
76{
77 int index, end;
78
79 index = sctx->count / 8 & 0x3f;
80 end = index < 56 ? SHA256_BLOCK_SIZE : 2 * SHA256_BLOCK_SIZE;
81
82 /* start pad with 1 */
83 sctx->buf[index] = 0x80;
84
85 /* pad with zeros */
86 index++;
87 memset(sctx->buf + index, 0x00, end - index - 8);
88
89 /* append message length */
90 memcpy(sctx->buf + end - 8, &sctx->count, sizeof sctx->count);
91
92 sctx->count = end * 8;
93}
94
95/* Add padding and return the message digest */
96static void sha256_final(void* ctx, u8 *out)
97{
98 struct s390_sha256_ctx *sctx = ctx;
99
100 /* must perform manual padding */
101 pad_message(sctx);
102
103 crypt_s390_kimd(KIMD_SHA_256, sctx->state, sctx->buf,
104 sctx->count / 8);
105
106 /* copy digest to out */
107 memcpy(out, sctx->state, SHA256_DIGEST_SIZE);
108
109 /* wipe context */
110 memset(sctx, 0, sizeof *sctx);
111}
112
113static struct crypto_alg alg = {
114 .cra_name = "sha256",
115 .cra_flags = CRYPTO_ALG_TYPE_DIGEST,
116 .cra_blocksize = SHA256_BLOCK_SIZE,
117 .cra_ctxsize = sizeof(struct s390_sha256_ctx),
118 .cra_module = THIS_MODULE,
119 .cra_list = LIST_HEAD_INIT(alg.cra_list),
120 .cra_u = { .digest = {
121 .dia_digestsize = SHA256_DIGEST_SIZE,
122 .dia_init = sha256_init,
123 .dia_update = sha256_update,
124 .dia_final = sha256_final } }
125};
126
127static int init(void)
128{
129 int ret;
130
131 if (!crypt_s390_func_available(KIMD_SHA_256))
132 return -ENOSYS;
133
134 ret = crypto_register_alg(&alg);
135 if (ret != 0)
136		printk(KERN_INFO "crypt_s390: sha256_s390 couldn't be loaded.\n");
137 return ret;
138}
139
140static void __exit fini(void)
141{
142 crypto_unregister_alg(&alg);
143}
144
145module_init(init);
146module_exit(fini);
147
148MODULE_ALIAS("sha256");
149
150MODULE_LICENSE("GPL");
151MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm");
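
Note that sctx->count holds bits while pad_message() above works in bytes: index is the number of buffered bytes mod 64, and since the trailing length field needs the last 8 bytes of a block, an index of 56 or more forces the padding to spill into a second 64-byte block. A standalone sketch of that boundary arithmetic:

/* Standalone sketch (not kernel code) of the boundary arithmetic in
 * pad_message(): where the 0x80 byte, the zero fill and the 8-byte
 * length land for a few buffer fill levels. */
#include <stdio.h>

int main(void)
{
	unsigned long long byte_counts[] = { 0, 55, 56, 64 };

	for (int i = 0; i < 4; i++) {
		unsigned long long count = byte_counts[i] * 8;	/* bits */
		int index = count / 8 & 0x3f;	/* buffered bytes mod 64 */
		int end = index < 56 ? 64 : 128;	/* one block or two */

		printf("%3llu bytes buffered: 0x80 at %d, zeros [%d..%d), "
		       "length at %d\n",
		       count / 8, index, index + 1, end - 8, end - 8);
	}
	return 0;
}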
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index 45d44c6bb39d..7d23edc6facb 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -1,12 +1,12 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.14-rc1 3# Linux kernel version: 2.6.15-rc2
4# Wed Sep 14 16:46:19 2005 4# Mon Nov 21 13:51:30 2005
5# 5#
6CONFIG_MMU=y 6CONFIG_MMU=y
7CONFIG_RWSEM_XCHGADD_ALGORITHM=y 7CONFIG_RWSEM_XCHGADD_ALGORITHM=y
8CONFIG_GENERIC_CALIBRATE_DELAY=y 8CONFIG_GENERIC_CALIBRATE_DELAY=y
9CONFIG_ARCH_S390=y 9CONFIG_S390=y
10CONFIG_UID16=y 10CONFIG_UID16=y
11 11
12# 12#
@@ -65,15 +65,31 @@ CONFIG_KMOD=y
65CONFIG_STOP_MACHINE=y 65CONFIG_STOP_MACHINE=y
66 66
67# 67#
68# Block layer
69#
70# CONFIG_LBD is not set
71
72#
73# IO Schedulers
74#
75CONFIG_IOSCHED_NOOP=y
76CONFIG_IOSCHED_AS=y
77CONFIG_IOSCHED_DEADLINE=y
78CONFIG_IOSCHED_CFQ=y
79CONFIG_DEFAULT_AS=y
80# CONFIG_DEFAULT_DEADLINE is not set
81# CONFIG_DEFAULT_CFQ is not set
82# CONFIG_DEFAULT_NOOP is not set
83CONFIG_DEFAULT_IOSCHED="anticipatory"
84
85#
68# Base setup 86# Base setup
69# 87#
70 88
71# 89#
72# Processor type and features 90# Processor type and features
73# 91#
74# CONFIG_ARCH_S390X is not set
75# CONFIG_64BIT is not set 92# CONFIG_64BIT is not set
76CONFIG_ARCH_S390_31=y
77CONFIG_SMP=y 93CONFIG_SMP=y
78CONFIG_NR_CPUS=32 94CONFIG_NR_CPUS=32
79CONFIG_HOTPLUG_CPU=y 95CONFIG_HOTPLUG_CPU=y
@@ -97,6 +113,7 @@ CONFIG_FLATMEM_MANUAL=y
97CONFIG_FLATMEM=y 113CONFIG_FLATMEM=y
98CONFIG_FLAT_NODE_MEM_MAP=y 114CONFIG_FLAT_NODE_MEM_MAP=y
99# CONFIG_SPARSEMEM_STATIC is not set 115# CONFIG_SPARSEMEM_STATIC is not set
116CONFIG_SPLIT_PTLOCK_CPUS=4
100 117
101# 118#
102# I/O subsystem configuration 119# I/O subsystem configuration
@@ -188,10 +205,18 @@ CONFIG_IPV6=y
188# CONFIG_NET_DIVERT is not set 205# CONFIG_NET_DIVERT is not set
189# CONFIG_ECONET is not set 206# CONFIG_ECONET is not set
190# CONFIG_WAN_ROUTER is not set 207# CONFIG_WAN_ROUTER is not set
208
209#
210# QoS and/or fair queueing
211#
191CONFIG_NET_SCHED=y 212CONFIG_NET_SCHED=y
192CONFIG_NET_SCH_CLK_JIFFIES=y 213CONFIG_NET_SCH_CLK_JIFFIES=y
193# CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set 214# CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set
194# CONFIG_NET_SCH_CLK_CPU is not set 215# CONFIG_NET_SCH_CLK_CPU is not set
216
217#
218# Queueing/Scheduling
219#
195CONFIG_NET_SCH_CBQ=m 220CONFIG_NET_SCH_CBQ=m
196# CONFIG_NET_SCH_HTB is not set 221# CONFIG_NET_SCH_HTB is not set
197# CONFIG_NET_SCH_HFSC is not set 222# CONFIG_NET_SCH_HFSC is not set
@@ -204,8 +229,10 @@ CONFIG_NET_SCH_GRED=m
204CONFIG_NET_SCH_DSMARK=m 229CONFIG_NET_SCH_DSMARK=m
205# CONFIG_NET_SCH_NETEM is not set 230# CONFIG_NET_SCH_NETEM is not set
206# CONFIG_NET_SCH_INGRESS is not set 231# CONFIG_NET_SCH_INGRESS is not set
207CONFIG_NET_QOS=y 232
208CONFIG_NET_ESTIMATOR=y 233#
234# Classification
235#
209CONFIG_NET_CLS=y 236CONFIG_NET_CLS=y
210# CONFIG_NET_CLS_BASIC is not set 237# CONFIG_NET_CLS_BASIC is not set
211CONFIG_NET_CLS_TCINDEX=m 238CONFIG_NET_CLS_TCINDEX=m
@@ -214,18 +241,18 @@ CONFIG_NET_CLS_ROUTE=y
214CONFIG_NET_CLS_FW=m 241CONFIG_NET_CLS_FW=m
215CONFIG_NET_CLS_U32=m 242CONFIG_NET_CLS_U32=m
216# CONFIG_CLS_U32_PERF is not set 243# CONFIG_CLS_U32_PERF is not set
217# CONFIG_NET_CLS_IND is not set
218CONFIG_NET_CLS_RSVP=m 244CONFIG_NET_CLS_RSVP=m
219CONFIG_NET_CLS_RSVP6=m 245CONFIG_NET_CLS_RSVP6=m
220# CONFIG_NET_EMATCH is not set 246# CONFIG_NET_EMATCH is not set
221# CONFIG_NET_CLS_ACT is not set 247# CONFIG_NET_CLS_ACT is not set
222CONFIG_NET_CLS_POLICE=y 248CONFIG_NET_CLS_POLICE=y
249# CONFIG_NET_CLS_IND is not set
250CONFIG_NET_ESTIMATOR=y
223 251
224# 252#
225# Network testing 253# Network testing
226# 254#
227# CONFIG_NET_PKTGEN is not set 255# CONFIG_NET_PKTGEN is not set
228# CONFIG_NETFILTER_NETLINK is not set
229# CONFIG_HAMRADIO is not set 256# CONFIG_HAMRADIO is not set
230# CONFIG_IRDA is not set 257# CONFIG_IRDA is not set
231# CONFIG_BT is not set 258# CONFIG_BT is not set
@@ -276,6 +303,7 @@ CONFIG_SCSI_FC_ATTRS=y
276# 303#
277# SCSI low-level drivers 304# SCSI low-level drivers
278# 305#
306# CONFIG_ISCSI_TCP is not set
279# CONFIG_SCSI_SATA is not set 307# CONFIG_SCSI_SATA is not set
280# CONFIG_SCSI_DEBUG is not set 308# CONFIG_SCSI_DEBUG is not set
281CONFIG_ZFCP=y 309CONFIG_ZFCP=y
@@ -292,7 +320,6 @@ CONFIG_BLK_DEV_RAM=y
292CONFIG_BLK_DEV_RAM_COUNT=16 320CONFIG_BLK_DEV_RAM_COUNT=16
293CONFIG_BLK_DEV_RAM_SIZE=4096 321CONFIG_BLK_DEV_RAM_SIZE=4096
294CONFIG_BLK_DEV_INITRD=y 322CONFIG_BLK_DEV_INITRD=y
295# CONFIG_LBD is not set
296# CONFIG_CDROM_PKTCDVD is not set 323# CONFIG_CDROM_PKTCDVD is not set
297 324
298# 325#
@@ -305,15 +332,8 @@ CONFIG_DASD_PROFILE=y
305CONFIG_DASD_ECKD=y 332CONFIG_DASD_ECKD=y
306CONFIG_DASD_FBA=y 333CONFIG_DASD_FBA=y
307CONFIG_DASD_DIAG=y 334CONFIG_DASD_DIAG=y
335CONFIG_DASD_EER=m
308# CONFIG_DASD_CMB is not set 336# CONFIG_DASD_CMB is not set
309
310#
311# IO Schedulers
312#
313CONFIG_IOSCHED_NOOP=y
314CONFIG_IOSCHED_AS=y
315CONFIG_IOSCHED_DEADLINE=y
316CONFIG_IOSCHED_CFQ=y
317# CONFIG_ATA_OVER_ETH is not set 337# CONFIG_ATA_OVER_ETH is not set
318 338
319# 339#
@@ -378,7 +398,6 @@ CONFIG_S390_TAPE_34XX=m
378# CONFIG_VMLOGRDR is not set 398# CONFIG_VMLOGRDR is not set
379# CONFIG_VMCP is not set 399# CONFIG_VMCP is not set
380# CONFIG_MONREADER is not set 400# CONFIG_MONREADER is not set
381# CONFIG_DCSS_SHM is not set
382 401
383# 402#
384# Cryptographic devices 403# Cryptographic devices
@@ -593,6 +612,8 @@ CONFIG_DEBUG_PREEMPT=y
593# CONFIG_DEBUG_KOBJECT is not set 612# CONFIG_DEBUG_KOBJECT is not set
594# CONFIG_DEBUG_INFO is not set 613# CONFIG_DEBUG_INFO is not set
595CONFIG_DEBUG_FS=y 614CONFIG_DEBUG_FS=y
615# CONFIG_DEBUG_VM is not set
616# CONFIG_RCU_TORTURE_TEST is not set
596 617
597# 618#
598# Security options 619# Security options
@@ -609,17 +630,19 @@ CONFIG_CRYPTO=y
609# CONFIG_CRYPTO_MD4 is not set 630# CONFIG_CRYPTO_MD4 is not set
610# CONFIG_CRYPTO_MD5 is not set 631# CONFIG_CRYPTO_MD5 is not set
611# CONFIG_CRYPTO_SHA1 is not set 632# CONFIG_CRYPTO_SHA1 is not set
612# CONFIG_CRYPTO_SHA1_Z990 is not set 633# CONFIG_CRYPTO_SHA1_S390 is not set
613# CONFIG_CRYPTO_SHA256 is not set 634# CONFIG_CRYPTO_SHA256 is not set
635# CONFIG_CRYPTO_SHA256_S390 is not set
614# CONFIG_CRYPTO_SHA512 is not set 636# CONFIG_CRYPTO_SHA512 is not set
615# CONFIG_CRYPTO_WP512 is not set 637# CONFIG_CRYPTO_WP512 is not set
616# CONFIG_CRYPTO_TGR192 is not set 638# CONFIG_CRYPTO_TGR192 is not set
617# CONFIG_CRYPTO_DES is not set 639# CONFIG_CRYPTO_DES is not set
618# CONFIG_CRYPTO_DES_Z990 is not set 640# CONFIG_CRYPTO_DES_S390 is not set
619# CONFIG_CRYPTO_BLOWFISH is not set 641# CONFIG_CRYPTO_BLOWFISH is not set
620# CONFIG_CRYPTO_TWOFISH is not set 642# CONFIG_CRYPTO_TWOFISH is not set
621# CONFIG_CRYPTO_SERPENT is not set 643# CONFIG_CRYPTO_SERPENT is not set
622# CONFIG_CRYPTO_AES is not set 644# CONFIG_CRYPTO_AES is not set
645# CONFIG_CRYPTO_AES_S390 is not set
623# CONFIG_CRYPTO_CAST5 is not set 646# CONFIG_CRYPTO_CAST5 is not set
624# CONFIG_CRYPTO_CAST6 is not set 647# CONFIG_CRYPTO_CAST6 is not set
625# CONFIG_CRYPTO_TEA is not set 648# CONFIG_CRYPTO_TEA is not set
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 7434c32bc631..4865e4b49464 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -8,31 +8,26 @@ obj-y := bitmap.o traps.o time.o process.o \
8 setup.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \ 8 setup.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \
9 semaphore.o s390_ext.o debug.o profile.o irq.o reipl_diag.o 9 semaphore.o s390_ext.o debug.o profile.o irq.o reipl_diag.o
10 10
11obj-y += $(if $(CONFIG_64BIT),entry64.o,entry.o)
12obj-y += $(if $(CONFIG_64BIT),reipl64.o,reipl.o)
13
11extra-y += head.o init_task.o vmlinux.lds 14extra-y += head.o init_task.o vmlinux.lds
12 15
13obj-$(CONFIG_MODULES) += s390_ksyms.o module.o 16obj-$(CONFIG_MODULES) += s390_ksyms.o module.o
14obj-$(CONFIG_SMP) += smp.o 17obj-$(CONFIG_SMP) += smp.o
15 18
16obj-$(CONFIG_S390_SUPPORT) += compat_linux.o compat_signal.o \ 19obj-$(CONFIG_COMPAT) += compat_linux.o compat_signal.o \
17 compat_ioctl.o compat_wrapper.o \ 20 compat_ioctl.o compat_wrapper.o \
18 compat_exec_domain.o 21 compat_exec_domain.o
19obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o 22obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o
20 23
21obj-$(CONFIG_ARCH_S390_31) += entry.o reipl.o
22obj-$(CONFIG_ARCH_S390X) += entry64.o reipl64.o
23
24obj-$(CONFIG_VIRT_TIMER) += vtime.o 24obj-$(CONFIG_VIRT_TIMER) += vtime.o
25 25
26# Kexec part 26# Kexec part
27S390_KEXEC_OBJS := machine_kexec.o crash.o 27S390_KEXEC_OBJS := machine_kexec.o crash.o
28ifeq ($(CONFIG_ARCH_S390X),y) 28S390_KEXEC_OBJS += $(if $(CONFIG_64BIT),relocate_kernel64.o,relocate_kernel.o)
29S390_KEXEC_OBJS += relocate_kernel64.o
30else
31S390_KEXEC_OBJS += relocate_kernel.o
32endif
33obj-$(CONFIG_KEXEC) += $(S390_KEXEC_OBJS) 29obj-$(CONFIG_KEXEC) += $(S390_KEXEC_OBJS)
34 30
35
36# 31#
37# This is just to get the dependencies... 32# This is just to get the dependencies...
38# 33#
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index ed877d0f27e6..41b197a3f3a3 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -279,7 +279,7 @@ asmlinkage long sys32_getegid16(void)
279 279
280static inline long get_tv32(struct timeval *o, struct compat_timeval *i) 280static inline long get_tv32(struct timeval *o, struct compat_timeval *i)
281{ 281{
282 return (!access_ok(VERIFY_READ, tv32, sizeof(*tv32)) || 282 return (!access_ok(VERIFY_READ, o, sizeof(*o)) ||
283 (__get_user(o->tv_sec, &i->tv_sec) || 283 (__get_user(o->tv_sec, &i->tv_sec) ||
284 __get_user(o->tv_usec, &i->tv_usec))); 284 __get_user(o->tv_usec, &i->tv_usec)));
285} 285}
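
The get_tv32() fix above deserves a note: the old line passed tv32 to access_ok(), an identifier that is not a parameter of get_tv32() at all, so the check could never have covered the pointer being copied from (evidently the macro expansion never evaluated it, or the file would not have compiled); the patch swaps in an in-scope pointer. The helper's shape is the usual short-circuit validation chain: fail on the range check, then on each field copy. A standalone analogue, with check_range()/copy_field() as illustrative stand-ins for access_ok()/__get_user() (the stand-in checks the source pointer, which is what a VERIFY_READ check is for):

/* Standalone sketch (not kernel code): the short-circuit validation
 * chain used by get_tv32().  check_range() and copy_field() are
 * illustrative stand-ins for access_ok() and __get_user(); nonzero
 * from the helper means failure, as in the kernel. */
#include <stdio.h>
#include <stddef.h>

struct tv { long sec, usec; };

static int check_range(const void *p, size_t n) { return p != NULL && n > 0; }
static int copy_field(long *dst, const long *src) { *dst = *src; return 0; }

static long get_tv(struct tv *o, const struct tv *i)
{
	return !check_range(i, sizeof(*i)) ||
	       copy_field(&o->sec, &i->sec) ||
	       copy_field(&o->usec, &i->usec);
}

int main(void)
{
	struct tv in = { 1, 500000 }, out;

	printf("err=%ld sec=%ld usec=%ld\n",
	       get_tv(&out, &in), out.sec, out.usec);
	return 0;
}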
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 4ff6808456ea..fa2b3bc22f20 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -467,8 +467,6 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
467 if (err) 467 if (err)
468 goto badframe; 468 goto badframe;
469 469
470 /* It is more difficult to avoid calling this function than to
471 call it and ignore errors. */
472 set_fs (KERNEL_DS); 470 set_fs (KERNEL_DS);
473 do_sigaltstack((stack_t __user *)&st, NULL, regs->gprs[15]); 471 do_sigaltstack((stack_t __user *)&st, NULL, regs->gprs[15]);
474 set_fs (old_fs); 472 set_fs (old_fs);
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index d47fecb42cc5..4ef44e536b2c 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -39,7 +39,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
39 39
40 if (response != NULL && rlen > 0) { 40 if (response != NULL && rlen > 0) {
41 memset(response, 0, rlen); 41 memset(response, 0, rlen);
42#ifndef CONFIG_ARCH_S390X 42#ifndef CONFIG_64BIT
43 asm volatile ( "lra 2,0(%2)\n" 43 asm volatile ( "lra 2,0(%2)\n"
44 "lr 4,%3\n" 44 "lr 4,%3\n"
45 "o 4,%6\n" 45 "o 4,%6\n"
@@ -55,7 +55,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
55 : "a" (cpcmd_buf), "d" (cmdlen), 55 : "a" (cpcmd_buf), "d" (cmdlen),
56 "a" (response), "d" (rlen), "m" (mask) 56 "a" (response), "d" (rlen), "m" (mask)
57 : "cc", "2", "3", "4", "5" ); 57 : "cc", "2", "3", "4", "5" );
58#else /* CONFIG_ARCH_S390X */ 58#else /* CONFIG_64BIT */
59 asm volatile ( "lrag 2,0(%2)\n" 59 asm volatile ( "lrag 2,0(%2)\n"
60 "lgr 4,%3\n" 60 "lgr 4,%3\n"
61 "o 4,%6\n" 61 "o 4,%6\n"
@@ -73,11 +73,11 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
73 : "a" (cpcmd_buf), "d" (cmdlen), 73 : "a" (cpcmd_buf), "d" (cmdlen),
74 "a" (response), "d" (rlen), "m" (mask) 74 "a" (response), "d" (rlen), "m" (mask)
75 : "cc", "2", "3", "4", "5" ); 75 : "cc", "2", "3", "4", "5" );
76#endif /* CONFIG_ARCH_S390X */ 76#endif /* CONFIG_64BIT */
77 EBCASC(response, rlen); 77 EBCASC(response, rlen);
78 } else { 78 } else {
79 return_len = 0; 79 return_len = 0;
80#ifndef CONFIG_ARCH_S390X 80#ifndef CONFIG_64BIT
81 asm volatile ( "lra 2,0(%1)\n" 81 asm volatile ( "lra 2,0(%1)\n"
82 "lr 3,%2\n" 82 "lr 3,%2\n"
83 "diag 2,3,0x8\n" 83 "diag 2,3,0x8\n"
@@ -85,7 +85,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
85 : "=d" (return_code) 85 : "=d" (return_code)
86 : "a" (cpcmd_buf), "d" (cmdlen) 86 : "a" (cpcmd_buf), "d" (cmdlen)
87 : "2", "3" ); 87 : "2", "3" );
88#else /* CONFIG_ARCH_S390X */ 88#else /* CONFIG_64BIT */
89 asm volatile ( "lrag 2,0(%1)\n" 89 asm volatile ( "lrag 2,0(%1)\n"
90 "lgr 3,%2\n" 90 "lgr 3,%2\n"
91 "sam31\n" 91 "sam31\n"
@@ -95,7 +95,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
95 : "=d" (return_code) 95 : "=d" (return_code)
96 : "a" (cpcmd_buf), "d" (cmdlen) 96 : "a" (cpcmd_buf), "d" (cmdlen)
97 : "2", "3" ); 97 : "2", "3" );
98#endif /* CONFIG_ARCH_S390X */ 98#endif /* CONFIG_64BIT */
99 } 99 }
100 spin_unlock_irqrestore(&cpcmd_lock, flags); 100 spin_unlock_irqrestore(&cpcmd_lock, flags);
101 if (response_code != NULL) 101 if (response_code != NULL)
@@ -105,7 +105,7 @@ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
105 105
106EXPORT_SYMBOL(__cpcmd); 106EXPORT_SYMBOL(__cpcmd);
107 107
108#ifdef CONFIG_ARCH_S390X 108#ifdef CONFIG_64BIT
109int cpcmd(const char *cmd, char *response, int rlen, int *response_code) 109int cpcmd(const char *cmd, char *response, int rlen, int *response_code)
110{ 110{
111 char *lowbuf; 111 char *lowbuf;
@@ -129,4 +129,4 @@ int cpcmd(const char *cmd, char *response, int rlen, int *response_code)
129} 129}
130 130
131EXPORT_SYMBOL(cpcmd); 131EXPORT_SYMBOL(cpcmd);
132#endif /* CONFIG_ARCH_S390X */ 132#endif /* CONFIG_64BIT */
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 4eb71ffcf484..369ab4413ec7 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -213,7 +213,7 @@ sysc_nr_ok:
213 mvc SP_ARGS(8,%r15),SP_R7(%r15) 213 mvc SP_ARGS(8,%r15),SP_R7(%r15)
214sysc_do_restart: 214sysc_do_restart:
215 larl %r10,sys_call_table 215 larl %r10,sys_call_table
216#ifdef CONFIG_S390_SUPPORT 216#ifdef CONFIG_COMPAT
217 tm __TI_flags+5(%r9),(_TIF_31BIT>>16) # running in 31 bit mode ? 217 tm __TI_flags+5(%r9),(_TIF_31BIT>>16) # running in 31 bit mode ?
218 jno sysc_noemu 218 jno sysc_noemu
219 larl %r10,sys_call_table_emu # use 31 bit emulation system calls 219 larl %r10,sys_call_table_emu # use 31 bit emulation system calls
@@ -361,7 +361,7 @@ sys_clone_glue:
361 la %r2,SP_PTREGS(%r15) # load pt_regs 361 la %r2,SP_PTREGS(%r15) # load pt_regs
362 jg sys_clone # branch to sys_clone 362 jg sys_clone # branch to sys_clone
363 363
364#ifdef CONFIG_S390_SUPPORT 364#ifdef CONFIG_COMPAT
365sys32_clone_glue: 365sys32_clone_glue:
366 la %r2,SP_PTREGS(%r15) # load pt_regs 366 la %r2,SP_PTREGS(%r15) # load pt_regs
367 jg sys32_clone # branch to sys32_clone 367 jg sys32_clone # branch to sys32_clone
@@ -383,7 +383,7 @@ sys_execve_glue:
383 bnz 0(%r12) # it did fail -> store result in gpr2 383 bnz 0(%r12) # it did fail -> store result in gpr2
384 b 6(%r12) # SKIP STG 2,SP_R2(15) in 384 b 6(%r12) # SKIP STG 2,SP_R2(15) in
385 # system_call/sysc_tracesys 385 # system_call/sysc_tracesys
386#ifdef CONFIG_S390_SUPPORT 386#ifdef CONFIG_COMPAT
387sys32_execve_glue: 387sys32_execve_glue:
388 la %r2,SP_PTREGS(%r15) # load pt_regs 388 la %r2,SP_PTREGS(%r15) # load pt_regs
389 lgr %r12,%r14 # save return address 389 lgr %r12,%r14 # save return address
@@ -398,7 +398,7 @@ sys_sigreturn_glue:
398 la %r2,SP_PTREGS(%r15) # load pt_regs as parameter 398 la %r2,SP_PTREGS(%r15) # load pt_regs as parameter
399 jg sys_sigreturn # branch to sys_sigreturn 399 jg sys_sigreturn # branch to sys_sigreturn
400 400
401#ifdef CONFIG_S390_SUPPORT 401#ifdef CONFIG_COMPAT
402sys32_sigreturn_glue: 402sys32_sigreturn_glue:
403 la %r2,SP_PTREGS(%r15) # load pt_regs as parameter 403 la %r2,SP_PTREGS(%r15) # load pt_regs as parameter
404 jg sys32_sigreturn # branch to sys32_sigreturn 404 jg sys32_sigreturn # branch to sys32_sigreturn
@@ -408,7 +408,7 @@ sys_rt_sigreturn_glue:
408 la %r2,SP_PTREGS(%r15) # load pt_regs as parameter 408 la %r2,SP_PTREGS(%r15) # load pt_regs as parameter
409 jg sys_rt_sigreturn # branch to sys_sigreturn 409 jg sys_rt_sigreturn # branch to sys_sigreturn
410 410
411#ifdef CONFIG_S390_SUPPORT 411#ifdef CONFIG_COMPAT
412sys32_rt_sigreturn_glue: 412sys32_rt_sigreturn_glue:
413 la %r2,SP_PTREGS(%r15) # load pt_regs as parameter 413 la %r2,SP_PTREGS(%r15) # load pt_regs as parameter
414 jg sys32_rt_sigreturn # branch to sys32_sigreturn 414 jg sys32_rt_sigreturn # branch to sys32_sigreturn
@@ -429,7 +429,7 @@ sys_sigsuspend_glue:
429 la %r14,6(%r14) # skip store of return value 429 la %r14,6(%r14) # skip store of return value
430 jg sys_sigsuspend # branch to sys_sigsuspend 430 jg sys_sigsuspend # branch to sys_sigsuspend
431 431
432#ifdef CONFIG_S390_SUPPORT 432#ifdef CONFIG_COMPAT
433sys32_sigsuspend_glue: 433sys32_sigsuspend_glue:
434 llgfr %r4,%r4 # unsigned long 434 llgfr %r4,%r4 # unsigned long
435 lgr %r5,%r4 # move mask back 435 lgr %r5,%r4 # move mask back
@@ -449,7 +449,7 @@ sys_rt_sigsuspend_glue:
449 la %r14,6(%r14) # skip store of return value 449 la %r14,6(%r14) # skip store of return value
450 jg sys_rt_sigsuspend # branch to sys_rt_sigsuspend 450 jg sys_rt_sigsuspend # branch to sys_rt_sigsuspend
451 451
452#ifdef CONFIG_S390_SUPPORT 452#ifdef CONFIG_COMPAT
453sys32_rt_sigsuspend_glue: 453sys32_rt_sigsuspend_glue:
454 llgfr %r3,%r3 # size_t 454 llgfr %r3,%r3 # size_t
455 lgr %r4,%r3 # move sigsetsize parameter 455 lgr %r4,%r3 # move sigsetsize parameter
@@ -464,7 +464,7 @@ sys_sigaltstack_glue:
464 la %r4,SP_PTREGS(%r15) # load pt_regs as parameter 464 la %r4,SP_PTREGS(%r15) # load pt_regs as parameter
465 jg sys_sigaltstack # branch to sys_sigreturn 465 jg sys_sigaltstack # branch to sys_sigreturn
466 466
467#ifdef CONFIG_S390_SUPPORT 467#ifdef CONFIG_COMPAT
468sys32_sigaltstack_glue: 468sys32_sigaltstack_glue:
469 la %r4,SP_PTREGS(%r15) # load pt_regs as parameter 469 la %r4,SP_PTREGS(%r15) # load pt_regs as parameter
470 jg sys32_sigaltstack_wrapper # branch to sys_sigreturn 470 jg sys32_sigaltstack_wrapper # branch to sys_sigreturn
@@ -1009,7 +1009,7 @@ sys_call_table:
1009#include "syscalls.S" 1009#include "syscalls.S"
1010#undef SYSCALL 1010#undef SYSCALL
1011 1011
1012#ifdef CONFIG_S390_SUPPORT 1012#ifdef CONFIG_COMPAT
1013 1013
1014#define SYSCALL(esa,esame,emu) .long emu 1014#define SYSCALL(esa,esame,emu) .long emu
1015 .globl sys_call_table_emu 1015 .globl sys_call_table_emu
diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S
index d31a97c89f68..ea88d066bf04 100644
--- a/arch/s390/kernel/head.S
+++ b/arch/s390/kernel/head.S
@@ -30,7 +30,7 @@
30#include <asm/thread_info.h> 30#include <asm/thread_info.h>
31#include <asm/page.h> 31#include <asm/page.h>
32 32
33#ifdef CONFIG_ARCH_S390X 33#ifdef CONFIG_64BIT
34#define ARCH_OFFSET 4 34#define ARCH_OFFSET 4
35#else 35#else
36#define ARCH_OFFSET 0 36#define ARCH_OFFSET 0
@@ -539,7 +539,7 @@ ipl_devno:
539 .word 0 539 .word 0
540.endm 540.endm
541 541
542#ifdef CONFIG_ARCH_S390X 542#ifdef CONFIG_64BIT
543#include "head64.S" 543#include "head64.S"
544#else 544#else
545#include "head31.S" 545#include "head31.S"
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 5aa71b05b8ae..f0ed5c642c74 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -85,7 +85,7 @@ kexec_halt_all_cpus(void *kernel_image)
85 pfault_fini(); 85 pfault_fini();
86#endif 86#endif
87 87
88 if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid)) 88 if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) != -1)
89 signal_processor(smp_processor_id(), sigp_stop); 89 signal_processor(smp_processor_id(), sigp_stop);
90 90
91 /* Wait for all other cpus to enter stopped state */ 91 /* Wait for all other cpus to enter stopped state */
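
This hunk (and the three like it in smp.c below) changes the success test because the two primitives report differently: the old s390-private atomic_compare_and_swap() returned zero when the swap happened, while the generic atomic_cmpxchg() returns the value previously stored, so the winner is whoever reads back the initial -1. Exactly one CPU proceeds with the shutdown work; every other CPU sees the winner's id and stops itself. A standalone C11 sketch of the first-caller-wins idiom:

/* Standalone sketch (not kernel code): first-caller-wins via compare
 * and swap.  Exactly one caller observes the initial -1 and installs
 * its id; later callers see the winner's id and fail. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int owner = -1;

static int try_become_owner(int my_id)
{
	int expected = -1;

	/* On success owner becomes my_id; on failure expected is
	 * overwritten with the current owner (the kernel's cmpxchg
	 * reports the same thing by returning the old value). */
	return atomic_compare_exchange_strong(&owner, &expected, my_id);
}

int main(void)
{
	printf("cpu 3: %s\n", try_become_owner(3) ? "won" : "lost");
	printf("cpu 5: %s\n", try_become_owner(5) ? "won" : "lost");
	printf("owner = %d\n", atomic_load(&owner));
	return 0;
}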
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 607d506689c8..c271cdab58e2 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -37,11 +37,11 @@
37#define DEBUGP(fmt , ...) 37#define DEBUGP(fmt , ...)
38#endif 38#endif
39 39
40#ifndef CONFIG_ARCH_S390X 40#ifndef CONFIG_64BIT
41#define PLT_ENTRY_SIZE 12 41#define PLT_ENTRY_SIZE 12
42#else /* CONFIG_ARCH_S390X */ 42#else /* CONFIG_64BIT */
43#define PLT_ENTRY_SIZE 20 43#define PLT_ENTRY_SIZE 20
44#endif /* CONFIG_ARCH_S390X */ 44#endif /* CONFIG_64BIT */
45 45
46void *module_alloc(unsigned long size) 46void *module_alloc(unsigned long size)
47{ 47{
@@ -294,17 +294,17 @@ apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
294 unsigned int *ip; 294 unsigned int *ip;
295 ip = me->module_core + me->arch.plt_offset + 295 ip = me->module_core + me->arch.plt_offset +
296 info->plt_offset; 296 info->plt_offset;
297#ifndef CONFIG_ARCH_S390X 297#ifndef CONFIG_64BIT
298 ip[0] = 0x0d105810; /* basr 1,0; l 1,6(1); br 1 */ 298 ip[0] = 0x0d105810; /* basr 1,0; l 1,6(1); br 1 */
299 ip[1] = 0x100607f1; 299 ip[1] = 0x100607f1;
300 ip[2] = val; 300 ip[2] = val;
301#else /* CONFIG_ARCH_S390X */ 301#else /* CONFIG_64BIT */
302 ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */ 302 ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */
303 ip[1] = 0x100a0004; 303 ip[1] = 0x100a0004;
304 ip[2] = 0x07f10000; 304 ip[2] = 0x07f10000;
305 ip[3] = (unsigned int) (val >> 32); 305 ip[3] = (unsigned int) (val >> 32);
306 ip[4] = (unsigned int) val; 306 ip[4] = (unsigned int) val;
307#endif /* CONFIG_ARCH_S390X */ 307#endif /* CONFIG_64BIT */
308 info->plt_initialized = 1; 308 info->plt_initialized = 1;
309 } 309 }
310 if (r_type == R_390_PLTOFF16 || 310 if (r_type == R_390_PLTOFF16 ||
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 78b64fe5e7c2..a942bf2d58e9 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -235,7 +235,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
235 /* Save access registers to new thread structure. */ 235 /* Save access registers to new thread structure. */
236 save_access_regs(&p->thread.acrs[0]); 236 save_access_regs(&p->thread.acrs[0]);
237 237
238#ifndef CONFIG_ARCH_S390X 238#ifndef CONFIG_64BIT
239 /* 239 /*
240 * save fprs to current->thread.fp_regs to merge them with 240 * save fprs to current->thread.fp_regs to merge them with
241 * the emulated registers and then copy the result to the child. 241 * the emulated registers and then copy the result to the child.
@@ -247,7 +247,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
247 /* Set a new TLS ? */ 247 /* Set a new TLS ? */
248 if (clone_flags & CLONE_SETTLS) 248 if (clone_flags & CLONE_SETTLS)
249 p->thread.acrs[0] = regs->gprs[6]; 249 p->thread.acrs[0] = regs->gprs[6];
250#else /* CONFIG_ARCH_S390X */ 250#else /* CONFIG_64BIT */
251 /* Save the fpu registers to new thread structure. */ 251 /* Save the fpu registers to new thread structure. */
252 save_fp_regs(&p->thread.fp_regs); 252 save_fp_regs(&p->thread.fp_regs);
253 p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _REGION_TABLE; 253 p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _REGION_TABLE;
@@ -260,7 +260,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
260 p->thread.acrs[1] = (unsigned int) regs->gprs[6]; 260 p->thread.acrs[1] = (unsigned int) regs->gprs[6];
261 } 261 }
262 } 262 }
263#endif /* CONFIG_ARCH_S390X */ 263#endif /* CONFIG_64BIT */
264 /* start new process with ar4 pointing to the correct address space */ 264 /* start new process with ar4 pointing to the correct address space */
265 p->thread.mm_segment = get_fs(); 265 p->thread.mm_segment = get_fs();
266 /* Don't copy debug registers */ 266 /* Don't copy debug registers */
@@ -339,16 +339,16 @@ out:
339 */ 339 */
340int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs) 340int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
341{ 341{
342#ifndef CONFIG_ARCH_S390X 342#ifndef CONFIG_64BIT
343 /* 343 /*
344 * save fprs to current->thread.fp_regs to merge them with 344 * save fprs to current->thread.fp_regs to merge them with
345 * the emulated registers and then copy the result to the dump. 345 * the emulated registers and then copy the result to the dump.
346 */ 346 */
347 save_fp_regs(&current->thread.fp_regs); 347 save_fp_regs(&current->thread.fp_regs);
348 memcpy(fpregs, &current->thread.fp_regs, sizeof(s390_fp_regs)); 348 memcpy(fpregs, &current->thread.fp_regs, sizeof(s390_fp_regs));
349#else /* CONFIG_ARCH_S390X */ 349#else /* CONFIG_64BIT */
350 save_fp_regs(fpregs); 350 save_fp_regs(fpregs);
351#endif /* CONFIG_ARCH_S390X */ 351#endif /* CONFIG_64BIT */
352 return 1; 352 return 1;
353} 353}
354 354
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 06afa3103ace..8ecda6d66de4 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -42,7 +42,7 @@
42#include <asm/uaccess.h> 42#include <asm/uaccess.h>
43#include <asm/unistd.h> 43#include <asm/unistd.h>
44 44
45#ifdef CONFIG_S390_SUPPORT 45#ifdef CONFIG_COMPAT
46#include "compat_ptrace.h" 46#include "compat_ptrace.h"
47#endif 47#endif
48 48
@@ -59,7 +59,7 @@ FixPerRegisters(struct task_struct *task)
59 59
60 if (per_info->single_step) { 60 if (per_info->single_step) {
61 per_info->control_regs.bits.starting_addr = 0; 61 per_info->control_regs.bits.starting_addr = 0;
62#ifdef CONFIG_S390_SUPPORT 62#ifdef CONFIG_COMPAT
63 if (test_thread_flag(TIF_31BIT)) 63 if (test_thread_flag(TIF_31BIT))
64 per_info->control_regs.bits.ending_addr = 0x7fffffffUL; 64 per_info->control_regs.bits.ending_addr = 0x7fffffffUL;
65 else 65 else
@@ -112,7 +112,7 @@ ptrace_disable(struct task_struct *child)
112 clear_single_step(child); 112 clear_single_step(child);
113} 113}
114 114
115#ifndef CONFIG_ARCH_S390X 115#ifndef CONFIG_64BIT
116# define __ADDR_MASK 3 116# define __ADDR_MASK 3
117#else 117#else
118# define __ADDR_MASK 7 118# define __ADDR_MASK 7
@@ -138,7 +138,7 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
138 * an alignment of 4. Programmers from hell... 138 * an alignment of 4. Programmers from hell...
139 */ 139 */
140 mask = __ADDR_MASK; 140 mask = __ADDR_MASK;
141#ifdef CONFIG_ARCH_S390X 141#ifdef CONFIG_64BIT
142 if (addr >= (addr_t) &dummy->regs.acrs && 142 if (addr >= (addr_t) &dummy->regs.acrs &&
143 addr < (addr_t) &dummy->regs.orig_gpr2) 143 addr < (addr_t) &dummy->regs.orig_gpr2)
144 mask = 3; 144 mask = 3;
@@ -160,7 +160,7 @@ peek_user(struct task_struct *child, addr_t addr, addr_t data)
160 * access registers are stored in the thread structure 160 * access registers are stored in the thread structure
161 */ 161 */
162 offset = addr - (addr_t) &dummy->regs.acrs; 162 offset = addr - (addr_t) &dummy->regs.acrs;
163#ifdef CONFIG_ARCH_S390X 163#ifdef CONFIG_64BIT
164 /* 164 /*
165 * Very special case: old & broken 64 bit gdb reading 165 * Very special case: old & broken 64 bit gdb reading
166 * from acrs[15]. Result is a 64 bit value. Read the 166 * from acrs[15]. Result is a 64 bit value. Read the
@@ -218,7 +218,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
218 * an alignment of 4. Programmers from hell indeed... 218 * an alignment of 4. Programmers from hell indeed...
219 */ 219 */
220 mask = __ADDR_MASK; 220 mask = __ADDR_MASK;
221#ifdef CONFIG_ARCH_S390X 221#ifdef CONFIG_64BIT
222 if (addr >= (addr_t) &dummy->regs.acrs && 222 if (addr >= (addr_t) &dummy->regs.acrs &&
223 addr < (addr_t) &dummy->regs.orig_gpr2) 223 addr < (addr_t) &dummy->regs.orig_gpr2)
224 mask = 3; 224 mask = 3;
@@ -231,13 +231,13 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
231 * psw and gprs are stored on the stack 231 * psw and gprs are stored on the stack
232 */ 232 */
233 if (addr == (addr_t) &dummy->regs.psw.mask && 233 if (addr == (addr_t) &dummy->regs.psw.mask &&
234#ifdef CONFIG_S390_SUPPORT 234#ifdef CONFIG_COMPAT
235 data != PSW_MASK_MERGE(PSW_USER32_BITS, data) && 235 data != PSW_MASK_MERGE(PSW_USER32_BITS, data) &&
236#endif 236#endif
237 data != PSW_MASK_MERGE(PSW_USER_BITS, data)) 237 data != PSW_MASK_MERGE(PSW_USER_BITS, data))
238 /* Invalid psw mask. */ 238 /* Invalid psw mask. */
239 return -EINVAL; 239 return -EINVAL;
240#ifndef CONFIG_ARCH_S390X 240#ifndef CONFIG_64BIT
241 if (addr == (addr_t) &dummy->regs.psw.addr) 241 if (addr == (addr_t) &dummy->regs.psw.addr)
242 /* I'd like to reject addresses without the 242 /* I'd like to reject addresses without the
243 high order bit but older gdb's rely on it */ 243 high order bit but older gdb's rely on it */
@@ -250,7 +250,7 @@ poke_user(struct task_struct *child, addr_t addr, addr_t data)
250 * access registers are stored in the thread structure 250 * access registers are stored in the thread structure
251 */ 251 */
252 offset = addr - (addr_t) &dummy->regs.acrs; 252 offset = addr - (addr_t) &dummy->regs.acrs;
253#ifdef CONFIG_ARCH_S390X 253#ifdef CONFIG_64BIT
254 /* 254 /*
255 * Very special case: old & broken 64 bit gdb writing 255 * Very special case: old & broken 64 bit gdb writing
256 * to acrs[15] with a 64 bit value. Ignore the lower 256 * to acrs[15] with a 64 bit value. Ignore the lower
@@ -357,7 +357,7 @@ do_ptrace_normal(struct task_struct *child, long request, long addr, long data)
357 return ptrace_request(child, request, addr, data); 357 return ptrace_request(child, request, addr, data);
358} 358}
359 359
360#ifdef CONFIG_S390_SUPPORT 360#ifdef CONFIG_COMPAT
361/* 361/*
362 * Now the fun part starts... a 31 bit program running in the 362 * Now the fun part starts... a 31 bit program running in the
363 * 31 bit emulation tracing another program. PTRACE_PEEKTEXT, 363 * 31 bit emulation tracing another program. PTRACE_PEEKTEXT,
@@ -629,7 +629,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data)
629 return peek_user(child, addr, data); 629 return peek_user(child, addr, data);
630 if (request == PTRACE_POKEUSR && addr == PT_IEEE_IP) 630 if (request == PTRACE_POKEUSR && addr == PT_IEEE_IP)
631 return poke_user(child, addr, data); 631 return poke_user(child, addr, data);
632#ifdef CONFIG_S390_SUPPORT 632#ifdef CONFIG_COMPAT
633 if (request == PTRACE_PEEKUSR && 633 if (request == PTRACE_PEEKUSR &&
634 addr == PT32_IEEE_IP && test_thread_flag(TIF_31BIT)) 634 addr == PT32_IEEE_IP && test_thread_flag(TIF_31BIT))
635 return peek_user_emu31(child, addr, data); 635 return peek_user_emu31(child, addr, data);
@@ -695,7 +695,7 @@ do_ptrace(struct task_struct *child, long request, long addr, long data)
695 695
696 /* Do requests that differ for 31/64 bit */ 696 /* Do requests that differ for 31/64 bit */
697 default: 697 default:
698#ifdef CONFIG_S390_SUPPORT 698#ifdef CONFIG_COMPAT
699 if (test_thread_flag(TIF_31BIT)) 699 if (test_thread_flag(TIF_31BIT))
700 return do_ptrace_emu31(child, request, addr, data); 700 return do_ptrace_emu31(child, request, addr, data);
701#endif 701#endif
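
The mask logic that recurs through peek_user()/poke_user() above is an alignment gate: __ADDR_MASK is the word size minus one (3 on 31-bit kernels, 7 on 64-bit), and addr & mask must be zero; the acrs window relaxes a 64-bit kernel's mask back to 3 because access registers stay 32 bits wide. A standalone sketch of the test:

/* Standalone sketch (not kernel code): the alignment gate used by the
 * ptrace peek/poke helpers.  A mask of sizeof(word) - 1 rejects any
 * address that is not word aligned. */
#include <stdio.h>

static int aligned_ok(unsigned long addr, unsigned long mask)
{
	return (addr & mask) == 0;
}

int main(void)
{
	printf("0x10 with mask 7: %d\n", aligned_ok(0x10, 7)); /* 1: 8-byte aligned */
	printf("0x0c with mask 7: %d\n", aligned_ok(0x0c, 7)); /* 0: rejected */
	printf("0x0c with mask 3: %d\n", aligned_ok(0x0c, 3)); /* 1: acrs window */
	return 0;
}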
diff --git a/arch/s390/kernel/reipl_diag.c b/arch/s390/kernel/reipl_diag.c
index 83cb42bc0b76..1f33951ba439 100644
--- a/arch/s390/kernel/reipl_diag.c
+++ b/arch/s390/kernel/reipl_diag.c
@@ -26,7 +26,7 @@ void reipl_diag(void)
26 " st %%r4,%0\n" 26 " st %%r4,%0\n"
27 " st %%r5,%1\n" 27 " st %%r5,%1\n"
28 ".section __ex_table,\"a\"\n" 28 ".section __ex_table,\"a\"\n"
29#ifdef __s390x__ 29#ifdef CONFIG_64BIT
30 " .align 8\n" 30 " .align 8\n"
31 " .quad 0b, 0b\n" 31 " .quad 0b, 0b\n"
32#else 32#else
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 31e7b19348b7..b03847d100d9 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -427,7 +427,7 @@ setup_lowcore(void)
427 __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0) + PAGE_SIZE; 427 __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0) + PAGE_SIZE;
428 lc->current_task = (unsigned long) init_thread_union.thread_info.task; 428 lc->current_task = (unsigned long) init_thread_union.thread_info.task;
429 lc->thread_info = (unsigned long) &init_thread_union; 429 lc->thread_info = (unsigned long) &init_thread_union;
430#ifndef CONFIG_ARCH_S390X 430#ifndef CONFIG_64BIT
431 if (MACHINE_HAS_IEEE) { 431 if (MACHINE_HAS_IEEE) {
432 lc->extended_save_area_addr = (__u32) 432 lc->extended_save_area_addr = (__u32)
433 __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0); 433 __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0);
@@ -562,21 +562,21 @@ setup_arch(char **cmdline_p)
562 /* 562 /*
563 * print what head.S has found out about the machine 563 * print what head.S has found out about the machine
564 */ 564 */
565#ifndef CONFIG_ARCH_S390X 565#ifndef CONFIG_64BIT
566 printk((MACHINE_IS_VM) ? 566 printk((MACHINE_IS_VM) ?
567 "We are running under VM (31 bit mode)\n" : 567 "We are running under VM (31 bit mode)\n" :
568 "We are running native (31 bit mode)\n"); 568 "We are running native (31 bit mode)\n");
569 printk((MACHINE_HAS_IEEE) ? 569 printk((MACHINE_HAS_IEEE) ?
570 "This machine has an IEEE fpu\n" : 570 "This machine has an IEEE fpu\n" :
571 "This machine has no IEEE fpu\n"); 571 "This machine has no IEEE fpu\n");
572#else /* CONFIG_ARCH_S390X */ 572#else /* CONFIG_64BIT */
573 printk((MACHINE_IS_VM) ? 573 printk((MACHINE_IS_VM) ?
574 "We are running under VM (64 bit mode)\n" : 574 "We are running under VM (64 bit mode)\n" :
575 "We are running native (64 bit mode)\n"); 575 "We are running native (64 bit mode)\n");
576#endif /* CONFIG_ARCH_S390X */ 576#endif /* CONFIG_64BIT */
577 577
578 ROOT_DEV = Root_RAM0; 578 ROOT_DEV = Root_RAM0;
579#ifndef CONFIG_ARCH_S390X 579#ifndef CONFIG_64BIT
580 memory_end = memory_size & ~0x400000UL; /* align memory end to 4MB */ 580 memory_end = memory_size & ~0x400000UL; /* align memory end to 4MB */
581 /* 581 /*
582 * We need some free virtual space to be able to do vmalloc. 582 * We need some free virtual space to be able to do vmalloc.
@@ -585,9 +585,9 @@ setup_arch(char **cmdline_p)
585 */ 585 */
586 if (memory_end > 1920*1024*1024) 586 if (memory_end > 1920*1024*1024)
587 memory_end = 1920*1024*1024; 587 memory_end = 1920*1024*1024;
588#else /* CONFIG_ARCH_S390X */ 588#else /* CONFIG_64BIT */
589 memory_end = memory_size & ~0x200000UL; /* detected in head.s */ 589 memory_end = memory_size & ~0x200000UL; /* detected in head.s */
590#endif /* CONFIG_ARCH_S390X */ 590#endif /* CONFIG_64BIT */
591 591
592 init_mm.start_code = PAGE_OFFSET; 592 init_mm.start_code = PAGE_OFFSET;
593 init_mm.end_code = (unsigned long) &_etext; 593 init_mm.end_code = (unsigned long) &_etext;
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 6e0110d71191..6ae4a77270b5 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -254,9 +254,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
254 if (restore_sigregs(regs, &frame->uc.uc_mcontext)) 254 if (restore_sigregs(regs, &frame->uc.uc_mcontext))
255 goto badframe; 255 goto badframe;
256 256
257 /* It is more difficult to avoid calling this function than to 257 if (do_sigaltstack(&frame->uc.uc_stack, NULL,
258 call it and ignore errors. */ 258 regs->gprs[15]) == -EFAULT)
259 do_sigaltstack(&frame->uc.uc_stack, NULL, regs->gprs[15]); 259 goto badframe;
260 return regs->gprs[2]; 260 return regs->gprs[2];
261 261
262badframe: 262badframe:
@@ -501,7 +501,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
501 501
502 if (signr > 0) { 502 if (signr > 0) {
503 /* Whee! Actually deliver the signal. */ 503 /* Whee! Actually deliver the signal. */
504#ifdef CONFIG_S390_SUPPORT 504#ifdef CONFIG_COMPAT
505 if (test_thread_flag(TIF_31BIT)) { 505 if (test_thread_flag(TIF_31BIT)) {
506 extern void handle_signal32(unsigned long sig, 506 extern void handle_signal32(unsigned long sig,
507 struct k_sigaction *ka, 507 struct k_sigaction *ka,
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 5856b3fda6bf..e10f4ca00499 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -263,7 +263,7 @@ static void do_machine_restart(void * __unused)
263 int cpu; 263 int cpu;
264 static atomic_t cpuid = ATOMIC_INIT(-1); 264 static atomic_t cpuid = ATOMIC_INIT(-1);
265 265
266 if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid)) 266 if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) != -1)
267 signal_processor(smp_processor_id(), sigp_stop); 267 signal_processor(smp_processor_id(), sigp_stop);
268 268
269 /* Wait for all other cpus to enter stopped state */ 269 /* Wait for all other cpus to enter stopped state */
@@ -313,7 +313,7 @@ static void do_machine_halt(void * __unused)
313{ 313{
314 static atomic_t cpuid = ATOMIC_INIT(-1); 314 static atomic_t cpuid = ATOMIC_INIT(-1);
315 315
316 if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid) == 0) { 316 if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) == -1) {
317 smp_send_stop(); 317 smp_send_stop();
318 if (MACHINE_IS_VM && strlen(vmhalt_cmd) > 0) 318 if (MACHINE_IS_VM && strlen(vmhalt_cmd) > 0)
319 cpcmd(vmhalt_cmd, NULL, 0, NULL); 319 cpcmd(vmhalt_cmd, NULL, 0, NULL);
@@ -332,7 +332,7 @@ static void do_machine_power_off(void * __unused)
332{ 332{
333 static atomic_t cpuid = ATOMIC_INIT(-1); 333 static atomic_t cpuid = ATOMIC_INIT(-1);
334 334
335 if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid) == 0) { 335 if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) == -1) {
336 smp_send_stop(); 336 smp_send_stop();
337 if (MACHINE_IS_VM && strlen(vmpoff_cmd) > 0) 337 if (MACHINE_IS_VM && strlen(vmpoff_cmd) > 0)
338 cpcmd(vmpoff_cmd, NULL, 0, NULL); 338 cpcmd(vmpoff_cmd, NULL, 0, NULL);
@@ -402,7 +402,7 @@ static void smp_ext_bitcall_others(ec_bit_sig sig)
402 } 402 }
403} 403}
404 404
405#ifndef CONFIG_ARCH_S390X 405#ifndef CONFIG_64BIT
406/* 406/*
407 * this function sends a 'purge tlb' signal to another CPU. 407 * this function sends a 'purge tlb' signal to another CPU.
408 */ 408 */
@@ -416,7 +416,7 @@ void smp_ptlb_all(void)
416 on_each_cpu(smp_ptlb_callback, NULL, 0, 1); 416 on_each_cpu(smp_ptlb_callback, NULL, 0, 1);
417} 417}
418EXPORT_SYMBOL(smp_ptlb_all); 418EXPORT_SYMBOL(smp_ptlb_all);
419#endif /* ! CONFIG_ARCH_S390X */ 419#endif /* ! CONFIG_64BIT */
420 420
421/* 421/*
422 * this function sends a 'reschedule' IPI to another CPU. 422 * this function sends a 'reschedule' IPI to another CPU.
@@ -783,7 +783,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
783 if (stack == 0ULL) 783 if (stack == 0ULL)
784 panic("smp_boot_cpus failed to allocate memory\n"); 784 panic("smp_boot_cpus failed to allocate memory\n");
785 lowcore_ptr[i]->panic_stack = stack + (PAGE_SIZE); 785 lowcore_ptr[i]->panic_stack = stack + (PAGE_SIZE);
786#ifndef __s390x__ 786#ifndef CONFIG_64BIT
787 if (MACHINE_HAS_IEEE) { 787 if (MACHINE_HAS_IEEE) {
788 lowcore_ptr[i]->extended_save_area_addr = 788 lowcore_ptr[i]->extended_save_area_addr =
789 (__u32) __get_free_pages(GFP_KERNEL,0); 789 (__u32) __get_free_pages(GFP_KERNEL,0);
@@ -793,7 +793,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
793 } 793 }
794#endif 794#endif
795 } 795 }
796#ifndef __s390x__ 796#ifndef CONFIG_64BIT
797 if (MACHINE_HAS_IEEE) 797 if (MACHINE_HAS_IEEE)
798 ctl_set_bit(14, 29); /* enable extended save area */ 798 ctl_set_bit(14, 29); /* enable extended save area */
799#endif 799#endif
diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c
index efe6b83b53f7..6a63553493c5 100644
--- a/arch/s390/kernel/sys_s390.c
+++ b/arch/s390/kernel/sys_s390.c
@@ -26,9 +26,7 @@
26#include <linux/mman.h> 26#include <linux/mman.h>
27#include <linux/file.h> 27#include <linux/file.h>
28#include <linux/utsname.h> 28#include <linux/utsname.h>
29#ifdef CONFIG_ARCH_S390X
30#include <linux/personality.h> 29#include <linux/personality.h>
31#endif /* CONFIG_ARCH_S390X */
32 30
33#include <asm/uaccess.h> 31#include <asm/uaccess.h>
34#include <asm/ipc.h> 32#include <asm/ipc.h>
@@ -121,7 +119,7 @@ out:
121 return error; 119 return error;
122} 120}
123 121
124#ifndef CONFIG_ARCH_S390X 122#ifndef CONFIG_64BIT
125struct sel_arg_struct { 123struct sel_arg_struct {
126 unsigned long n; 124 unsigned long n;
127 fd_set *inp, *outp, *exp; 125 fd_set *inp, *outp, *exp;
@@ -138,7 +136,7 @@ asmlinkage long old_select(struct sel_arg_struct __user *arg)
138 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); 136 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
139 137
140} 138}
141#endif /* CONFIG_ARCH_S390X */ 139#endif /* CONFIG_64BIT */
142 140
143/* 141/*
144 * sys_ipc() is the de-multiplexer for the SysV IPC calls.. 142 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
@@ -211,7 +209,7 @@ asmlinkage long sys_ipc(uint call, int first, unsigned long second,
211 return -EINVAL; 209 return -EINVAL;
212} 210}
213 211
214#ifdef CONFIG_ARCH_S390X 212#ifdef CONFIG_64BIT
215asmlinkage long s390x_newuname(struct new_utsname __user *name) 213asmlinkage long s390x_newuname(struct new_utsname __user *name)
216{ 214{
217 int ret = sys_newuname(name); 215 int ret = sys_newuname(name);
@@ -235,12 +233,12 @@ asmlinkage long s390x_personality(unsigned long personality)
235 233
236 return ret; 234 return ret;
237} 235}
238#endif /* CONFIG_ARCH_S390X */ 236#endif /* CONFIG_64BIT */
239 237
240/* 238/*
241 * Wrapper function for sys_fadvise64/fadvise64_64 239 * Wrapper function for sys_fadvise64/fadvise64_64
242 */ 240 */
243#ifndef CONFIG_ARCH_S390X 241#ifndef CONFIG_64BIT
244 242
245asmlinkage long 243asmlinkage long
246s390_fadvise64(int fd, u32 offset_high, u32 offset_low, size_t len, int advice) 244s390_fadvise64(int fd, u32 offset_high, u32 offset_low, size_t len, int advice)
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index c5bd36fae56b..95d109968619 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -67,13 +67,13 @@ extern pgm_check_handler_t do_monitor_call;
67 67
68#define stack_pointer ({ void **sp; asm("la %0,0(15)" : "=&d" (sp)); sp; }) 68#define stack_pointer ({ void **sp; asm("la %0,0(15)" : "=&d" (sp)); sp; })
69 69
70#ifndef CONFIG_ARCH_S390X 70#ifndef CONFIG_64BIT
71#define FOURLONG "%08lx %08lx %08lx %08lx\n" 71#define FOURLONG "%08lx %08lx %08lx %08lx\n"
72static int kstack_depth_to_print = 12; 72static int kstack_depth_to_print = 12;
73#else /* CONFIG_ARCH_S390X */ 73#else /* CONFIG_64BIT */
74#define FOURLONG "%016lx %016lx %016lx %016lx\n" 74#define FOURLONG "%016lx %016lx %016lx %016lx\n"
75static int kstack_depth_to_print = 20; 75static int kstack_depth_to_print = 20;
76#endif /* CONFIG_ARCH_S390X */ 76#endif /* CONFIG_64BIT */
77 77
78/* 78/*
79 * For show_trace we have three different stacks to consider: 79 * For show_trace we have three different stacks to consider:
@@ -702,12 +702,12 @@ void __init trap_init(void)
702 pgm_check_table[0x11] = &do_dat_exception; 702 pgm_check_table[0x11] = &do_dat_exception;
703 pgm_check_table[0x12] = &translation_exception; 703 pgm_check_table[0x12] = &translation_exception;
704 pgm_check_table[0x13] = &special_op_exception; 704 pgm_check_table[0x13] = &special_op_exception;
705#ifdef CONFIG_ARCH_S390X 705#ifdef CONFIG_64BIT
706 pgm_check_table[0x38] = &do_dat_exception; 706 pgm_check_table[0x38] = &do_dat_exception;
707 pgm_check_table[0x39] = &do_dat_exception; 707 pgm_check_table[0x39] = &do_dat_exception;
708 pgm_check_table[0x3A] = &do_dat_exception; 708 pgm_check_table[0x3A] = &do_dat_exception;
709 pgm_check_table[0x3B] = &do_dat_exception; 709 pgm_check_table[0x3B] = &do_dat_exception;
710#endif /* CONFIG_ARCH_S390X */ 710#endif /* CONFIG_64BIT */
711 pgm_check_table[0x15] = &operand_exception; 711 pgm_check_table[0x15] = &operand_exception;
712 pgm_check_table[0x1C] = &space_switch_exception; 712 pgm_check_table[0x1C] = &space_switch_exception;
713 pgm_check_table[0x1D] = &hfp_sqrt_exception; 713 pgm_check_table[0x1D] = &hfp_sqrt_exception;
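
For readers unfamiliar with the table extended above: pgm_check_table[] maps s390 program-interruption codes to handlers, and entries 0x38-0x3B are DAT exception codes that only exist in 64-bit addressing mode. Dispatch works roughly like this sketch (the real caller is the low-level assembly entry path; the handler signature follows the typedef visible at the top of the hunk):

    /* Sketch: route a program check to its registered handler. */
    static void example_program_check(struct pt_regs *regs, long intcode)
    {
            pgm_check_handler_t *handler = pgm_check_table[intcode & 0x7f];

            (*handler)(regs, intcode);
    }
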
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 89fdb3808bc0..9289face3027 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
5#include <asm-generic/vmlinux.lds.h> 5#include <asm-generic/vmlinux.lds.h>
6#include <linux/config.h> 6#include <linux/config.h>
7 7
8#ifndef CONFIG_ARCH_S390X 8#ifndef CONFIG_64BIT
9OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") 9OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
10OUTPUT_ARCH(s390) 10OUTPUT_ARCH(s390)
11ENTRY(_start) 11ENTRY(_start)
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index b701efa1f00e..d9b97b3c597f 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -4,6 +4,5 @@
4 4
5EXTRA_AFLAGS := -traditional 5EXTRA_AFLAGS := -traditional
6 6
7lib-y += delay.o string.o 7lib-y += delay.o string.o spinlock.o
8lib-$(CONFIG_ARCH_S390_31) += uaccess.o spinlock.o 8lib-y += $(if $(CONFIG_64BIT),uaccess64.o,uaccess.o)
9lib-$(CONFIG_ARCH_S390X) += uaccess64.o spinlock.o
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 2dc14e9c8327..68d79c502081 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -29,7 +29,7 @@ __setup("spin_retry=", spin_retry_setup);
29static inline void 29static inline void
30_diag44(void) 30_diag44(void)
31{ 31{
32#ifdef __s390x__ 32#ifdef CONFIG_64BIT
33 if (MACHINE_HAS_DIAG44) 33 if (MACHINE_HAS_DIAG44)
34#endif 34#endif
35 asm volatile("diag 0,0,0x44"); 35 asm volatile("diag 0,0,0x44");
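
_diag44() above wraps the z/VM "voluntary time slice end" instruction; the s390 spinlock code calls it to yield the virtual CPU after spinning spin_retry times rather than burning a whole time slice on a held lock. A simplified sketch of that retry loop (the real code uses compare-and-swap inline assembly, so atomic_cmpxchg() here is a stand-in):

    static void example_spin_wait(atomic_t *lock)
    {
            int count = spin_retry;

            while (atomic_cmpxchg(lock, 0, 1) != 0) {
                    if (--count > 0)
                            continue;
                    _diag44();          /* give the time slice back */
                    count = spin_retry; /* then start a fresh round */
            }
    }
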
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 506a33b51e4f..a9566bcab682 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -143,7 +143,7 @@ dcss_diag (__u8 func, void *parameter,
143 rx = (unsigned long) parameter; 143 rx = (unsigned long) parameter;
144 ry = (unsigned long) func; 144 ry = (unsigned long) func;
145 __asm__ __volatile__( 145 __asm__ __volatile__(
146#ifdef CONFIG_ARCH_S390X 146#ifdef CONFIG_64BIT
147 " sam31\n" // switch to 31 bit 147 " sam31\n" // switch to 31 bit
148 " diag %0,%1,0x64\n" 148 " diag %0,%1,0x64\n"
149 " sam64\n" // switch back to 64 bit 149 " sam64\n" // switch back to 64 bit
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index fb2607c369ed..81ade401b073 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -31,17 +31,17 @@
31#include <asm/uaccess.h> 31#include <asm/uaccess.h>
32#include <asm/pgtable.h> 32#include <asm/pgtable.h>
33 33
34#ifndef CONFIG_ARCH_S390X 34#ifndef CONFIG_64BIT
35#define __FAIL_ADDR_MASK 0x7ffff000 35#define __FAIL_ADDR_MASK 0x7ffff000
36#define __FIXUP_MASK 0x7fffffff 36#define __FIXUP_MASK 0x7fffffff
37#define __SUBCODE_MASK 0x0200 37#define __SUBCODE_MASK 0x0200
38#define __PF_RES_FIELD 0ULL 38#define __PF_RES_FIELD 0ULL
39#else /* CONFIG_ARCH_S390X */ 39#else /* CONFIG_64BIT */
40#define __FAIL_ADDR_MASK -4096L 40#define __FAIL_ADDR_MASK -4096L
41#define __FIXUP_MASK ~0L 41#define __FIXUP_MASK ~0L
42#define __SUBCODE_MASK 0x0600 42#define __SUBCODE_MASK 0x0600
43#define __PF_RES_FIELD 0x8000000000000000ULL 43#define __PF_RES_FIELD 0x8000000000000000ULL
44#endif /* CONFIG_ARCH_S390X */ 44#endif /* CONFIG_64BIT */
45 45
46#ifdef CONFIG_SYSCTL 46#ifdef CONFIG_SYSCTL
47extern int sysctl_userprocess_debug; 47extern int sysctl_userprocess_debug;
@@ -393,11 +393,11 @@ int pfault_init(void)
393 "2:\n" 393 "2:\n"
394 ".section __ex_table,\"a\"\n" 394 ".section __ex_table,\"a\"\n"
395 " .align 4\n" 395 " .align 4\n"
396#ifndef CONFIG_ARCH_S390X 396#ifndef CONFIG_64BIT
397 " .long 0b,1b\n" 397 " .long 0b,1b\n"
398#else /* CONFIG_ARCH_S390X */ 398#else /* CONFIG_64BIT */
399 " .quad 0b,1b\n" 399 " .quad 0b,1b\n"
400#endif /* CONFIG_ARCH_S390X */ 400#endif /* CONFIG_64BIT */
401 ".previous" 401 ".previous"
402 : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc" ); 402 : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc" );
403 __ctl_set_bit(0, 9); 403 __ctl_set_bit(0, 9);
@@ -417,11 +417,11 @@ void pfault_fini(void)
417 "0:\n" 417 "0:\n"
418 ".section __ex_table,\"a\"\n" 418 ".section __ex_table,\"a\"\n"
419 " .align 4\n" 419 " .align 4\n"
420#ifndef CONFIG_ARCH_S390X 420#ifndef CONFIG_64BIT
421 " .long 0b,0b\n" 421 " .long 0b,0b\n"
422#else /* CONFIG_ARCH_S390X */ 422#else /* CONFIG_64BIT */
423 " .quad 0b,0b\n" 423 " .quad 0b,0b\n"
424#endif /* CONFIG_ARCH_S390X */ 424#endif /* CONFIG_64BIT */
425 ".previous" 425 ".previous"
426 : : "a" (&refbk), "m" (refbk) : "cc" ); 426 : : "a" (&refbk), "m" (refbk) : "cc" );
427} 427}
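
The per-width masks above are consumed a little further down in the fault handler: the faulting address is extracted from the lowcore translation-exception code with __FAIL_ADDR_MASK, roughly as in this sketch (the lowcore field name is recalled from the asm/lowcore.h of the period, so treat it as an assumption):

    /* Sketch: recover the faulting address after a DAT exception. */
    static unsigned long example_fault_address(void)
    {
            return S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK;
    }
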
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6ec5cd981e74..df953383724d 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -44,7 +44,7 @@ void diag10(unsigned long addr)
44{ 44{
45 if (addr >= 0x7ff00000) 45 if (addr >= 0x7ff00000)
46 return; 46 return;
47#ifdef __s390x__ 47#ifdef CONFIG_64BIT
48 asm volatile ( 48 asm volatile (
49 " sam31\n" 49 " sam31\n"
50 " diag %0,%0,0x10\n" 50 " diag %0,%0,0x10\n"
@@ -106,7 +106,7 @@ extern unsigned long __initdata zholes_size[];
106 * paging_init() sets up the page tables 106 * paging_init() sets up the page tables
107 */ 107 */
108 108
109#ifndef CONFIG_ARCH_S390X 109#ifndef CONFIG_64BIT
110void __init paging_init(void) 110void __init paging_init(void)
111{ 111{
112 pgd_t * pg_dir; 112 pgd_t * pg_dir;
@@ -175,7 +175,7 @@ void __init paging_init(void)
175 return; 175 return;
176} 176}
177 177
178#else /* CONFIG_ARCH_S390X */ 178#else /* CONFIG_64BIT */
179void __init paging_init(void) 179void __init paging_init(void)
180{ 180{
181 pgd_t * pg_dir; 181 pgd_t * pg_dir;
@@ -256,7 +256,7 @@ void __init paging_init(void)
256 256
257 return; 257 return;
258} 258}
259#endif /* CONFIG_ARCH_S390X */ 259#endif /* CONFIG_64BIT */
260 260
261void __init mem_init(void) 261void __init mem_init(void)
262{ 262{
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index fb187e5a54b4..356257c171de 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -50,7 +50,7 @@ static inline unsigned long mmap_base(void)
50 50
51static inline int mmap_is_legacy(void) 51static inline int mmap_is_legacy(void)
52{ 52{
53#ifdef CONFIG_ARCH_S390X 53#ifdef CONFIG_64BIT
54 /* 54 /*
55 * Force standard allocation for 64 bit programs. 55 * Force standard allocation for 64 bit programs.
56 */ 56 */
diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile
index ec349276258a..537b2d840e69 100644
--- a/arch/s390/oprofile/Makefile
+++ b/arch/s390/oprofile/Makefile
@@ -6,4 +6,4 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
6 oprofilefs.o oprofile_stats.o \ 6 oprofilefs.o oprofile_stats.o \
7 timer_int.o ) 7 timer_int.o )
8 8
9oprofile-y := $(DRIVER_OBJS) init.o 9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
diff --git a/arch/s390/oprofile/backtrace.c b/arch/s390/oprofile/backtrace.c
new file mode 100644
index 000000000000..bc4b84a35cad
--- /dev/null
+++ b/arch/s390/oprofile/backtrace.c
@@ -0,0 +1,79 @@
1/**
2 * arch/s390/oprofile/backtrace.c
3 *
4 * S390 Version
5 * Copyright (C) 2005 IBM Corporation, IBM Deutschland Entwicklung GmbH.
6 * Author(s): Andreas Krebbel <Andreas.Krebbel@de.ibm.com>
7 */
8
9#include <linux/oprofile.h>
10
11#include <asm/processor.h> /* for struct stack_frame */
12
13static unsigned long
14__show_trace(unsigned int *depth, unsigned long sp,
15 unsigned long low, unsigned long high)
16{
17 struct stack_frame *sf;
18 struct pt_regs *regs;
19
20 while (*depth) {
21 sp = sp & PSW_ADDR_INSN;
22 if (sp < low || sp > high - sizeof(*sf))
23 return sp;
24 sf = (struct stack_frame *) sp;
25 (*depth)--;
26 oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
27
28 /* Follow the backchain. */
29 while (*depth) {
30 low = sp;
31 sp = sf->back_chain & PSW_ADDR_INSN;
32 if (!sp)
33 break;
34 if (sp <= low || sp > high - sizeof(*sf))
35 return sp;
36 sf = (struct stack_frame *) sp;
37 (*depth)--;
38 oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
39
40 }
41
42 if (*depth == 0)
43 break;
44
45 /* Zero backchain detected, check for interrupt frame. */
46 sp = (unsigned long) (sf + 1);
47 if (sp <= low || sp > high - sizeof(*regs))
48 return sp;
49 regs = (struct pt_regs *) sp;
50 (*depth)--;
51 oprofile_add_trace(sf->gprs[8] & PSW_ADDR_INSN);
52 low = sp;
53 sp = regs->gprs[15];
54 }
55 return sp;
56}
57
58void s390_backtrace(struct pt_regs * const regs, unsigned int depth)
59{
60 unsigned long head;
61 struct stack_frame* head_sf;
62
63 if (user_mode (regs))
64 return;
65
66 head = regs->gprs[15];
67 head_sf = (struct stack_frame*)head;
68
69 if (!head_sf->back_chain)
70 return;
71
72 head = head_sf->back_chain;
73
74 head = __show_trace(&depth, head, S390_lowcore.async_stack - ASYNC_SIZE,
75 S390_lowcore.async_stack);
76
77 __show_trace(&depth, head, S390_lowcore.thread_info,
78 S390_lowcore.thread_info + THREAD_SIZE);
79}
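
The walker above relies on the s390 ABI convention that register 15 points at a frame whose first word links to the caller's frame. The struct stack_frame it casts to comes from <asm/processor.h>; its layout is roughly the following (padding sizes hedged from memory, the essential parts being that back_chain is 0 at the oldest frame and gprs[8] is the saved r14, i.e. the return address fed to oprofile_add_trace()):

    struct stack_frame {
            unsigned long back_chain;  /* caller's frame; 0 ends the chain */
            unsigned long empty1[5];
            unsigned long gprs[10];    /* saved r6..r15; gprs[8] == r14 */
            unsigned int  empty2[8];
    };
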
diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
index a65ead0e200a..7a995113b918 100644
--- a/arch/s390/oprofile/init.c
+++ b/arch/s390/oprofile/init.c
@@ -12,8 +12,12 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14 14
15
16extern void s390_backtrace(struct pt_regs * const regs, unsigned int depth);
17
15int __init oprofile_arch_init(struct oprofile_operations* ops) 18int __init oprofile_arch_init(struct oprofile_operations* ops)
16{ 19{
20 ops->backtrace = s390_backtrace;
17 return -ENODEV; 21 return -ENODEV;
18} 22}
19 23
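
With ops->backtrace set, the generic oprofile layer calls the hook after logging each sample whenever a non-zero backtrace depth is configured; note that oprofile_arch_init() still returns -ENODEV, so on s390 the callback only ever runs under the fallback timer profiler. In outline (a sketch of the core's behavior, not the verbatim drivers/oprofile code):

    static void example_log_sample(struct pt_regs *regs, int event,
                                   unsigned int depth)
    {
            oprofile_add_sample(regs, event);            /* the sampled PC */
            if (depth)
                    oprofile_ops.backtrace(regs, depth); /* then its callers */
    }
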
diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index 5b58fad45290..cd13b91b9ff6 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -1,4 +1,4 @@
1/* 1/*
2 * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) 2 * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
3 * Licensed under the GPL 3 * Licensed under the GPL
4 */ 4 */
@@ -58,7 +58,7 @@ static void *not_configged_init(char *str, int device, struct chan_opts *opts)
58{ 58{
59 my_puts("Using a channel type which is configured out of " 59 my_puts("Using a channel type which is configured out of "
60 "UML\n"); 60 "UML\n");
61 return(NULL); 61 return NULL;
62} 62}
63 63
64static int not_configged_open(int input, int output, int primary, void *data, 64static int not_configged_open(int input, int output, int primary, void *data,
@@ -66,7 +66,7 @@ static int not_configged_open(int input, int output, int primary, void *data,
66{ 66{
67 my_puts("Using a channel type which is configured out of " 67 my_puts("Using a channel type which is configured out of "
68 "UML\n"); 68 "UML\n");
69 return(-ENODEV); 69 return -ENODEV;
70} 70}
71 71
72static void not_configged_close(int fd, void *data) 72static void not_configged_close(int fd, void *data)
@@ -79,21 +79,21 @@ static int not_configged_read(int fd, char *c_out, void *data)
79{ 79{
80 my_puts("Using a channel type which is configured out of " 80 my_puts("Using a channel type which is configured out of "
81 "UML\n"); 81 "UML\n");
82 return(-EIO); 82 return -EIO;
83} 83}
84 84
85static int not_configged_write(int fd, const char *buf, int len, void *data) 85static int not_configged_write(int fd, const char *buf, int len, void *data)
86{ 86{
87 my_puts("Using a channel type which is configured out of " 87 my_puts("Using a channel type which is configured out of "
88 "UML\n"); 88 "UML\n");
89 return(-EIO); 89 return -EIO;
90} 90}
91 91
92static int not_configged_console_write(int fd, const char *buf, int len) 92static int not_configged_console_write(int fd, const char *buf, int len)
93{ 93{
94 my_puts("Using a channel type which is configured out of " 94 my_puts("Using a channel type which is configured out of "
95 "UML\n"); 95 "UML\n");
96 return(-EIO); 96 return -EIO;
97} 97}
98 98
99static int not_configged_window_size(int fd, void *data, unsigned short *rows, 99static int not_configged_window_size(int fd, void *data, unsigned short *rows,
@@ -101,7 +101,7 @@ static int not_configged_window_size(int fd, void *data, unsigned short *rows,
101{ 101{
102 my_puts("Using a channel type which is configured out of " 102 my_puts("Using a channel type which is configured out of "
103 "UML\n"); 103 "UML\n");
104 return(-ENODEV); 104 return -ENODEV;
105} 105}
106 106
107static void not_configged_free(void *data) 107static void not_configged_free(void *data)
@@ -135,17 +135,17 @@ int generic_read(int fd, char *c_out, void *unused)
135 n = os_read_file(fd, c_out, sizeof(*c_out)); 135 n = os_read_file(fd, c_out, sizeof(*c_out));
136 136
137 if(n == -EAGAIN) 137 if(n == -EAGAIN)
138 return(0); 138 return 0;
139 else if(n == 0) 139 else if(n == 0)
140 return(-EIO); 140 return -EIO;
141 return(n); 141 return n;
142} 142}
143 143
144/* XXX Trivial wrapper around os_write_file */ 144/* XXX Trivial wrapper around os_write_file */
145 145
146int generic_write(int fd, const char *buf, int n, void *unused) 146int generic_write(int fd, const char *buf, int n, void *unused)
147{ 147{
148 return(os_write_file(fd, buf, n)); 148 return os_write_file(fd, buf, n);
149} 149}
150 150
151int generic_window_size(int fd, void *unused, unsigned short *rows_out, 151int generic_window_size(int fd, void *unused, unsigned short *rows_out,
@@ -156,14 +156,14 @@ int generic_window_size(int fd, void *unused, unsigned short *rows_out,
156 156
157 ret = os_window_size(fd, &rows, &cols); 157 ret = os_window_size(fd, &rows, &cols);
158 if(ret < 0) 158 if(ret < 0)
159 return(ret); 159 return ret;
160 160
161 ret = ((*rows_out != rows) || (*cols_out != cols)); 161 ret = ((*rows_out != rows) || (*cols_out != cols));
162 162
163 *rows_out = rows; 163 *rows_out = rows;
164 *cols_out = cols; 164 *cols_out = cols;
165 165
166 return(ret); 166 return ret;
167} 167}
168 168
169void generic_free(void *data) 169void generic_free(void *data)
@@ -186,25 +186,29 @@ static void tty_receive_char(struct tty_struct *tty, char ch)
186 } 186 }
187 } 187 }
188 188
189 if((tty->flip.flag_buf_ptr == NULL) || 189 if((tty->flip.flag_buf_ptr == NULL) ||
190 (tty->flip.char_buf_ptr == NULL)) 190 (tty->flip.char_buf_ptr == NULL))
191 return; 191 return;
192 tty_insert_flip_char(tty, ch, TTY_NORMAL); 192 tty_insert_flip_char(tty, ch, TTY_NORMAL);
193} 193}
194 194
195static int open_one_chan(struct chan *chan, int input, int output, int primary) 195static int open_one_chan(struct chan *chan)
196{ 196{
197 int fd; 197 int fd;
198 198
199 if(chan->opened) return(0); 199 if(chan->opened)
200 if(chan->ops->open == NULL) fd = 0; 200 return 0;
201 else fd = (*chan->ops->open)(input, output, primary, chan->data, 201
202 &chan->dev); 202 if(chan->ops->open == NULL)
203 if(fd < 0) return(fd); 203 fd = 0;
204 else fd = (*chan->ops->open)(chan->input, chan->output, chan->primary,
205 chan->data, &chan->dev);
206 if(fd < 0)
207 return fd;
204 chan->fd = fd; 208 chan->fd = fd;
205 209
206 chan->opened = 1; 210 chan->opened = 1;
207 return(0); 211 return 0;
208} 212}
209 213
210int open_chan(struct list_head *chans) 214int open_chan(struct list_head *chans)
@@ -215,11 +219,11 @@ int open_chan(struct list_head *chans)
215 219
216 list_for_each(ele, chans){ 220 list_for_each(ele, chans){
217 chan = list_entry(ele, struct chan, list); 221 chan = list_entry(ele, struct chan, list);
218 ret = open_one_chan(chan, chan->input, chan->output, 222 ret = open_one_chan(chan);
219 chan->primary); 223 if(chan->primary)
220 if(chan->primary) err = ret; 224 err = ret;
221 } 225 }
222 return(err); 226 return err;
223} 227}
224 228
225void chan_enable_winch(struct list_head *chans, struct tty_struct *tty) 229void chan_enable_winch(struct list_head *chans, struct tty_struct *tty)
@@ -236,20 +240,65 @@ void chan_enable_winch(struct list_head *chans, struct tty_struct *tty)
236 } 240 }
237} 241}
238 242
239void enable_chan(struct list_head *chans, struct tty_struct *tty) 243void enable_chan(struct line *line)
240{ 244{
241 struct list_head *ele; 245 struct list_head *ele;
242 struct chan *chan; 246 struct chan *chan;
243 247
244 list_for_each(ele, chans){ 248 list_for_each(ele, &line->chan_list){
245 chan = list_entry(ele, struct chan, list); 249 chan = list_entry(ele, struct chan, list);
246 if(!chan->opened) continue; 250 if(open_one_chan(chan))
251 continue;
252
253 if(chan->enabled)
254 continue;
255 line_setup_irq(chan->fd, chan->input, chan->output, line,
256 chan);
257 chan->enabled = 1;
258 }
259}
260
261static LIST_HEAD(irqs_to_free);
262
263void free_irqs(void)
264{
265 struct chan *chan;
266
267 while(!list_empty(&irqs_to_free)){
268 chan = list_entry(irqs_to_free.next, struct chan, free_list);
269 list_del(&chan->free_list);
247 270
248 line_setup_irq(chan->fd, chan->input, chan->output, tty); 271 if(chan->input)
272 free_irq(chan->line->driver->read_irq, chan);
273 if(chan->output)
274 free_irq(chan->line->driver->write_irq, chan);
275 chan->enabled = 0;
276 }
277}
278
279static void close_one_chan(struct chan *chan, int delay_free_irq)
280{
281 if(!chan->opened)
282 return;
283
284 if(delay_free_irq){
285 list_add(&chan->free_list, &irqs_to_free);
249 } 286 }
287 else {
288 if(chan->input)
289 free_irq(chan->line->driver->read_irq, chan);
290 if(chan->output)
291 free_irq(chan->line->driver->write_irq, chan);
292 chan->enabled = 0;
293 }
294 if(chan->ops->close != NULL)
295 (*chan->ops->close)(chan->fd, chan->data);
296
297 chan->opened = 0;
298 chan->fd = -1;
250} 299}
251 300
252void close_chan(struct list_head *chans) 301void close_chan(struct list_head *chans, int delay_free_irq)
253{ 302{
254 struct chan *chan; 303 struct chan *chan;
255 304
@@ -259,15 +308,37 @@ void close_chan(struct list_head *chans)
259 * so it must be the last closed. 308 * so it must be the last closed.
260 */ 309 */
261 list_for_each_entry_reverse(chan, chans, list) { 310 list_for_each_entry_reverse(chan, chans, list) {
262 if(!chan->opened) continue; 311 close_one_chan(chan, delay_free_irq);
263 if(chan->ops->close != NULL) 312 }
264 (*chan->ops->close)(chan->fd, chan->data); 313}
265 chan->opened = 0; 314
266 chan->fd = -1; 315void deactivate_chan(struct list_head *chans, int irq)
316{
317 struct list_head *ele;
318
319 struct chan *chan;
320 list_for_each(ele, chans) {
321 chan = list_entry(ele, struct chan, list);
322
323 if(chan->enabled && chan->input)
324 deactivate_fd(chan->fd, irq);
325 }
326}
327
328void reactivate_chan(struct list_head *chans, int irq)
329{
330 struct list_head *ele;
331 struct chan *chan;
332
333 list_for_each(ele, chans) {
334 chan = list_entry(ele, struct chan, list);
335
336 if(chan->enabled && chan->input)
337 reactivate_fd(chan->fd, irq);
267 } 338 }
268} 339}
269 340
270int write_chan(struct list_head *chans, const char *buf, int len, 341int write_chan(struct list_head *chans, const char *buf, int len,
271 int write_irq) 342 int write_irq)
272{ 343{
273 struct list_head *ele; 344 struct list_head *ele;
@@ -285,7 +356,7 @@ int write_chan(struct list_head *chans, const char *buf, int len,
285 reactivate_fd(chan->fd, write_irq); 356 reactivate_fd(chan->fd, write_irq);
286 } 357 }
287 } 358 }
288 return(ret); 359 return ret;
289} 360}
290 361
291int console_write_chan(struct list_head *chans, const char *buf, int len) 362int console_write_chan(struct list_head *chans, const char *buf, int len)
@@ -301,19 +372,18 @@ int console_write_chan(struct list_head *chans, const char *buf, int len)
301 n = chan->ops->console_write(chan->fd, buf, len); 372 n = chan->ops->console_write(chan->fd, buf, len);
302 if(chan->primary) ret = n; 373 if(chan->primary) ret = n;
303 } 374 }
304 return(ret); 375 return ret;
305} 376}
306 377
307int console_open_chan(struct line *line, struct console *co, struct chan_opts *opts) 378int console_open_chan(struct line *line, struct console *co,
379 struct chan_opts *opts)
308{ 380{
309 if (!list_empty(&line->chan_list)) 381 int err;
310 return 0; 382
383 err = open_chan(&line->chan_list);
384 if(err)
385 return err;
311 386
312 if (0 != parse_chan_pair(line->init_str, &line->chan_list,
313 line->init_pri, co->index, opts))
314 return -1;
315 if (0 != open_chan(&line->chan_list))
316 return -1;
317 printk("Console initialized on /dev/%s%d\n",co->name,co->index); 387 printk("Console initialized on /dev/%s%d\n",co->name,co->index);
318 return 0; 388 return 0;
319} 389}
@@ -327,32 +397,36 @@ int chan_window_size(struct list_head *chans, unsigned short *rows_out,
327 list_for_each(ele, chans){ 397 list_for_each(ele, chans){
328 chan = list_entry(ele, struct chan, list); 398 chan = list_entry(ele, struct chan, list);
329 if(chan->primary){ 399 if(chan->primary){
330 if(chan->ops->window_size == NULL) return(0); 400 if(chan->ops->window_size == NULL)
331 return(chan->ops->window_size(chan->fd, chan->data, 401 return 0;
332 rows_out, cols_out)); 402 return chan->ops->window_size(chan->fd, chan->data,
403 rows_out, cols_out);
333 } 404 }
334 } 405 }
335 return(0); 406 return 0;
336} 407}
337 408
338void free_one_chan(struct chan *chan) 409void free_one_chan(struct chan *chan, int delay_free_irq)
339{ 410{
340 list_del(&chan->list); 411 list_del(&chan->list);
412
413 close_one_chan(chan, delay_free_irq);
414
341 if(chan->ops->free != NULL) 415 if(chan->ops->free != NULL)
342 (*chan->ops->free)(chan->data); 416 (*chan->ops->free)(chan->data);
343 free_irq_by_fd(chan->fd); 417
344 if(chan->primary && chan->output) ignore_sigio_fd(chan->fd); 418 if(chan->primary && chan->output) ignore_sigio_fd(chan->fd);
345 kfree(chan); 419 kfree(chan);
346} 420}
347 421
348void free_chan(struct list_head *chans) 422void free_chan(struct list_head *chans, int delay_free_irq)
349{ 423{
350 struct list_head *ele, *next; 424 struct list_head *ele, *next;
351 struct chan *chan; 425 struct chan *chan;
352 426
353 list_for_each_safe(ele, next, chans){ 427 list_for_each_safe(ele, next, chans){
354 chan = list_entry(ele, struct chan, list); 428 chan = list_entry(ele, struct chan, list);
355 free_one_chan(chan); 429 free_one_chan(chan, delay_free_irq);
356 } 430 }
357} 431}
358 432
@@ -363,23 +437,23 @@ static int one_chan_config_string(struct chan *chan, char *str, int size,
363 437
364 if(chan == NULL){ 438 if(chan == NULL){
365 CONFIG_CHUNK(str, size, n, "none", 1); 439 CONFIG_CHUNK(str, size, n, "none", 1);
366 return(n); 440 return n;
367 } 441 }
368 442
369 CONFIG_CHUNK(str, size, n, chan->ops->type, 0); 443 CONFIG_CHUNK(str, size, n, chan->ops->type, 0);
370 444
371 if(chan->dev == NULL){ 445 if(chan->dev == NULL){
372 CONFIG_CHUNK(str, size, n, "", 1); 446 CONFIG_CHUNK(str, size, n, "", 1);
373 return(n); 447 return n;
374 } 448 }
375 449
376 CONFIG_CHUNK(str, size, n, ":", 0); 450 CONFIG_CHUNK(str, size, n, ":", 0);
377 CONFIG_CHUNK(str, size, n, chan->dev, 0); 451 CONFIG_CHUNK(str, size, n, chan->dev, 0);
378 452
379 return(n); 453 return n;
380} 454}
381 455
382static int chan_pair_config_string(struct chan *in, struct chan *out, 456static int chan_pair_config_string(struct chan *in, struct chan *out,
383 char *str, int size, char **error_out) 457 char *str, int size, char **error_out)
384{ 458{
385 int n; 459 int n;
@@ -390,7 +464,7 @@ static int chan_pair_config_string(struct chan *in, struct chan *out,
390 464
391 if(in == out){ 465 if(in == out){
392 CONFIG_CHUNK(str, size, n, "", 1); 466 CONFIG_CHUNK(str, size, n, "", 1);
393 return(n); 467 return n;
394 } 468 }
395 469
396 CONFIG_CHUNK(str, size, n, ",", 1); 470 CONFIG_CHUNK(str, size, n, ",", 1);
@@ -399,10 +473,10 @@ static int chan_pair_config_string(struct chan *in, struct chan *out,
399 size -= n; 473 size -= n;
400 CONFIG_CHUNK(str, size, n, "", 1); 474 CONFIG_CHUNK(str, size, n, "", 1);
401 475
402 return(n); 476 return n;
403} 477}
404 478
405int chan_config_string(struct list_head *chans, char *str, int size, 479int chan_config_string(struct list_head *chans, char *str, int size,
406 char **error_out) 480 char **error_out)
407{ 481{
408 struct list_head *ele; 482 struct list_head *ele;
@@ -418,7 +492,7 @@ int chan_config_string(struct list_head *chans, char *str, int size,
418 out = chan; 492 out = chan;
419 } 493 }
420 494
421 return(chan_pair_config_string(in, out, str, size, error_out)); 495 return chan_pair_config_string(in, out, str, size, error_out);
422} 496}
423 497
424struct chan_type { 498struct chan_type {
@@ -462,7 +536,7 @@ struct chan_type chan_table[] = {
462#endif 536#endif
463}; 537};
464 538
465static struct chan *parse_chan(char *str, int pri, int device, 539static struct chan *parse_chan(struct line *line, char *str, int device,
466 struct chan_opts *opts) 540 struct chan_opts *opts)
467{ 541{
468 struct chan_type *entry; 542 struct chan_type *entry;
@@ -484,36 +558,42 @@ static struct chan *parse_chan(char *str, int pri, int device,
484 if(ops == NULL){ 558 if(ops == NULL){
485 my_printf("parse_chan couldn't parse \"%s\"\n", 559 my_printf("parse_chan couldn't parse \"%s\"\n",
486 str); 560 str);
487 return(NULL); 561 return NULL;
488 } 562 }
489 if(ops->init == NULL) return(NULL); 563 if(ops->init == NULL)
564 return NULL;
490 data = (*ops->init)(str, device, opts); 565 data = (*ops->init)(str, device, opts);
491 if(data == NULL) return(NULL); 566 if(data == NULL)
567 return NULL;
492 568
493 chan = kmalloc(sizeof(*chan), GFP_ATOMIC); 569 chan = kmalloc(sizeof(*chan), GFP_ATOMIC);
494 if(chan == NULL) return(NULL); 570 if(chan == NULL)
571 return NULL;
495 *chan = ((struct chan) { .list = LIST_HEAD_INIT(chan->list), 572 *chan = ((struct chan) { .list = LIST_HEAD_INIT(chan->list),
573 .free_list =
574 LIST_HEAD_INIT(chan->free_list),
575 .line = line,
496 .primary = 1, 576 .primary = 1,
497 .input = 0, 577 .input = 0,
498 .output = 0, 578 .output = 0,
499 .opened = 0, 579 .opened = 0,
580 .enabled = 0,
500 .fd = -1, 581 .fd = -1,
501 .pri = pri,
502 .ops = ops, 582 .ops = ops,
503 .data = data }); 583 .data = data });
504 return(chan); 584 return chan;
505} 585}
506 586
507int parse_chan_pair(char *str, struct list_head *chans, int pri, int device, 587int parse_chan_pair(char *str, struct line *line, int device,
508 struct chan_opts *opts) 588 struct chan_opts *opts)
509{ 589{
590 struct list_head *chans = &line->chan_list;
510 struct chan *new, *chan; 591 struct chan *new, *chan;
511 char *in, *out; 592 char *in, *out;
512 593
513 if(!list_empty(chans)){ 594 if(!list_empty(chans)){
514 chan = list_entry(chans->next, struct chan, list); 595 chan = list_entry(chans->next, struct chan, list);
515 if(chan->pri >= pri) return(0); 596 free_chan(chans, 0);
516 free_chan(chans);
517 INIT_LIST_HEAD(chans); 597 INIT_LIST_HEAD(chans);
518 } 598 }
519 599
@@ -522,24 +602,30 @@ int parse_chan_pair(char *str, struct list_head *chans, int pri, int device,
522 in = str; 602 in = str;
523 *out = '\0'; 603 *out = '\0';
524 out++; 604 out++;
525 new = parse_chan(in, pri, device, opts); 605 new = parse_chan(line, in, device, opts);
526 if(new == NULL) return(-1); 606 if(new == NULL)
607 return -1;
608
527 new->input = 1; 609 new->input = 1;
528 list_add(&new->list, chans); 610 list_add(&new->list, chans);
529 611
530 new = parse_chan(out, pri, device, opts); 612 new = parse_chan(line, out, device, opts);
531 if(new == NULL) return(-1); 613 if(new == NULL)
614 return -1;
615
532 list_add(&new->list, chans); 616 list_add(&new->list, chans);
533 new->output = 1; 617 new->output = 1;
534 } 618 }
535 else { 619 else {
536 new = parse_chan(str, pri, device, opts); 620 new = parse_chan(line, str, device, opts);
537 if(new == NULL) return(-1); 621 if(new == NULL)
622 return -1;
623
538 list_add(&new->list, chans); 624 list_add(&new->list, chans);
539 new->input = 1; 625 new->input = 1;
540 new->output = 1; 626 new->output = 1;
541 } 627 }
542 return(0); 628 return 0;
543} 629}
544 630
545int chan_out_fd(struct list_head *chans) 631int chan_out_fd(struct list_head *chans)
@@ -550,9 +636,9 @@ int chan_out_fd(struct list_head *chans)
550 list_for_each(ele, chans){ 636 list_for_each(ele, chans){
551 chan = list_entry(ele, struct chan, list); 637 chan = list_entry(ele, struct chan, list);
552 if(chan->primary && chan->output) 638 if(chan->primary && chan->output)
553 return(chan->fd); 639 return chan->fd;
554 } 640 }
555 return(-1); 641 return -1;
556} 642}
557 643
558void chan_interrupt(struct list_head *chans, struct work_struct *task, 644void chan_interrupt(struct list_head *chans, struct work_struct *task,
@@ -567,9 +653,9 @@ void chan_interrupt(struct list_head *chans, struct work_struct *task,
567 chan = list_entry(ele, struct chan, list); 653 chan = list_entry(ele, struct chan, list);
568 if(!chan->input || (chan->ops->read == NULL)) continue; 654 if(!chan->input || (chan->ops->read == NULL)) continue;
569 do { 655 do {
570 if((tty != NULL) && 656 if((tty != NULL) &&
571 (tty->flip.count >= TTY_FLIPBUF_SIZE)){ 657 (tty->flip.count >= TTY_FLIPBUF_SIZE)){
572 schedule_work(task); 658 schedule_delayed_work(task, 1);
573 goto out; 659 goto out;
574 } 660 }
575 err = chan->ops->read(chan->fd, &c, chan->data); 661 err = chan->ops->read(chan->fd, &c, chan->data);
@@ -582,29 +668,12 @@ void chan_interrupt(struct list_head *chans, struct work_struct *task,
582 if(chan->primary){ 668 if(chan->primary){
583 if(tty != NULL) 669 if(tty != NULL)
584 tty_hangup(tty); 670 tty_hangup(tty);
585 line_disable(tty, irq); 671 close_chan(chans, 1);
586 close_chan(chans);
587 free_chan(chans);
588 return; 672 return;
589 } 673 }
590 else { 674 else close_one_chan(chan, 1);
591 if(chan->ops->close != NULL)
592 chan->ops->close(chan->fd, chan->data);
593 free_one_chan(chan);
594 }
595 } 675 }
596 } 676 }
597 out: 677 out:
598 if(tty) tty_flip_buffer_push(tty); 678 if(tty) tty_flip_buffer_push(tty);
599} 679}
600
601/*
602 * Overrides for Emacs so that we follow Linus's tabbing style.
603 * Emacs will notice this stuff at the end of the file and automatically
604 * adjust the settings for this buffer only. This must remain at the end
605 * of the file.
606 * ---------------------------------------------------------------------------
607 * Local variables:
608 * c-file-style: "linux"
609 * End:
610 */
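
The delay_free_irq flag threaded through close_one_chan() and close_chan() above exists because a channel can be torn down from inside its own interrupt handler, where calling free_irq() on the running IRQ would deadlock; such channels are parked on irqs_to_free and reaped later by free_irqs() from a safe context. The shape of the pattern in isolation (hypothetical names, fields as in the diff):

    static LIST_HEAD(example_pending);

    /* Interrupt context: defer the teardown instead of freeing. */
    static void example_close_deferred(struct chan *chan)
    {
            list_add(&chan->free_list, &example_pending);
    }

    /* Process context: free_irq() is safe here. */
    static void example_reap(void)
    {
            struct chan *chan;

            while (!list_empty(&example_pending)) {
                    chan = list_entry(example_pending.next, struct chan,
                                      free_list);
                    list_del(&chan->free_list);
                    if (chan->input)
                            free_irq(chan->line->driver->read_irq, chan);
                    if (chan->output)
                            free_irq(chan->line->driver->write_irq, chan);
                    chan->enabled = 0;
            }
    }
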
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index e0fdffa2d542..46ceb25a9959 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -1,4 +1,4 @@
1/* 1/*
2 * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) 2 * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
3 * Licensed under the GPL 3 * Licensed under the GPL
4 */ 4 */
@@ -23,8 +23,9 @@
23 23
24static irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused) 24static irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused)
25{ 25{
26 struct tty_struct *tty = data; 26 struct chan *chan = data;
27 struct line *line = tty->driver_data; 27 struct line *line = chan->line;
28 struct tty_struct *tty = line->tty;
28 29
29 if (line) 30 if (line)
30 chan_interrupt(&line->chan_list, &line->task, tty, irq); 31 chan_interrupt(&line->chan_list, &line->task, tty, irq);
@@ -33,10 +34,11 @@ static irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused)
33 34
34static void line_timer_cb(void *arg) 35static void line_timer_cb(void *arg)
35{ 36{
36 struct tty_struct *tty = arg; 37 struct line *line = arg;
37 struct line *line = tty->driver_data;
38 38
39 line_interrupt(line->driver->read_irq, arg, NULL); 39 if(!line->throttled)
40 chan_interrupt(&line->chan_list, &line->task, line->tty,
41 line->driver->read_irq);
40} 42}
41 43
42/* Returns the free space inside the ring buffer of this line. 44/* Returns the free space inside the ring buffer of this line.
@@ -124,7 +126,8 @@ static int buffer_data(struct line *line, const char *buf, int len)
124 if (len < end){ 126 if (len < end){
125 memcpy(line->tail, buf, len); 127 memcpy(line->tail, buf, len);
126 line->tail += len; 128 line->tail += len;
127 } else { 129 }
130 else {
128 /* The circular buffer is wrapping */ 131 /* The circular buffer is wrapping */
129 memcpy(line->tail, buf, end); 132 memcpy(line->tail, buf, end);
130 buf += end; 133 buf += end;
@@ -170,7 +173,7 @@ static int flush_buffer(struct line *line)
170 } 173 }
171 174
172 count = line->tail - line->head; 175 count = line->tail - line->head;
173 n = write_chan(&line->chan_list, line->head, count, 176 n = write_chan(&line->chan_list, line->head, count,
174 line->driver->write_irq); 177 line->driver->write_irq);
175 178
176 if(n < 0) 179 if(n < 0)
@@ -227,7 +230,7 @@ int line_write(struct tty_struct *tty, const unsigned char *buf, int len)
227 if (err <= 0 && (err != -EAGAIN || !ret)) 230 if (err <= 0 && (err != -EAGAIN || !ret))
228 ret = err; 231 ret = err;
229 } else { 232 } else {
230 n = write_chan(&line->chan_list, buf, len, 233 n = write_chan(&line->chan_list, buf, len,
231 line->driver->write_irq); 234 line->driver->write_irq);
232 if (n < 0) { 235 if (n < 0) {
233 ret = n; 236 ret = n;
@@ -338,11 +341,36 @@ int line_ioctl(struct tty_struct *tty, struct file * file,
338 return ret; 341 return ret;
339} 342}
340 343
344void line_throttle(struct tty_struct *tty)
345{
346 struct line *line = tty->driver_data;
347
348 deactivate_chan(&line->chan_list, line->driver->read_irq);
349 line->throttled = 1;
350}
351
352void line_unthrottle(struct tty_struct *tty)
353{
354 struct line *line = tty->driver_data;
355
356 line->throttled = 0;
357 chan_interrupt(&line->chan_list, &line->task, tty,
358 line->driver->read_irq);
359
360 /* Maybe there is enough stuff pending that calling the interrupt
361 * throttles us again. In this case, line->throttled will be 1
362 * again and we shouldn't turn the interrupt back on.
363 */
364 if(!line->throttled)
365 reactivate_chan(&line->chan_list, line->driver->read_irq);
366}
367
341static irqreturn_t line_write_interrupt(int irq, void *data, 368static irqreturn_t line_write_interrupt(int irq, void *data,
342 struct pt_regs *unused) 369 struct pt_regs *unused)
343{ 370{
344 struct tty_struct *tty = data; 371 struct chan *chan = data;
345 struct line *line = tty->driver_data; 372 struct line *line = chan->line;
373 struct tty_struct *tty = line->tty;
346 int err; 374 int err;
347 375
348 /* Interrupts are enabled here because we registered the interrupt with 376 /* Interrupts are enabled here because we registered the interrupt with
@@ -364,7 +392,7 @@ static irqreturn_t line_write_interrupt(int irq, void *data,
364 if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && 392 if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) &&
365 (tty->ldisc.write_wakeup != NULL)) 393 (tty->ldisc.write_wakeup != NULL))
366 (tty->ldisc.write_wakeup)(tty); 394 (tty->ldisc.write_wakeup)(tty);
367 395
368 /* BLOCKING mode 396 /* BLOCKING mode
369 * In blocking mode, everything sleeps on tty->write_wait. 397 * In blocking mode, everything sleeps on tty->write_wait.
370 * Sleeping in the console driver would break non-blocking 398 * Sleeping in the console driver would break non-blocking
@@ -376,53 +404,29 @@ static irqreturn_t line_write_interrupt(int irq, void *data,
376 return IRQ_HANDLED; 404 return IRQ_HANDLED;
377} 405}
378 406
379int line_setup_irq(int fd, int input, int output, struct tty_struct *tty) 407int line_setup_irq(int fd, int input, int output, struct line *line, void *data)
380{ 408{
381 struct line *line = tty->driver_data;
382 struct line_driver *driver = line->driver; 409 struct line_driver *driver = line->driver;
383 int err = 0, flags = SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM; 410 int err = 0, flags = SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM;
384 411
385 if (input) 412 if (input)
386 err = um_request_irq(driver->read_irq, fd, IRQ_READ, 413 err = um_request_irq(driver->read_irq, fd, IRQ_READ,
387 line_interrupt, flags, 414 line_interrupt, flags,
388 driver->read_irq_name, tty); 415 driver->read_irq_name, data);
389 if (err) 416 if (err)
390 return err; 417 return err;
391 if (output) 418 if (output)
392 err = um_request_irq(driver->write_irq, fd, IRQ_WRITE, 419 err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
393 line_write_interrupt, flags, 420 line_write_interrupt, flags,
394 driver->write_irq_name, tty); 421 driver->write_irq_name, data);
395 line->have_irq = 1; 422 line->have_irq = 1;
396 return err; 423 return err;
397} 424}
398 425
399void line_disable(struct tty_struct *tty, int current_irq) 426int line_open(struct line *lines, struct tty_struct *tty)
400{
401 struct line *line = tty->driver_data;
402
403 if(!line->have_irq)
404 return;
405
406 if(line->driver->read_irq == current_irq)
407 free_irq_later(line->driver->read_irq, tty);
408 else {
409 free_irq(line->driver->read_irq, tty);
410 }
411
412 if(line->driver->write_irq == current_irq)
413 free_irq_later(line->driver->write_irq, tty);
414 else {
415 free_irq(line->driver->write_irq, tty);
416 }
417
418 line->have_irq = 0;
419}
420
421int line_open(struct line *lines, struct tty_struct *tty,
422 struct chan_opts *opts)
423{ 427{
424 struct line *line; 428 struct line *line;
425 int err = 0; 429 int err = -ENODEV;
426 430
427 line = &lines[tty->index]; 431 line = &lines[tty->index];
428 tty->driver_data = line; 432 tty->driver_data = line;
@@ -430,31 +434,29 @@ int line_open(struct line *lines, struct tty_struct *tty,
430 /* The IRQ which takes this lock is not yet enabled and won't be run 434 /* The IRQ which takes this lock is not yet enabled and won't be run
431 * before the end, so we don't need to use spin_lock_irq.*/ 435 * before the end, so we don't need to use spin_lock_irq.*/
432 spin_lock(&line->lock); 436 spin_lock(&line->lock);
433 if (tty->count == 1) { 437
434 if (!line->valid) { 438 tty->driver_data = line;
435 err = -ENODEV; 439 line->tty = tty;
436 goto out; 440 if(!line->valid)
437 } 441 goto out;
438 if (list_empty(&line->chan_list)) { 442
439 err = parse_chan_pair(line->init_str, &line->chan_list, 443 if(tty->count == 1){
440 line->init_pri, tty->index, opts); 444 /* Here the device is opened, if necessary, and the interrupt
441 if(err) goto out; 445 * is registered.
442 err = open_chan(&line->chan_list); 446 */
443 if(err) goto out; 447 enable_chan(line);
448 INIT_WORK(&line->task, line_timer_cb, line);
449
450 if(!line->sigio){
451 chan_enable_winch(&line->chan_list, tty);
452 line->sigio = 1;
444 } 453 }
445 /* Here the interrupt is registered.*/
446 enable_chan(&line->chan_list, tty);
447 INIT_WORK(&line->task, line_timer_cb, tty);
448 }
449 454
450 if(!line->sigio){ 455 chan_window_size(&line->chan_list, &tty->winsize.ws_row,
451 chan_enable_winch(&line->chan_list, tty); 456 &tty->winsize.ws_col);
452 line->sigio = 1;
453 } 457 }
454 chan_window_size(&line->chan_list, &tty->winsize.ws_row,
455 &tty->winsize.ws_col);
456 line->count++;
457 458
459 err = 0;
458out: 460out:
459 spin_unlock(&line->lock); 461 spin_unlock(&line->lock);
460 return err; 462 return err;
@@ -474,15 +476,14 @@ void line_close(struct tty_struct *tty, struct file * filp)
474 /* We ignore the error anyway! */ 476 /* We ignore the error anyway! */
475 flush_buffer(line); 477 flush_buffer(line);
476 478
477 line->count--; 479 if(tty->count == 1){
478 if (tty->count == 1) { 480 line->tty = NULL;
479 line_disable(tty, -1);
480 tty->driver_data = NULL; 481 tty->driver_data = NULL;
481 }
482 482
483 if((line->count == 0) && line->sigio){ 483 if(line->sigio){
484 unregister_winch(tty); 484 unregister_winch(tty);
485 line->sigio = 0; 485 line->sigio = 0;
486 }
486 } 487 }
487 488
488 spin_unlock_irq(&line->lock); 489 spin_unlock_irq(&line->lock);
@@ -493,17 +494,15 @@ void close_lines(struct line *lines, int nlines)
493 int i; 494 int i;
494 495
495 for(i = 0; i < nlines; i++) 496 for(i = 0; i < nlines; i++)
496 close_chan(&lines[i].chan_list); 497 close_chan(&lines[i].chan_list, 0);
497} 498}
498 499
499/* Common setup code for both startup command line and mconsole initialization. 500/* Common setup code for both startup command line and mconsole initialization.
500 * @lines contains the array (of size @num) to modify; 501 * @lines contains the array (of size @num) to modify;
501 * @init is the setup string; 502 * @init is the setup string;
502 * @all_allowed is a boolean saying if we can setup the whole @lines 503 */
503 * at once. For instance, it will be usually true for startup init. (where we
504 * can use con=xterm) and false for mconsole.*/
505 504
506int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed) 505int line_setup(struct line *lines, unsigned int num, char *init)
507{ 506{
508 int i, n; 507 int i, n;
509 char *end; 508 char *end;
@@ -512,10 +511,11 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
512 /* We said con=/ssl= instead of con#=, so we are configuring all 511 /* We said con=/ssl= instead of con#=, so we are configuring all
513 * consoles at once.*/ 512 * consoles at once.*/
514 n = -1; 513 n = -1;
515 } else { 514 }
515 else {
516 n = simple_strtoul(init, &end, 0); 516 n = simple_strtoul(init, &end, 0);
517 if(*end != '='){ 517 if(*end != '='){
518 printk(KERN_ERR "line_setup failed to parse \"%s\"\n", 518 printk(KERN_ERR "line_setup failed to parse \"%s\"\n",
519 init); 519 init);
520 return 0; 520 return 0;
521 } 521 }
@@ -527,8 +527,9 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
527 printk("line_setup - %d out of range ((0 ... %d) allowed)\n", 527 printk("line_setup - %d out of range ((0 ... %d) allowed)\n",
528 n, num - 1); 528 n, num - 1);
529 return 0; 529 return 0;
530 } else if (n >= 0){ 530 }
531 if (lines[n].count > 0) { 531 else if (n >= 0){
532 if (lines[n].tty != NULL) {
532 printk("line_setup - device %d is open\n", n); 533 printk("line_setup - device %d is open\n", n);
533 return 0; 534 return 0;
534 } 535 }
@@ -539,13 +540,10 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
539 else { 540 else {
540 lines[n].init_str = init; 541 lines[n].init_str = init;
541 lines[n].valid = 1; 542 lines[n].valid = 1;
542 } 543 }
543 } 544 }
544 } else if(!all_allowed){ 545 }
545 printk("line_setup - can't configure all devices from " 546 else {
546 "mconsole\n");
547 return 0;
548 } else {
549 for(i = 0; i < num; i++){ 547 for(i = 0; i < num; i++){
550 if(lines[i].init_pri <= INIT_ALL){ 548 if(lines[i].init_pri <= INIT_ALL){
551 lines[i].init_pri = INIT_ALL; 549 lines[i].init_pri = INIT_ALL;
@@ -557,18 +555,33 @@ int line_setup(struct line *lines, unsigned int num, char *init, int all_allowed
557 } 555 }
558 } 556 }
559 } 557 }
560 return 1; 558 return n == -1 ? num : n;
561} 559}
562 560
563int line_config(struct line *lines, unsigned int num, char *str) 561int line_config(struct line *lines, unsigned int num, char *str,
562 struct chan_opts *opts)
564{ 563{
565 char *new = uml_strdup(str); 564 struct line *line;
565 char *new;
566 int n;
566 567
568 if(*str == '='){
569 printk("line_config - can't configure all devices from "
570 "mconsole\n");
571 return 1;
572 }
573
574 new = kstrdup(str, GFP_KERNEL);
567 if(new == NULL){ 575 if(new == NULL){
568 printk("line_config - uml_strdup failed\n"); 576 printk("line_config - kstrdup failed\n");
569 return -ENOMEM; 577 return 1;
570 } 578 }
571 return !line_setup(lines, num, new, 0); 579 n = line_setup(lines, num, new);
580 if(n < 0)
581 return 1;
582
583 line = &lines[n];
584 return parse_chan_pair(line->init_str, line, n, opts);
572} 585}
573 586
574int line_get_config(char *name, struct line *lines, unsigned int num, char *str, 587int line_get_config(char *name, struct line *lines, unsigned int num, char *str,
@@ -594,7 +607,7 @@ int line_get_config(char *name, struct line *lines, unsigned int num, char *str,
594 spin_lock(&line->lock); 607 spin_lock(&line->lock);
595 if(!line->valid) 608 if(!line->valid)
596 CONFIG_CHUNK(str, size, n, "none", 1); 609 CONFIG_CHUNK(str, size, n, "none", 1);
597 else if(line->count == 0) 610 else if(line->tty == NULL)
598 CONFIG_CHUNK(str, size, n, line->init_str, 1); 611 CONFIG_CHUNK(str, size, n, line->init_str, 1);
599 else n = chan_config_string(&line->chan_list, str, size, error_out); 612 else n = chan_config_string(&line->chan_list, str, size, error_out);
600 spin_unlock(&line->lock); 613 spin_unlock(&line->lock);
@@ -619,14 +632,18 @@ int line_id(char **str, int *start_out, int *end_out)
619 632
620int line_remove(struct line *lines, unsigned int num, int n) 633int line_remove(struct line *lines, unsigned int num, int n)
621{ 634{
635 int err;
622 char config[sizeof("conxxxx=none\0")]; 636 char config[sizeof("conxxxx=none\0")];
623 637
624 sprintf(config, "%d=none", n); 638 sprintf(config, "%d=none", n);
625 return !line_setup(lines, num, config, 0); 639 err = line_setup(lines, num, config);
640 if(err >= 0)
641 err = 0;
642 return err;
626} 643}
627 644
628struct tty_driver *line_register_devfs(struct lines *set, 645struct tty_driver *line_register_devfs(struct lines *set,
629 struct line_driver *line_driver, 646 struct line_driver *line_driver,
630 struct tty_operations *ops, struct line *lines, 647 struct tty_operations *ops, struct line *lines,
631 int nlines) 648 int nlines)
632{ 649{
@@ -655,7 +672,7 @@ struct tty_driver *line_register_devfs(struct lines *set,
655 } 672 }
656 673
657 for(i = 0; i < nlines; i++){ 674 for(i = 0; i < nlines; i++){
658 if(!lines[i].valid) 675 if(!lines[i].valid)
659 tty_unregister_device(driver, i); 676 tty_unregister_device(driver, i);
660 } 677 }
661 678
@@ -663,24 +680,28 @@ struct tty_driver *line_register_devfs(struct lines *set,
663 return driver; 680 return driver;
664} 681}
665 682
666static spinlock_t winch_handler_lock; 683static DEFINE_SPINLOCK(winch_handler_lock);
667LIST_HEAD(winch_handlers); 684static LIST_HEAD(winch_handlers);
668 685
669void lines_init(struct line *lines, int nlines) 686void lines_init(struct line *lines, int nlines, struct chan_opts *opts)
670{ 687{
671 struct line *line; 688 struct line *line;
672 int i; 689 int i;
673 690
674 spin_lock_init(&winch_handler_lock);
675 for(i = 0; i < nlines; i++){ 691 for(i = 0; i < nlines; i++){
676 line = &lines[i]; 692 line = &lines[i];
677 INIT_LIST_HEAD(&line->chan_list); 693 INIT_LIST_HEAD(&line->chan_list);
678 spin_lock_init(&line->lock); 694
679 if(line->init_str != NULL){ 695 if(line->init_str == NULL)
680 line->init_str = uml_strdup(line->init_str); 696 continue;
681 if(line->init_str == NULL) 697
682 printk("lines_init - uml_strdup returned " 698 line->init_str = kstrdup(line->init_str, GFP_KERNEL);
683 "NULL\n"); 699 if(line->init_str == NULL)
700 printk("lines_init - kstrdup returned NULL\n");
701
702 if(parse_chan_pair(line->init_str, line, i, opts)){
703 printk("parse_chan_pair failed for device %d\n", i);
704 line->valid = 0;
684 } 705 }
685 } 706 }
686} 707}
@@ -717,8 +738,7 @@ irqreturn_t winch_interrupt(int irq, void *data, struct pt_regs *unused)
717 tty = winch->tty; 738 tty = winch->tty;
718 if (tty != NULL) { 739 if (tty != NULL) {
719 line = tty->driver_data; 740 line = tty->driver_data;
720 chan_window_size(&line->chan_list, 741 chan_window_size(&line->chan_list, &tty->winsize.ws_row,
721 &tty->winsize.ws_row,
722 &tty->winsize.ws_col); 742 &tty->winsize.ws_col);
723 kill_pg(tty->pgrp, SIGWINCH, 1); 743 kill_pg(tty->pgrp, SIGWINCH, 1);
724 } 744 }
@@ -749,60 +769,54 @@ void register_winch_irq(int fd, int tty_fd, int pid, struct tty_struct *tty)
749 spin_unlock(&winch_handler_lock); 769 spin_unlock(&winch_handler_lock);
750 770
751 if(um_request_irq(WINCH_IRQ, fd, IRQ_READ, winch_interrupt, 771 if(um_request_irq(WINCH_IRQ, fd, IRQ_READ, winch_interrupt,
752 SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, 772 SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM,
753 "winch", winch) < 0) 773 "winch", winch) < 0)
754 printk("register_winch_irq - failed to register IRQ\n"); 774 printk("register_winch_irq - failed to register IRQ\n");
755} 775}
756 776
777static void free_winch(struct winch *winch)
778{
779 list_del(&winch->list);
780
781 if(winch->pid != -1)
782 os_kill_process(winch->pid, 1);
783 if(winch->fd != -1)
784 os_close_file(winch->fd);
785
786 free_irq(WINCH_IRQ, winch);
787 kfree(winch);
788}
789
757static void unregister_winch(struct tty_struct *tty) 790static void unregister_winch(struct tty_struct *tty)
758{ 791{
759 struct list_head *ele; 792 struct list_head *ele;
760 struct winch *winch, *found = NULL; 793 struct winch *winch;
761 794
762 spin_lock(&winch_handler_lock); 795 spin_lock(&winch_handler_lock);
796
763 list_for_each(ele, &winch_handlers){ 797 list_for_each(ele, &winch_handlers){
764 winch = list_entry(ele, struct winch, list); 798 winch = list_entry(ele, struct winch, list);
765 if(winch->tty == tty){ 799 if(winch->tty == tty){
766 found = winch; 800 free_winch(winch);
767 break; 801 break;
768 } 802 }
769 } 803 }
770 if(found == NULL)
771 goto err;
772
773 list_del(&winch->list);
774 spin_unlock(&winch_handler_lock);
775
776 if(winch->pid != -1)
777 os_kill_process(winch->pid, 1);
778
779 free_irq(WINCH_IRQ, winch);
780 kfree(winch);
781
782 return;
783err:
784 spin_unlock(&winch_handler_lock); 804 spin_unlock(&winch_handler_lock);
785} 805}
786 806
787/* XXX: No lock as it's an exitcall... is this valid? Depending on cleanup
788 * order... are we sure that nothing else is done on the list? */
789static void winch_cleanup(void) 807static void winch_cleanup(void)
790{ 808{
791 struct list_head *ele; 809 struct list_head *ele, *next;
792 struct winch *winch; 810 struct winch *winch;
793 811
794 list_for_each(ele, &winch_handlers){ 812 spin_lock(&winch_handler_lock);
813
814 list_for_each_safe(ele, next, &winch_handlers){
795 winch = list_entry(ele, struct winch, list); 815 winch = list_entry(ele, struct winch, list);
796 if(winch->fd != -1){ 816 free_winch(winch);
797 /* Why is this different from the above free_irq(),
798 * which deactivates SIGIO? This searches the FD
799 * somewhere else and removes it from the list... */
800 deactivate_fd(winch->fd, WINCH_IRQ);
801 os_close_file(winch->fd);
802 }
803 if(winch->pid != -1)
804 os_kill_process(winch->pid, 1);
805 } 817 }
818
819 spin_unlock(&winch_handler_lock);
806} 820}
807__uml_exitcall(winch_cleanup); 821__uml_exitcall(winch_cleanup);
808 822
@@ -811,10 +825,10 @@ char *add_xterm_umid(char *base)
811 char *umid, *title; 825 char *umid, *title;
812 int len; 826 int len;
813 827
814 umid = get_umid(1); 828 umid = get_umid();
815 if(umid == NULL) 829 if(*umid == '\0')
816 return base; 830 return base;
817 831
818 len = strlen(base) + strlen(" ()") + strlen(umid) + 1; 832 len = strlen(base) + strlen(" ()") + strlen(umid) + 1;
819 title = kmalloc(len, GFP_KERNEL); 833 title = kmalloc(len, GFP_KERNEL);
820 if(title == NULL){ 834 if(title == NULL){
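
The line_throttle()/line_unthrottle() pair added above implements the standard tty flow-control contract: the tty core calls ->throttle when its input buffers fill and ->unthrottle once they drain, and the driver responds by deactivating and reactivating the read IRQ. Wiring them into one of the UML console drivers would look roughly like this (a sketch; the surrounding handler names are illustrative):

    static struct tty_operations example_ops = {
            .open       = example_open,
            .close      = example_close,
            .write      = line_write,
            .throttle   = line_throttle,   /* read IRQ deactivated */
            .unthrottle = line_unthrottle, /* drain, then reactivate */
    };
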
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 12c95368124a..be610125429f 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -20,6 +20,7 @@
20#include "linux/namei.h" 20#include "linux/namei.h"
21#include "linux/proc_fs.h" 21#include "linux/proc_fs.h"
22#include "linux/syscalls.h" 22#include "linux/syscalls.h"
23#include "linux/console.h"
23#include "asm/irq.h" 24#include "asm/irq.h"
24#include "asm/uaccess.h" 25#include "asm/uaccess.h"
25#include "user_util.h" 26#include "user_util.h"
@@ -34,7 +35,7 @@
34#include "irq_kern.h" 35#include "irq_kern.h"
35#include "choose-mode.h" 36#include "choose-mode.h"
36 37
37static int do_unlink_socket(struct notifier_block *notifier, 38static int do_unlink_socket(struct notifier_block *notifier,
38 unsigned long what, void *data) 39 unsigned long what, void *data)
39{ 40{
40 return(mconsole_unlink_socket()); 41 return(mconsole_unlink_socket());
@@ -46,12 +47,12 @@ static struct notifier_block reboot_notifier = {
46 .priority = 0, 47 .priority = 0,
47}; 48};
48 49
49/* Safe without explicit locking for now. Tasklets provide their own 50/* Safe without explicit locking for now. Tasklets provide their own
50 * locking, and the interrupt handler is safe because it can't interrupt 51 * locking, and the interrupt handler is safe because it can't interrupt
51 * itself and it can only happen on CPU 0. 52 * itself and it can only happen on CPU 0.
52 */ 53 */
53 54
54LIST_HEAD(mc_requests); 55static LIST_HEAD(mc_requests);
55 56
56static void mc_work_proc(void *unused) 57static void mc_work_proc(void *unused)
57{ 58{
@@ -60,7 +61,7 @@ static void mc_work_proc(void *unused)
60 61
61 while(!list_empty(&mc_requests)){ 62 while(!list_empty(&mc_requests)){
62 local_save_flags(flags); 63 local_save_flags(flags);
63 req = list_entry(mc_requests.next, struct mconsole_entry, 64 req = list_entry(mc_requests.next, struct mconsole_entry,
64 list); 65 list);
65 list_del(&req->list); 66 list_del(&req->list);
66 local_irq_restore(flags); 67 local_irq_restore(flags);
@@ -69,7 +70,7 @@ static void mc_work_proc(void *unused)
69 } 70 }
70} 71}
71 72
72DECLARE_WORK(mconsole_work, mc_work_proc, NULL); 73static DECLARE_WORK(mconsole_work, mc_work_proc, NULL);
73 74
74static irqreturn_t mconsole_interrupt(int irq, void *dev_id, 75static irqreturn_t mconsole_interrupt(int irq, void *dev_id,
75 struct pt_regs *regs) 76 struct pt_regs *regs)
@@ -103,8 +104,8 @@ void mconsole_version(struct mc_request *req)
103{ 104{
104 char version[256]; 105 char version[256];
105 106
106 sprintf(version, "%s %s %s %s %s", system_utsname.sysname, 107 sprintf(version, "%s %s %s %s %s", system_utsname.sysname,
107 system_utsname.nodename, system_utsname.release, 108 system_utsname.nodename, system_utsname.release,
108 system_utsname.version, system_utsname.machine); 109 system_utsname.version, system_utsname.machine);
109 mconsole_reply(req, version, 0, 0); 110 mconsole_reply(req, version, 0, 0);
110} 111}
@@ -348,7 +349,7 @@ static struct mc_device *mconsole_find_dev(char *name)
348 349
349#define CONFIG_BUF_SIZE 64 350#define CONFIG_BUF_SIZE 64
350 351
351static void mconsole_get_config(int (*get_config)(char *, char *, int, 352static void mconsole_get_config(int (*get_config)(char *, char *, int,
352 char **), 353 char **),
353 struct mc_request *req, char *name) 354 struct mc_request *req, char *name)
354{ 355{
@@ -389,7 +390,6 @@ static void mconsole_get_config(int (*get_config)(char *, char *, int,
389 out: 390 out:
390 if(buf != default_buf) 391 if(buf != default_buf)
391 kfree(buf); 392 kfree(buf);
392
393} 393}
394 394
395void mconsole_config(struct mc_request *req) 395void mconsole_config(struct mc_request *req)
@@ -420,9 +420,9 @@ void mconsole_config(struct mc_request *req)
420 420
421void mconsole_remove(struct mc_request *req) 421void mconsole_remove(struct mc_request *req)
422{ 422{
423 struct mc_device *dev; 423 struct mc_device *dev;
424 char *ptr = req->request.data, *err_msg = ""; 424 char *ptr = req->request.data, *err_msg = "";
425 char error[256]; 425 char error[256];
426 int err, start, end, n; 426 int err, start, end, n;
427 427
428 ptr += strlen("remove"); 428 ptr += strlen("remove");
@@ -433,37 +433,112 @@ void mconsole_remove(struct mc_request *req)
433 return; 433 return;
434 } 434 }
435 435
436 ptr = &ptr[strlen(dev->name)]; 436 ptr = &ptr[strlen(dev->name)];
437 437
438 err = 1; 438 err = 1;
439 n = (*dev->id)(&ptr, &start, &end); 439 n = (*dev->id)(&ptr, &start, &end);
440 if(n < 0){ 440 if(n < 0){
441 err_msg = "Couldn't parse device number"; 441 err_msg = "Couldn't parse device number";
442 goto out; 442 goto out;
443 } 443 }
444 else if((n < start) || (n > end)){ 444 else if((n < start) || (n > end)){
445 sprintf(error, "Invalid device number - must be between " 445 sprintf(error, "Invalid device number - must be between "
446 "%d and %d", start, end); 446 "%d and %d", start, end);
447 err_msg = error; 447 err_msg = error;
448 goto out; 448 goto out;
449 } 449 }
450 450
451 err = (*dev->remove)(n); 451 err = (*dev->remove)(n);
452 switch(err){ 452 switch(err){
453 case -ENODEV: 453 case -ENODEV:
454 err_msg = "Device doesn't exist"; 454 err_msg = "Device doesn't exist";
455 break; 455 break;
456 case -EBUSY: 456 case -EBUSY:
457 err_msg = "Device is currently open"; 457 err_msg = "Device is currently open";
458 break; 458 break;
459 default: 459 default:
460 break; 460 break;
461 } 461 }
462 out: 462out:
463 mconsole_reply(req, err_msg, err, 0); 463 mconsole_reply(req, err_msg, err, 0);
464} 464}
465 465
466static DEFINE_SPINLOCK(console_lock);
467static LIST_HEAD(clients);
468static char console_buf[MCONSOLE_MAX_DATA];
469static int console_index = 0;
470
471static void console_write(struct console *console, const char *string,
472 unsigned len)
473{
474 struct list_head *ele;
475 int n;
476
477 if(list_empty(&clients))
478 return;
479
480 while(1){
481 n = min(len, ARRAY_SIZE(console_buf) - console_index);
482 strncpy(&console_buf[console_index], string, n);
483 console_index += n;
484 string += n;
485 len -= n;
486 if(len == 0)
487 return;
488
489 list_for_each(ele, &clients){
490 struct mconsole_entry *entry;
491
492 entry = list_entry(ele, struct mconsole_entry, list);
493 mconsole_reply_len(&entry->request, console_buf,
494 console_index, 0, 1);
495 }
496
497 console_index = 0;
498 }
499}
500
501static struct console mc_console = { .name = "mc",
502 .write = console_write,
503 .flags = CON_PRINTBUFFER | CON_ENABLED,
504 .index = -1 };
505
506static int mc_add_console(void)
507{
508 register_console(&mc_console);
509 return 0;
510}
511
512late_initcall(mc_add_console);
513
514static void with_console(struct mc_request *req, void (*proc)(void *),
515 void *arg)
516{
517 struct mconsole_entry entry;
518 unsigned long flags;
519
520 INIT_LIST_HEAD(&entry.list);
521 entry.request = *req;
522 list_add(&entry.list, &clients);
523 spin_lock_irqsave(&console_lock, flags);
524
525 (*proc)(arg);
526
527 mconsole_reply_len(req, console_buf, console_index, 0, 0);
528 console_index = 0;
529
530 spin_unlock_irqrestore(&console_lock, flags);
531 list_del(&entry.list);
532}
533
466#ifdef CONFIG_MAGIC_SYSRQ 534#ifdef CONFIG_MAGIC_SYSRQ
535static void sysrq_proc(void *arg)
536{
537 char *op = arg;
538
539 handle_sysrq(*op, &current->thread.regs, NULL);
540}
541
467void mconsole_sysrq(struct mc_request *req) 542void mconsole_sysrq(struct mc_request *req)
468{ 543{
469 char *ptr = req->request.data; 544 char *ptr = req->request.data;
@@ -471,8 +546,13 @@ void mconsole_sysrq(struct mc_request *req)
471 ptr += strlen("sysrq"); 546 ptr += strlen("sysrq");
472 while(isspace(*ptr)) ptr++; 547 while(isspace(*ptr)) ptr++;
473 548
474 mconsole_reply(req, "", 0, 0); 549 /* With 'b', the system will shut down without a chance to reply,
475 handle_sysrq(*ptr, &current->thread.regs, NULL); 550 * so in this case, we reply first.
551 */
552 if(*ptr == 'b')
553 mconsole_reply(req, "", 0, 0);
554
555 with_console(req, sysrq_proc, ptr);
476} 556}
477#else 557#else
478void mconsole_sysrq(struct mc_request *req) 558void mconsole_sysrq(struct mc_request *req)
@@ -481,6 +561,14 @@ void mconsole_sysrq(struct mc_request *req)
481} 561}
482#endif 562#endif
483 563
564static void stack_proc(void *arg)
565{
566 struct task_struct *from = current, *to = arg;
567
568 to->thread.saved_task = from;
569 switch_to(from, to, from);
570}
571
484/* Mconsole stack trace 572/* Mconsole stack trace
485 * Added by Allan Graves, Jeff Dike 573 * Added by Allan Graves, Jeff Dike
486 * Dumps a stacks registers to the linux console. 574 * Dumps a stacks registers to the linux console.
@@ -488,37 +576,34 @@ void mconsole_sysrq(struct mc_request *req)
488 */ 576 */
489void do_stack(struct mc_request *req) 577void do_stack(struct mc_request *req)
490{ 578{
491 char *ptr = req->request.data; 579 char *ptr = req->request.data;
492 int pid_requested= -1; 580 int pid_requested= -1;
493 struct task_struct *from = NULL; 581 struct task_struct *from = NULL;
494 struct task_struct *to = NULL; 582 struct task_struct *to = NULL;
495 583
496 /* Would be nice: 584 /* Would be nice:
497 * 1) Send showregs output to mconsole. 585 * 1) Send showregs output to mconsole.
498 * 2) Add a way to stack dump all pids. 586 * 2) Add a way to stack dump all pids.
499 */ 587 */
500 588
501 ptr += strlen("stack"); 589 ptr += strlen("stack");
502 while(isspace(*ptr)) ptr++; 590 while(isspace(*ptr)) ptr++;
503
504 /* Should really check for multiple pids or reject bad args here */
505 /* What do the arguments in mconsole_reply mean? */
506 if(sscanf(ptr, "%d", &pid_requested) == 0){
507 mconsole_reply(req, "Please specify a pid", 1, 0);
508 return;
509 }
510 591
511 from = current; 592 /* Should really check for multiple pids or reject bad args here */
512 to = find_task_by_pid(pid_requested); 593 /* What do the arguments in mconsole_reply mean? */
594 if(sscanf(ptr, "%d", &pid_requested) == 0){
595 mconsole_reply(req, "Please specify a pid", 1, 0);
596 return;
597 }
513 598
514 if((to == NULL) || (pid_requested == 0)) { 599 from = current;
515 mconsole_reply(req, "Couldn't find that pid", 1, 0);
516 return;
517 }
518 to->thread.saved_task = current;
519 600
520 switch_to(from, to, from); 601 to = find_task_by_pid(pid_requested);
521 mconsole_reply(req, "Stack Dumped to console and message log", 0, 0); 602 if((to == NULL) || (pid_requested == 0)) {
603 mconsole_reply(req, "Couldn't find that pid", 1, 0);
604 return;
605 }
606 with_console(req, stack_proc, to);
522} 607}
523 608
524void mconsole_stack(struct mc_request *req) 609void mconsole_stack(struct mc_request *req)
@@ -534,9 +619,9 @@ void mconsole_stack(struct mc_request *req)
534/* Changed by mconsole_setup, which is __setup, and called before SMP is 619/* Changed by mconsole_setup, which is __setup, and called before SMP is
535 * active. 620 * active.
536 */ 621 */
537static char *notify_socket = NULL; 622static char *notify_socket = NULL;
538 623
539int mconsole_init(void) 624static int mconsole_init(void)
540{ 625{
541 /* long to avoid size mismatch warnings from gcc */ 626 /* long to avoid size mismatch warnings from gcc */
542 long sock; 627 long sock;
@@ -563,16 +648,16 @@ int mconsole_init(void)
563 } 648 }
564 649
565 if(notify_socket != NULL){ 650 if(notify_socket != NULL){
566 notify_socket = uml_strdup(notify_socket); 651 notify_socket = kstrdup(notify_socket, GFP_KERNEL);
567 if(notify_socket != NULL) 652 if(notify_socket != NULL)
568 mconsole_notify(notify_socket, MCONSOLE_SOCKET, 653 mconsole_notify(notify_socket, MCONSOLE_SOCKET,
569 mconsole_socket_name, 654 mconsole_socket_name,
570 strlen(mconsole_socket_name) + 1); 655 strlen(mconsole_socket_name) + 1);
571 else printk(KERN_ERR "mconsole_setup failed to strdup " 656 else printk(KERN_ERR "mconsole_setup failed to strdup "
572 "string\n"); 657 "string\n");
573 } 658 }
574 659
575 printk("mconsole (version %d) initialized on %s\n", 660 printk("mconsole (version %d) initialized on %s\n",
576 MCONSOLE_VERSION, mconsole_socket_name); 661 MCONSOLE_VERSION, mconsole_socket_name);
577 return(0); 662 return(0);
578} 663}
@@ -585,7 +670,7 @@ static int write_proc_mconsole(struct file *file, const char __user *buffer,
585 char *buf; 670 char *buf;
586 671
587 buf = kmalloc(count + 1, GFP_KERNEL); 672 buf = kmalloc(count + 1, GFP_KERNEL);
588 if(buf == NULL) 673 if(buf == NULL)
589 return(-ENOMEM); 674 return(-ENOMEM);
590 675
591 if(copy_from_user(buf, buffer, count)){ 676 if(copy_from_user(buf, buffer, count)){
@@ -661,7 +746,7 @@ static int notify_panic(struct notifier_block *self, unsigned long unused1,
661 746
662 if(notify_socket == NULL) return(0); 747 if(notify_socket == NULL) return(0);
663 748
664 mconsole_notify(notify_socket, MCONSOLE_PANIC, message, 749 mconsole_notify(notify_socket, MCONSOLE_PANIC, message,
665 strlen(message) + 1); 750 strlen(message) + 1);
666 return(0); 751 return(0);
667} 752}
@@ -686,14 +771,3 @@ char *mconsole_notify_socket(void)
686} 771}
687 772
688EXPORT_SYMBOL(mconsole_notify_socket); 773EXPORT_SYMBOL(mconsole_notify_socket);
689
690/*
691 * Overrides for Emacs so that we follow Linus's tabbing style.
692 * Emacs will notice this stuff at the end of the file and automatically
693 * adjust the settings for this buffer only. This must remain at the end
694 * of the file.
695 * ---------------------------------------------------------------------------
696 * Local variables:
697 * c-file-style: "linux"
698 * End:
699 */
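
The with_console() machinery added in mconsole_kern.c is what lets "uml_mconsole sysrq" and "uml_mconsole stack" return their output to the requesting client instead of only the kernel log: a minimal "mc" struct console is registered via late_initcall(), and while a request sits on the clients list, printk output accumulates in console_buf and is streamed back with mconsole_reply_len(). Both mconsole_sysrq() and do_stack() now funnel through it (stack_proc() switches to the target task so the dump runs in its context), and sysrq 'b' is answered up front because the machine reboots before with_console() could reply. A minimal userspace sketch of the same capture pattern (hypothetical names, locking elided):

    #include <stdio.h>
    #include <string.h>

    static char sink_buf[1024];     /* stands in for console_buf   */
    static size_t sink_len;         /* stands in for console_index */

    /* plays the role of console_write(): copy output into the buffer */
    static void sink_write(const char *s, size_t n)
    {
            if (n > sizeof(sink_buf) - sink_len)
                    n = sizeof(sink_buf) - sink_len;
            memcpy(sink_buf + sink_len, s, n);
            sink_len += n;
    }

    /* plays the role of with_console(): run proc, then hand the
     * buffered output back to the requester as the reply */
    static void with_capture(void (*proc)(void *), void *arg)
    {
            sink_len = 0;
            (*proc)(arg);
            printf("reply: %.*s", (int) sink_len, sink_buf);
    }

    static void demo(void *arg)
    {
            (void) arg;
            sink_write("SysRq : HELP\n", 13);
    }

    int main(void)
    {
            with_capture(demo, NULL);
            return 0;
    }
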
diff --git a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c
index 310c1f823f26..4b109fe7fff8 100644
--- a/arch/um/drivers/mconsole_user.c
+++ b/arch/um/drivers/mconsole_user.c
@@ -122,12 +122,12 @@ int mconsole_get_request(int fd, struct mc_request *req)
122 return(1); 122 return(1);
123} 123}
124 124
125int mconsole_reply(struct mc_request *req, char *str, int err, int more) 125int mconsole_reply_len(struct mc_request *req, const char *str, int total,
126 int err, int more)
126{ 127{
127 struct mconsole_reply reply; 128 struct mconsole_reply reply;
128 int total, len, n; 129 int len, n;
129 130
130 total = strlen(str);
131 do { 131 do {
132 reply.err = err; 132 reply.err = err;
133 133
@@ -155,6 +155,12 @@ int mconsole_reply(struct mc_request *req, char *str, int err, int more)
155 return(0); 155 return(0);
156} 156}
157 157
158int mconsole_reply(struct mc_request *req, const char *str, int err, int more)
159{
160 return mconsole_reply_len(req, str, strlen(str), err, more);
161}
162
163
158int mconsole_unlink_socket(void) 164int mconsole_unlink_socket(void)
159{ 165{
160 unlink(mconsole_socket_name); 166 unlink(mconsole_socket_name);
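
mconsole_reply() grows a length-carrying sibling here because console output is neither NUL-terminated nor bounded by one datagram; the old entry point becomes a strlen() wrapper and the existing loop keeps splitting long replies into bounded chunks, raising the "more" flag on every chunk but the last. A standalone sketch of that chunking contract (MAX_DATA shrunk for demonstration, transport stubbed out):

    #include <stdio.h>

    #define MAX_DATA 8      /* stands in for the real payload limit */

    /* stub transport; the real code writes a struct mconsole_reply */
    static void send_one(const char *buf, int len, int more)
    {
            printf("chunk \"%.*s\" more=%d\n", len, buf, more);
    }

    static void reply_len(const char *str, int total, int more)
    {
            int len;

            do {
                    len = total > MAX_DATA ? MAX_DATA : total;
                    /* every chunk but the last keeps "more" raised */
                    send_one(str, len, more || total > len);
                    str += len;
                    total -= len;
            } while (total > 0);
    }

    int main(void)
    {
            reply_len("Stack Dumped to console", 23, 0);
            return 0;
    }
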
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 84c73a300acb..fb1f9fb9b871 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -34,7 +34,7 @@
34#define DRIVER_NAME "uml-netdev" 34#define DRIVER_NAME "uml-netdev"
35 35
36static DEFINE_SPINLOCK(opened_lock); 36static DEFINE_SPINLOCK(opened_lock);
37LIST_HEAD(opened); 37static LIST_HEAD(opened);
38 38
39static int uml_net_rx(struct net_device *dev) 39static int uml_net_rx(struct net_device *dev)
40{ 40{
@@ -150,6 +150,7 @@ static int uml_net_close(struct net_device *dev)
150 if(lp->close != NULL) 150 if(lp->close != NULL)
151 (*lp->close)(lp->fd, &lp->user); 151 (*lp->close)(lp->fd, &lp->user);
152 lp->fd = -1; 152 lp->fd = -1;
153 list_del(&lp->list);
153 154
154 spin_unlock(&lp->lock); 155 spin_unlock(&lp->lock);
155 return 0; 156 return 0;
@@ -266,7 +267,7 @@ void uml_net_user_timer_expire(unsigned long _conn)
266} 267}
267 268
268static DEFINE_SPINLOCK(devices_lock); 269static DEFINE_SPINLOCK(devices_lock);
269static struct list_head devices = LIST_HEAD_INIT(devices); 270static LIST_HEAD(devices);
270 271
271static struct platform_driver uml_net_driver = { 272static struct platform_driver uml_net_driver = {
272 .driver = { 273 .driver = {
@@ -586,7 +587,7 @@ static int net_config(char *str)
586 err = eth_parse(str, &n, &str); 587 err = eth_parse(str, &n, &str);
587 if(err) return(err); 588 if(err) return(err);
588 589
589 str = uml_strdup(str); 590 str = kstrdup(str, GFP_KERNEL);
590 if(str == NULL){ 591 if(str == NULL){
591 printk(KERN_ERR "net_config failed to strdup string\n"); 592 printk(KERN_ERR "net_config failed to strdup string\n");
592 return(-1); 593 return(-1);
@@ -715,6 +716,7 @@ static void close_devices(void)
715 716
716 list_for_each(ele, &opened){ 717 list_for_each(ele, &opened){
717 lp = list_entry(ele, struct uml_net_private, list); 718 lp = list_entry(ele, struct uml_net_private, list);
719 free_irq(lp->dev->irq, lp->dev);
718 if((lp->close != NULL) && (lp->fd >= 0)) 720 if((lp->close != NULL) && (lp->fd >= 0))
719 (*lp->close)(lp->fd, &lp->user); 721 (*lp->close)(lp->fd, &lp->user);
720 if(lp->remove != NULL) (*lp->remove)(&lp->user); 722 if(lp->remove != NULL) (*lp->remove)(&lp->user);
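
Three independent fixes land in net_kern.c: uml_strdup() gives way to the generic kstrdup(), a device now unlinks itself from the opened list when it closes so close_devices() cannot walk into a dead entry, and close_devices() frees the device's IRQ before closing the underlying fd. The list conversion is purely cosmetic; the LIST_HEAD() macro expands to exactly what the open-coded initializer spelled out:

    /* include/linux/list.h */
    #define LIST_HEAD_INIT(name) { &(name), &(name) }
    #define LIST_HEAD(name) \
            struct list_head name = LIST_HEAD_INIT(name)

    /* so these two declarations from this hunk are equivalent:
     *     static struct list_head devices = LIST_HEAD_INIT(devices);
     *     static LIST_HEAD(devices);
     */
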
diff --git a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
index 62e04ecfada8..a32ef55cb244 100644
--- a/arch/um/drivers/ssl.c
+++ b/arch/um/drivers/ssl.c
@@ -69,7 +69,7 @@ static struct line_driver driver = {
69 .name = "ssl", 69 .name = "ssl",
70 .config = ssl_config, 70 .config = ssl_config,
71 .get_config = ssl_get_config, 71 .get_config = ssl_get_config,
72 .id = line_id, 72 .id = line_id,
73 .remove = ssl_remove, 73 .remove = ssl_remove,
74 }, 74 },
75}; 75};
@@ -84,26 +84,23 @@ static struct lines lines = LINES_INIT(NR_PORTS);
84 84
85static int ssl_config(char *str) 85static int ssl_config(char *str)
86{ 86{
87 return(line_config(serial_lines, 87 return line_config(serial_lines, ARRAY_SIZE(serial_lines), str, &opts);
88 sizeof(serial_lines)/sizeof(serial_lines[0]), str));
89} 88}
90 89
91static int ssl_get_config(char *dev, char *str, int size, char **error_out) 90static int ssl_get_config(char *dev, char *str, int size, char **error_out)
92{ 91{
93 return(line_get_config(dev, serial_lines, 92 return line_get_config(dev, serial_lines, ARRAY_SIZE(serial_lines), str,
94 sizeof(serial_lines)/sizeof(serial_lines[0]), 93 size, error_out);
95 str, size, error_out));
96} 94}
97 95
98static int ssl_remove(int n) 96static int ssl_remove(int n)
99{ 97{
100 return line_remove(serial_lines, 98 return line_remove(serial_lines, ARRAY_SIZE(serial_lines), n);
101 sizeof(serial_lines)/sizeof(serial_lines[0]), n);
102} 99}
103 100
104int ssl_open(struct tty_struct *tty, struct file *filp) 101int ssl_open(struct tty_struct *tty, struct file *filp)
105{ 102{
106 return line_open(serial_lines, tty, &opts); 103 return line_open(serial_lines, tty);
107} 104}
108 105
109#if 0 106#if 0
@@ -112,16 +109,6 @@ static void ssl_flush_buffer(struct tty_struct *tty)
112 return; 109 return;
113} 110}
114 111
115static void ssl_throttle(struct tty_struct * tty)
116{
117 printk(KERN_ERR "Someone should implement ssl_throttle\n");
118}
119
120static void ssl_unthrottle(struct tty_struct * tty)
121{
122 printk(KERN_ERR "Someone should implement ssl_unthrottle\n");
123}
124
125static void ssl_stop(struct tty_struct *tty) 112static void ssl_stop(struct tty_struct *tty)
126{ 113{
127 printk(KERN_ERR "Someone should implement ssl_stop\n"); 114 printk(KERN_ERR "Someone should implement ssl_stop\n");
@@ -148,9 +135,9 @@ static struct tty_operations ssl_ops = {
148 .flush_chars = line_flush_chars, 135 .flush_chars = line_flush_chars,
149 .set_termios = line_set_termios, 136 .set_termios = line_set_termios,
150 .ioctl = line_ioctl, 137 .ioctl = line_ioctl,
138 .throttle = line_throttle,
139 .unthrottle = line_unthrottle,
151#if 0 140#if 0
152 .throttle = ssl_throttle,
153 .unthrottle = ssl_unthrottle,
154 .stop = ssl_stop, 141 .stop = ssl_stop,
155 .start = ssl_start, 142 .start = ssl_start,
156 .hangup = ssl_hangup, 143 .hangup = ssl_hangup,
@@ -183,7 +170,7 @@ static int ssl_console_setup(struct console *co, char *options)
183{ 170{
184 struct line *line = &serial_lines[co->index]; 171 struct line *line = &serial_lines[co->index];
185 172
186 return console_open_chan(line,co,&opts); 173 return console_open_chan(line, co, &opts);
187} 174}
188 175
189static struct console ssl_cons = { 176static struct console ssl_cons = {
@@ -199,12 +186,13 @@ int ssl_init(void)
199{ 186{
200 char *new_title; 187 char *new_title;
201 188
202 printk(KERN_INFO "Initializing software serial port version %d\n", 189 printk(KERN_INFO "Initializing software serial port version %d\n",
203 ssl_version); 190 ssl_version);
204 ssl_driver = line_register_devfs(&lines, &driver, &ssl_ops, 191 ssl_driver = line_register_devfs(&lines, &driver, &ssl_ops,
205 serial_lines, ARRAY_SIZE(serial_lines)); 192 serial_lines,
193 ARRAY_SIZE(serial_lines));
206 194
207 lines_init(serial_lines, sizeof(serial_lines)/sizeof(serial_lines[0])); 195 lines_init(serial_lines, ARRAY_SIZE(serial_lines), &opts);
208 196
209 new_title = add_xterm_umid(opts.xterm_title); 197 new_title = add_xterm_umid(opts.xterm_title);
210 if (new_title != NULL) 198 if (new_title != NULL)
@@ -212,7 +200,7 @@ int ssl_init(void)
212 200
213 ssl_init_done = 1; 201 ssl_init_done = 1;
214 register_console(&ssl_cons); 202 register_console(&ssl_cons);
215 return(0); 203 return 0;
216} 204}
217late_initcall(ssl_init); 205late_initcall(ssl_init);
218 206
@@ -220,16 +208,13 @@ static void ssl_exit(void)
220{ 208{
221 if (!ssl_init_done) 209 if (!ssl_init_done)
222 return; 210 return;
223 close_lines(serial_lines, 211 close_lines(serial_lines, ARRAY_SIZE(serial_lines));
224 sizeof(serial_lines)/sizeof(serial_lines[0]));
225} 212}
226__uml_exitcall(ssl_exit); 213__uml_exitcall(ssl_exit);
227 214
228static int ssl_chan_setup(char *str) 215static int ssl_chan_setup(char *str)
229{ 216{
230 return(line_setup(serial_lines, 217 return line_setup(serial_lines, ARRAY_SIZE(serial_lines), str);
231 sizeof(serial_lines)/sizeof(serial_lines[0]),
232 str, 1));
233} 218}
234 219
235__setup("ssl", ssl_chan_setup); 220__setup("ssl", ssl_chan_setup);
diff --git a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
index 005aa6333b6e..61db8b2fc83f 100644
--- a/arch/um/drivers/stdio_console.c
+++ b/arch/um/drivers/stdio_console.c
@@ -75,7 +75,7 @@ static struct line_driver driver = {
75 .name = "con", 75 .name = "con",
76 .config = con_config, 76 .config = con_config,
77 .get_config = con_get_config, 77 .get_config = con_get_config,
78 .id = line_id, 78 .id = line_id,
79 .remove = con_remove, 79 .remove = con_remove,
80 }, 80 },
81}; 81};
@@ -86,28 +86,27 @@ static struct lines console_lines = LINES_INIT(MAX_TTYS);
86 * individual elements are protected by individual semaphores. 86 * individual elements are protected by individual semaphores.
87 */ 87 */
88struct line vts[MAX_TTYS] = { LINE_INIT(CONFIG_CON_ZERO_CHAN, &driver), 88struct line vts[MAX_TTYS] = { LINE_INIT(CONFIG_CON_ZERO_CHAN, &driver),
89 [ 1 ... MAX_TTYS - 1 ] = 89 [ 1 ... MAX_TTYS - 1 ] =
90 LINE_INIT(CONFIG_CON_CHAN, &driver) }; 90 LINE_INIT(CONFIG_CON_CHAN, &driver) };
91 91
92static int con_config(char *str) 92static int con_config(char *str)
93{ 93{
94 return(line_config(vts, sizeof(vts)/sizeof(vts[0]), str)); 94 return line_config(vts, ARRAY_SIZE(vts), str, &opts);
95} 95}
96 96
97static int con_get_config(char *dev, char *str, int size, char **error_out) 97static int con_get_config(char *dev, char *str, int size, char **error_out)
98{ 98{
99 return(line_get_config(dev, vts, sizeof(vts)/sizeof(vts[0]), str, 99 return line_get_config(dev, vts, ARRAY_SIZE(vts), str, size, error_out);
100 size, error_out));
101} 100}
102 101
103static int con_remove(int n) 102static int con_remove(int n)
104{ 103{
105 return line_remove(vts, sizeof(vts)/sizeof(vts[0]), n); 104 return line_remove(vts, ARRAY_SIZE(vts), n);
106} 105}
107 106
108static int con_open(struct tty_struct *tty, struct file *filp) 107static int con_open(struct tty_struct *tty, struct file *filp)
109{ 108{
110 return line_open(vts, tty, &opts); 109 return line_open(vts, tty);
111} 110}
112 111
113static int con_init_done = 0; 112static int con_init_done = 0;
@@ -117,16 +116,18 @@ static struct tty_operations console_ops = {
117 .close = line_close, 116 .close = line_close,
118 .write = line_write, 117 .write = line_write,
119 .put_char = line_put_char, 118 .put_char = line_put_char,
120 .write_room = line_write_room, 119 .write_room = line_write_room,
121 .chars_in_buffer = line_chars_in_buffer, 120 .chars_in_buffer = line_chars_in_buffer,
122 .flush_buffer = line_flush_buffer, 121 .flush_buffer = line_flush_buffer,
123 .flush_chars = line_flush_chars, 122 .flush_chars = line_flush_chars,
124 .set_termios = line_set_termios, 123 .set_termios = line_set_termios,
125 .ioctl = line_ioctl, 124 .ioctl = line_ioctl,
125 .throttle = line_throttle,
126 .unthrottle = line_unthrottle,
126}; 127};
127 128
128static void uml_console_write(struct console *console, const char *string, 129static void uml_console_write(struct console *console, const char *string,
129 unsigned len) 130 unsigned len)
130{ 131{
131 struct line *line = &vts[console->index]; 132 struct line *line = &vts[console->index];
132 unsigned long flags; 133 unsigned long flags;
@@ -146,7 +147,7 @@ static int uml_console_setup(struct console *co, char *options)
146{ 147{
147 struct line *line = &vts[co->index]; 148 struct line *line = &vts[co->index];
148 149
149 return console_open_chan(line,co,&opts); 150 return console_open_chan(line, co, &opts);
150} 151}
151 152
152static struct console stdiocons = { 153static struct console stdiocons = {
@@ -156,7 +157,7 @@ static struct console stdiocons = {
156 .setup = uml_console_setup, 157 .setup = uml_console_setup,
157 .flags = CON_PRINTBUFFER, 158 .flags = CON_PRINTBUFFER,
158 .index = -1, 159 .index = -1,
159 .data = &vts, 160 .data = &vts,
160}; 161};
161 162
162int stdio_init(void) 163int stdio_init(void)
@@ -166,11 +167,11 @@ int stdio_init(void)
166 console_driver = line_register_devfs(&console_lines, &driver, 167 console_driver = line_register_devfs(&console_lines, &driver,
167 &console_ops, vts, 168 &console_ops, vts,
168 ARRAY_SIZE(vts)); 169 ARRAY_SIZE(vts));
169 if (NULL == console_driver) 170 if (console_driver == NULL)
170 return -1; 171 return -1;
171 printk(KERN_INFO "Initialized stdio console driver\n"); 172 printk(KERN_INFO "Initialized stdio console driver\n");
172 173
173 lines_init(vts, sizeof(vts)/sizeof(vts[0])); 174 lines_init(vts, ARRAY_SIZE(vts), &opts);
174 175
175 new_title = add_xterm_umid(opts.xterm_title); 176 new_title = add_xterm_umid(opts.xterm_title);
176 if(new_title != NULL) 177 if(new_title != NULL)
@@ -178,7 +179,7 @@ int stdio_init(void)
178 179
179 con_init_done = 1; 180 con_init_done = 1;
180 register_console(&stdiocons); 181 register_console(&stdiocons);
181 return(0); 182 return 0;
182} 183}
183late_initcall(stdio_init); 184late_initcall(stdio_init);
184 185
@@ -186,13 +187,13 @@ static void console_exit(void)
186{ 187{
187 if (!con_init_done) 188 if (!con_init_done)
188 return; 189 return;
189 close_lines(vts, sizeof(vts)/sizeof(vts[0])); 190 close_lines(vts, ARRAY_SIZE(vts));
190} 191}
191__uml_exitcall(console_exit); 192__uml_exitcall(console_exit);
192 193
193static int console_chan_setup(char *str) 194static int console_chan_setup(char *str)
194{ 195{
195 return(line_setup(vts, sizeof(vts)/sizeof(vts[0]), str, 1)); 196 return line_setup(vts, ARRAY_SIZE(vts), str);
196} 197}
197__setup("con", console_chan_setup); 198__setup("con", console_chan_setup);
198__channel_help(console_chan_setup, "con"); 199__channel_help(console_chan_setup, "con");
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 93898917cbe5..73f9652b2ee9 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -706,7 +706,7 @@ static int ubd_config(char *str)
706{ 706{
707 int n, err; 707 int n, err;
708 708
709 str = uml_strdup(str); 709 str = kstrdup(str, GFP_KERNEL);
710 if(str == NULL){ 710 if(str == NULL){
711 printk(KERN_ERR "ubd_config failed to strdup string\n"); 711 printk(KERN_ERR "ubd_config failed to strdup string\n");
712 return(1); 712 return(1);
@@ -1387,15 +1387,6 @@ int io_thread(void *arg)
1387 printk("io_thread - write failed, fd = %d, err = %d\n", 1387 printk("io_thread - write failed, fd = %d, err = %d\n",
1388 kernel_fd, -n); 1388 kernel_fd, -n);
1389 } 1389 }
1390}
1391 1390
1392/* 1391 return 0;
1393 * Overrides for Emacs so that we follow Linus's tabbing style. 1392}
1394 * Emacs will notice this stuff at the end of the file and automatically
1395 * adjust the settings for this buffer only. This must remain at the end
1396 * of the file.
1397 * ---------------------------------------------------------------------------
1398 * Local variables:
1399 * c-file-style: "linux"
1400 * End:
1401 */
diff --git a/arch/um/include/chan_kern.h b/arch/um/include/chan_kern.h
index da9a6717e7a4..1bb5e9d94270 100644
--- a/arch/um/include/chan_kern.h
+++ b/arch/um/include/chan_kern.h
@@ -14,21 +14,23 @@
14 14
15struct chan { 15struct chan {
16 struct list_head list; 16 struct list_head list;
17 struct list_head free_list;
18 struct line *line;
17 char *dev; 19 char *dev;
18 unsigned int primary:1; 20 unsigned int primary:1;
19 unsigned int input:1; 21 unsigned int input:1;
20 unsigned int output:1; 22 unsigned int output:1;
21 unsigned int opened:1; 23 unsigned int opened:1;
24 unsigned int enabled:1;
22 int fd; 25 int fd;
23 enum chan_init_pri pri;
24 struct chan_ops *ops; 26 struct chan_ops *ops;
25 void *data; 27 void *data;
26}; 28};
27 29
28extern void chan_interrupt(struct list_head *chans, struct work_struct *task, 30extern void chan_interrupt(struct list_head *chans, struct work_struct *task,
29 struct tty_struct *tty, int irq); 31 struct tty_struct *tty, int irq);
30extern int parse_chan_pair(char *str, struct list_head *chans, int pri, 32extern int parse_chan_pair(char *str, struct line *line, int device,
31 int device, struct chan_opts *opts); 33 struct chan_opts *opts);
32extern int open_chan(struct list_head *chans); 34extern int open_chan(struct list_head *chans);
33extern int write_chan(struct list_head *chans, const char *buf, int len, 35extern int write_chan(struct list_head *chans, const char *buf, int len,
34 int write_irq); 36 int write_irq);
@@ -36,9 +38,11 @@ extern int console_write_chan(struct list_head *chans, const char *buf,
36 int len); 38 int len);
37extern int console_open_chan(struct line *line, struct console *co, 39extern int console_open_chan(struct line *line, struct console *co,
38 struct chan_opts *opts); 40 struct chan_opts *opts);
39extern void close_chan(struct list_head *chans); 41extern void deactivate_chan(struct list_head *chans, int irq);
42extern void reactivate_chan(struct list_head *chans, int irq);
40extern void chan_enable_winch(struct list_head *chans, struct tty_struct *tty); 43extern void chan_enable_winch(struct list_head *chans, struct tty_struct *tty);
41extern void enable_chan(struct list_head *chans, struct tty_struct *tty); 44extern void enable_chan(struct line *line);
45extern void close_chan(struct list_head *chans, int delay_free_irq);
42extern int chan_window_size(struct list_head *chans, 46extern int chan_window_size(struct list_head *chans,
43 unsigned short *rows_out, 47 unsigned short *rows_out,
44 unsigned short *cols_out); 48 unsigned short *cols_out);
@@ -47,14 +51,3 @@ extern int chan_config_string(struct list_head *chans, char *str, int size,
47 char **error_out); 51 char **error_out);
48 52
49#endif 53#endif
50
51/*
52 * Overrides for Emacs so that we follow Linus's tabbing style.
53 * Emacs will notice this stuff at the end of the file and automatically
54 * adjust the settings for this buffer only. This must remain at the end
55 * of the file.
56 * ---------------------------------------------------------------------------
57 * Local variables:
58 * c-file-style: "linux"
59 * End:
60 */
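
struct chan now carries a back-pointer to its owning line, a free_list hook, and an enabled bit, while the pri field goes away; correspondingly, parse_chan_pair() takes the line instead of a bare list, enable_chan() takes the line, and close_chan() learns about delayed IRQ freeing (see the irq_user.c hunk further down). Under the new signatures a line gets wired up roughly like this (a sketch with a hypothetical caller, error paths shortened):

    static int setup_one_line(struct line *line, int n,
                              struct chan_opts *opts)
    {
            int err;

            err = parse_chan_pair(line->init_str, line, n, opts);
            if (err)
                    return err;

            err = open_chan(&line->chan_list);
            if (err)
                    return err;

            enable_chan(line);      /* now takes the line, not a list */
            return 0;
    }
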
diff --git a/arch/um/include/choose-mode.h b/arch/um/include/choose-mode.h
index f25fa83a5da6..b87b36a87d91 100644
--- a/arch/um/include/choose-mode.h
+++ b/arch/um/include/choose-mode.h
@@ -23,6 +23,9 @@ static inline void *__choose_mode(void *tt, void *skas) {
23 23
24#elif defined(UML_CONFIG_MODE_TT) 24#elif defined(UML_CONFIG_MODE_TT)
25#define CHOOSE_MODE(tt, skas) (tt) 25#define CHOOSE_MODE(tt, skas) (tt)
26
27#else
28#error CONFIG_MODE_SKAS and CONFIG_MODE_TT are both disabled
26#endif 29#endif
27 30
28#define CHOOSE_MODE_PROC(tt, skas, args...) \ 31#define CHOOSE_MODE_PROC(tt, skas, args...) \
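
The new #error branch in choose-mode.h turns a silent misconfiguration into an immediate, readable build failure: with both CONFIG_MODE_TT and CONFIG_MODE_SKAS disabled, CHOOSE_MODE was previously left undefined and every call site failed with confusing errors. For context, CHOOSE_MODE picks the tt- or skas-mode implementation of an operation; a typical call site looks like this (sketch, hypothetical function names):

    void flush_thread(void)
    {
            /* runtime dispatch when both modes are built in,
             * a direct call when only one is configured */
            CHOOSE_MODE(flush_thread_tt(), flush_thread_skas());
    }
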
diff --git a/arch/um/include/irq_user.h b/arch/um/include/irq_user.h
index f724b717213f..b61deb8b362a 100644
--- a/arch/um/include/irq_user.h
+++ b/arch/um/include/irq_user.h
@@ -18,19 +18,8 @@ extern int deactivate_all_fds(void);
18extern void forward_interrupts(int pid); 18extern void forward_interrupts(int pid);
19extern void init_irq_signals(int on_sigstack); 19extern void init_irq_signals(int on_sigstack);
20extern void forward_ipi(int fd, int pid); 20extern void forward_ipi(int fd, int pid);
21extern void free_irq_later(int irq, void *dev_id);
22extern int activate_ipi(int fd, int pid); 21extern int activate_ipi(int fd, int pid);
23extern unsigned long irq_lock(void); 22extern unsigned long irq_lock(void);
24extern void irq_unlock(unsigned long flags); 23extern void irq_unlock(unsigned long flags);
25#endif
26 24
27/* 25#endif
28 * Overrides for Emacs so that we follow Linus's tabbing style.
29 * Emacs will notice this stuff at the end of the file and automatically
30 * adjust the settings for this buffer only. This must remain at the end
31 * of the file.
32 * ---------------------------------------------------------------------------
33 * Local variables:
34 * c-file-style: "linux"
35 * End:
36 */
diff --git a/arch/um/include/kern.h b/arch/um/include/kern.h
index 1e3170768b5c..7d223beccbc0 100644
--- a/arch/um/include/kern.h
+++ b/arch/um/include/kern.h
@@ -17,7 +17,7 @@ extern int errno;
17 17
18extern int clone(int (*proc)(void *), void *sp, int flags, void *data); 18extern int clone(int (*proc)(void *), void *sp, int flags, void *data);
19extern int sleep(int); 19extern int sleep(int);
20extern int printf(char *fmt, ...); 20extern int printf(const char *fmt, ...);
21extern char *strerror(int errnum); 21extern char *strerror(int errnum);
22extern char *ptsname(int __fd); 22extern char *ptsname(int __fd);
23extern int munmap(void *, int); 23extern int munmap(void *, int);
@@ -35,15 +35,6 @@ extern int read(unsigned int, char *, int);
35extern int pipe(int *); 35extern int pipe(int *);
36extern int sched_yield(void); 36extern int sched_yield(void);
37extern int ptrace(int op, int pid, long addr, long data); 37extern int ptrace(int op, int pid, long addr, long data);
38
38#endif 39#endif
39 40
40/*
41 * Overrides for Emacs so that we follow Linus's tabbing style.
42 * Emacs will notice this stuff at the end of the file and automatically
43 * adjust the settings for this buffer only. This must remain at the end
44 * of the file.
45 * ---------------------------------------------------------------------------
46 * Local variables:
47 * c-file-style: "linux"
48 * End:
49 */
diff --git a/arch/um/include/line.h b/arch/um/include/line.h
index 5323d22a6ca7..6f4d680dc1d4 100644
--- a/arch/um/include/line.h
+++ b/arch/um/include/line.h
@@ -32,11 +32,13 @@ struct line_driver {
32}; 32};
33 33
34struct line { 34struct line {
35 struct tty_struct *tty;
35 char *init_str; 36 char *init_str;
36 int init_pri; 37 int init_pri;
37 struct list_head chan_list; 38 struct list_head chan_list;
38 int valid; 39 int valid;
39 int count; 40 int count;
41 int throttled;
40 /*This lock is actually, mostly, local to*/ 42 /*This lock is actually, mostly, local to*/
41 spinlock_t lock; 43 spinlock_t lock;
42 44
@@ -58,14 +60,15 @@ struct line {
58#define LINE_INIT(str, d) \ 60#define LINE_INIT(str, d) \
59 { init_str : str, \ 61 { init_str : str, \
60 init_pri : INIT_STATIC, \ 62 init_pri : INIT_STATIC, \
61 chan_list : { }, \
62 valid : 1, \ 63 valid : 1, \
64 throttled : 0, \
65 lock : SPIN_LOCK_UNLOCKED, \
63 buffer : NULL, \ 66 buffer : NULL, \
64 head : NULL, \ 67 head : NULL, \
65 tail : NULL, \ 68 tail : NULL, \
66 sigio : 0, \ 69 sigio : 0, \
67 driver : d, \ 70 driver : d, \
68 have_irq : 0 } 71 have_irq : 0 }
69 72
70struct lines { 73struct lines {
71 int num; 74 int num;
@@ -74,11 +77,11 @@ struct lines {
74#define LINES_INIT(n) { num : n } 77#define LINES_INIT(n) { num : n }
75 78
76extern void line_close(struct tty_struct *tty, struct file * filp); 79extern void line_close(struct tty_struct *tty, struct file * filp);
77extern int line_open(struct line *lines, struct tty_struct *tty, 80extern int line_open(struct line *lines, struct tty_struct *tty);
78 struct chan_opts *opts); 81extern int line_setup(struct line *lines, unsigned int sizeof_lines,
79extern int line_setup(struct line *lines, unsigned int sizeof_lines, char *init, 82 char *init);
80 int all_allowed); 83extern int line_write(struct tty_struct *tty, const unsigned char *buf,
81extern int line_write(struct tty_struct *tty, const unsigned char *buf, int len); 84 int len);
82extern void line_put_char(struct tty_struct *tty, unsigned char ch); 85extern void line_put_char(struct tty_struct *tty, unsigned char ch);
83extern void line_set_termios(struct tty_struct *tty, struct termios * old); 86extern void line_set_termios(struct tty_struct *tty, struct termios * old);
84extern int line_chars_in_buffer(struct tty_struct *tty); 87extern int line_chars_in_buffer(struct tty_struct *tty);
@@ -87,23 +90,27 @@ extern void line_flush_chars(struct tty_struct *tty);
87extern int line_write_room(struct tty_struct *tty); 90extern int line_write_room(struct tty_struct *tty);
88extern int line_ioctl(struct tty_struct *tty, struct file * file, 91extern int line_ioctl(struct tty_struct *tty, struct file * file,
89 unsigned int cmd, unsigned long arg); 92 unsigned int cmd, unsigned long arg);
93extern void line_throttle(struct tty_struct *tty);
94extern void line_unthrottle(struct tty_struct *tty);
90 95
91extern char *add_xterm_umid(char *base); 96extern char *add_xterm_umid(char *base);
92extern int line_setup_irq(int fd, int input, int output, struct tty_struct *tty); 97extern int line_setup_irq(int fd, int input, int output, struct line *line,
98 void *data);
93extern void line_close_chan(struct line *line); 99extern void line_close_chan(struct line *line);
94extern void line_disable(struct tty_struct *tty, int current_irq); 100extern struct tty_driver * line_register_devfs(struct lines *set,
95extern struct tty_driver * line_register_devfs(struct lines *set, 101 struct line_driver *line_driver,
96 struct line_driver *line_driver,
97 struct tty_operations *driver, 102 struct tty_operations *driver,
98 struct line *lines, 103 struct line *lines,
99 int nlines); 104 int nlines);
100extern void lines_init(struct line *lines, int nlines); 105extern void lines_init(struct line *lines, int nlines, struct chan_opts *opts);
101extern void close_lines(struct line *lines, int nlines); 106extern void close_lines(struct line *lines, int nlines);
102 107
103extern int line_config(struct line *lines, unsigned int sizeof_lines, char *str); 108extern int line_config(struct line *lines, unsigned int sizeof_lines,
109 char *str, struct chan_opts *opts);
104extern int line_id(char **str, int *start_out, int *end_out); 110extern int line_id(char **str, int *start_out, int *end_out);
105extern int line_remove(struct line *lines, unsigned int sizeof_lines, int n); 111extern int line_remove(struct line *lines, unsigned int sizeof_lines, int n);
106extern int line_get_config(char *dev, struct line *lines, unsigned int sizeof_lines, char *str, 112extern int line_get_config(char *dev, struct line *lines,
113 unsigned int sizeof_lines, char *str,
107 int size, char **error_out); 114 int size, char **error_out);
108 115
109#endif 116#endif
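
struct line gains a tty back-pointer and a throttled flag, LINE_INIT() now initializes the spinlock statically (the old initializer left lock untouched and chan_list as an empty brace pair), and line_throttle()/line_unthrottle() join the exported tty hooks. Their bodies live in line.c, which is not part of this section; judging from the declarations here they plausibly look like this (a sketch, an assumption rather than the actual implementation):

    /* stop servicing the host fd while the tty buffer is full */
    void line_throttle(struct tty_struct *tty)
    {
            struct line *line = tty->driver_data;

            deactivate_chan(&line->chan_list, line->driver->read_irq);
            line->throttled = 1;
    }

    void line_unthrottle(struct tty_struct *tty)
    {
            struct line *line = tty->driver_data;

            line->throttled = 0;
            reactivate_chan(&line->chan_list, line->driver->read_irq);
    }
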
diff --git a/arch/um/include/mconsole.h b/arch/um/include/mconsole.h
index b1b512f47035..58f67d391105 100644
--- a/arch/um/include/mconsole.h
+++ b/arch/um/include/mconsole.h
@@ -32,7 +32,7 @@ struct mconsole_reply {
32 32
33struct mconsole_notify { 33struct mconsole_notify {
34 u32 magic; 34 u32 magic;
35 u32 version; 35 u32 version;
36 enum { MCONSOLE_SOCKET, MCONSOLE_PANIC, MCONSOLE_HANG, 36 enum { MCONSOLE_SOCKET, MCONSOLE_PANIC, MCONSOLE_HANG,
37 MCONSOLE_USER_NOTIFY } type; 37 MCONSOLE_USER_NOTIFY } type;
38 u32 len; 38 u32 len;
@@ -66,7 +66,9 @@ struct mc_request
66extern char mconsole_socket_name[]; 66extern char mconsole_socket_name[];
67 67
68extern int mconsole_unlink_socket(void); 68extern int mconsole_unlink_socket(void);
69extern int mconsole_reply(struct mc_request *req, char *reply, int err, 69extern int mconsole_reply_len(struct mc_request *req, const char *reply,
70 int len, int err, int more);
71extern int mconsole_reply(struct mc_request *req, const char *str, int err,
70 int more); 72 int more);
71 73
72extern void mconsole_version(struct mc_request *req); 74extern void mconsole_version(struct mc_request *req);
@@ -84,7 +86,7 @@ extern void mconsole_proc(struct mc_request *req);
84extern void mconsole_stack(struct mc_request *req); 86extern void mconsole_stack(struct mc_request *req);
85 87
86extern int mconsole_get_request(int fd, struct mc_request *req); 88extern int mconsole_get_request(int fd, struct mc_request *req);
87extern int mconsole_notify(char *sock_name, int type, const void *data, 89extern int mconsole_notify(char *sock_name, int type, const void *data,
88 int len); 90 int len);
89extern char *mconsole_notify_socket(void); 91extern char *mconsole_notify_socket(void);
90extern void lock_notify(void); 92extern void lock_notify(void);
diff --git a/arch/um/include/os.h b/arch/um/include/os.h
index 2cccfa5b8ab5..c279ee6d89e4 100644
--- a/arch/um/include/os.h
+++ b/arch/um/include/os.h
@@ -213,15 +213,10 @@ extern int run_helper_thread(int (*proc)(void *), void *arg,
213 int stack_order); 213 int stack_order);
214extern int helper_wait(int pid); 214extern int helper_wait(int pid);
215 215
216#endif 216/* umid.c */
217 217
218/* 218extern int umid_file_name(char *name, char *buf, int len);
219 * Overrides for Emacs so that we follow Linus's tabbing style. 219extern int set_umid(char *name);
220 * Emacs will notice this stuff at the end of the file and automatically 220extern char *get_umid(void);
221 * adjust the settings for this buffer only. This must remain at the end 221
222 * of the file. 222#endif
223 * ---------------------------------------------------------------------------
224 * Local variables:
225 * c-file-style: "linux"
226 * End:
227 */
diff --git a/arch/um/include/user_util.h b/arch/um/include/user_util.h
index bb505e01d994..b9984003e603 100644
--- a/arch/um/include/user_util.h
+++ b/arch/um/include/user_util.h
@@ -64,7 +64,6 @@ extern void setup_machinename(char *machine_out);
64extern void setup_hostinfo(void); 64extern void setup_hostinfo(void);
65extern void do_exec(int old_pid, int new_pid); 65extern void do_exec(int old_pid, int new_pid);
66extern void tracer_panic(char *msg, ...); 66extern void tracer_panic(char *msg, ...);
67extern char *get_umid(int only_if_set);
68extern void do_longjmp(void *p, int val); 67extern void do_longjmp(void *p, int val);
69extern int detach(int pid, int sig); 68extern int detach(int pid, int sig);
70extern int attach(int pid); 69extern int attach(int pid);
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 3de9d21e36bf..6f7700593a6f 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -10,8 +10,8 @@ obj-y = config.o exec_kern.o exitcode.o \
10 init_task.o irq.o irq_user.o ksyms.o mem.o physmem.o \ 10 init_task.o irq.o irq_user.o ksyms.o mem.o physmem.o \
11 process_kern.o ptrace.o reboot.o resource.o sigio_user.o sigio_kern.o \ 11 process_kern.o ptrace.o reboot.o resource.o sigio_user.o sigio_kern.o \
12 signal_kern.o signal_user.o smp.o syscall_kern.o sysrq.o time.o \ 12 signal_kern.o signal_user.o smp.o syscall_kern.o sysrq.o time.o \
13 time_kern.o tlb.o trap_kern.o trap_user.o uaccess.o um_arch.o \ 13 time_kern.o tlb.o trap_kern.o trap_user.o uaccess.o um_arch.o umid.o \
14 umid.o user_util.o 14 user_util.o
15 15
16obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o 16obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o
17obj-$(CONFIG_GPROF) += gprof_syms.o 17obj-$(CONFIG_GPROF) += gprof_syms.o
@@ -24,7 +24,7 @@ obj-$(CONFIG_MODE_SKAS) += skas/
24 24
25user-objs-$(CONFIG_TTY_LOG) += tty_log.o 25user-objs-$(CONFIG_TTY_LOG) += tty_log.o
26 26
27USER_OBJS := $(user-objs-y) config.o time.o tty_log.o umid.o user_util.o 27USER_OBJS := $(user-objs-y) config.o time.o tty_log.o user_util.o
28 28
29include arch/um/scripts/Makefile.rules 29include arch/um/scripts/Makefile.rules
30 30
diff --git a/arch/um/kernel/irq_user.c b/arch/um/kernel/irq_user.c
index c3ccaf24f3e0..50a2aa35cda9 100644
--- a/arch/um/kernel/irq_user.c
+++ b/arch/um/kernel/irq_user.c
@@ -29,7 +29,6 @@ struct irq_fd {
29 int pid; 29 int pid;
30 int events; 30 int events;
31 int current_events; 31 int current_events;
32 int freed;
33}; 32};
34 33
35static struct irq_fd *active_fds = NULL; 34static struct irq_fd *active_fds = NULL;
@@ -41,9 +40,11 @@ static int pollfds_size = 0;
41 40
42extern int io_count, intr_count; 41extern int io_count, intr_count;
43 42
43extern void free_irqs(void);
44
44void sigio_handler(int sig, union uml_pt_regs *regs) 45void sigio_handler(int sig, union uml_pt_regs *regs)
45{ 46{
46 struct irq_fd *irq_fd, *next; 47 struct irq_fd *irq_fd;
47 int i, n; 48 int i, n;
48 49
49 if(smp_sigio_handler()) return; 50 if(smp_sigio_handler()) return;
@@ -66,29 +67,15 @@ void sigio_handler(int sig, union uml_pt_regs *regs)
66 irq_fd = irq_fd->next; 67 irq_fd = irq_fd->next;
67 } 68 }
68 69
69 for(irq_fd = active_fds; irq_fd != NULL; irq_fd = next){ 70 for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){
70 next = irq_fd->next;
71 if(irq_fd->current_events != 0){ 71 if(irq_fd->current_events != 0){
72 irq_fd->current_events = 0; 72 irq_fd->current_events = 0;
73 do_IRQ(irq_fd->irq, regs); 73 do_IRQ(irq_fd->irq, regs);
74
75 /* This is here because the next irq may be
76 * freed in the handler. If a console goes
77 * away, both the read and write irqs will be
78 * freed. After do_IRQ, ->next will point to
79 * a good IRQ.
80 * Irqs can't be freed inside their handlers,
81 * so the next best thing is to have them
82 * marked as needing freeing, so that they
83 * can be freed here.
84 */
85 next = irq_fd->next;
86 if(irq_fd->freed){
87 free_irq(irq_fd->irq, irq_fd->id);
88 }
89 } 74 }
90 } 75 }
91 } 76 }
77
78 free_irqs();
92} 79}
93 80
94int activate_ipi(int fd, int pid) 81int activate_ipi(int fd, int pid)
@@ -136,8 +123,7 @@ int activate_fd(int irq, int fd, int type, void *dev_id)
136 .irq = irq, 123 .irq = irq,
137 .pid = pid, 124 .pid = pid,
138 .events = events, 125 .events = events,
139 .current_events = 0, 126 .current_events = 0 } );
140 .freed = 0 } );
141 127
142 /* Critical section - locked by a spinlock because this stuff can 128 /* Critical section - locked by a spinlock because this stuff can
143 * be changed from interrupt handlers. The stuff above is done 129 * be changed from interrupt handlers. The stuff above is done
@@ -313,26 +299,6 @@ static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
313 return(irq); 299 return(irq);
314} 300}
315 301
316void free_irq_later(int irq, void *dev_id)
317{
318 struct irq_fd *irq_fd;
319 unsigned long flags;
320
321 flags = irq_lock();
322 for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){
323 if((irq_fd->irq == irq) && (irq_fd->id == dev_id))
324 break;
325 }
326 if(irq_fd == NULL){
327 printk("free_irq_later found no irq, irq = %d, "
328 "dev_id = 0x%p\n", irq, dev_id);
329 goto out;
330 }
331 irq_fd->freed = 1;
332 out:
333 irq_unlock(flags);
334}
335
336void reactivate_fd(int fd, int irqnum) 302void reactivate_fd(int fd, int irqnum)
337{ 303{
338 struct irq_fd *irq; 304 struct irq_fd *irq;
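
The irq_user.c hunk replaces the fragile per-descriptor freed flag: IRQ handlers must not call free_irq() on themselves, and the old workaround (mark the descriptor, re-read ->next, free inside the sigio loop) is deleted in favour of queueing the frees and draining them via free_irqs() once the dispatch loop is done. free_irqs() itself lives in the kernel-side irq code, outside this section; the shape is the usual defer-out-of-the-handler list (a sketch with hypothetical types and names, allocation and locking elided):

    #include <linux/list.h>
    #include <linux/interrupt.h>

    struct deferred_free {          /* hypothetical bookkeeping type */
            struct list_head list;
            int irq;
            void *dev_id;
    };

    static LIST_HEAD(irqs_to_free);

    /* called where a handler would otherwise free_irq() itself */
    static void queue_irq_free(struct deferred_free *d)
    {
            list_add(&d->list, &irqs_to_free);
    }

    /* called from sigio_handler() after the dispatch loop */
    void free_irqs(void)
    {
            struct deferred_free *d, *next;

            list_for_each_entry_safe(d, next, &irqs_to_free, list) {
                    list_del(&d->list);
                    free_irq(d->irq, d->dev_id);
            }
    }
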
diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c
index 34b54a3e2132..651abf255bc5 100644
--- a/arch/um/kernel/process_kern.c
+++ b/arch/um/kernel/process_kern.c
@@ -324,10 +324,6 @@ int user_context(unsigned long sp)
324 return(stack != (unsigned long) current_thread); 324 return(stack != (unsigned long) current_thread);
325} 325}
326 326
327extern void remove_umid_dir(void);
328
329__uml_exitcall(remove_umid_dir);
330
331extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; 327extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end;
332 328
333void do_uml_exitcalls(void) 329void do_uml_exitcalls(void)
diff --git a/arch/um/kernel/sigio_user.c b/arch/um/kernel/sigio_user.c
index 48b1f644b9a6..62e5cfdf2188 100644
--- a/arch/um/kernel/sigio_user.c
+++ b/arch/um/kernel/sigio_user.c
@@ -216,6 +216,8 @@ static int write_sigio_thread(void *unused)
216 "err = %d\n", -n); 216 "err = %d\n", -n);
217 } 217 }
218 } 218 }
219
220 return 0;
219} 221}
220 222
221static int need_poll(int n) 223static int need_poll(int n)
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 142a9493912b..26626b2b9172 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -146,8 +146,8 @@ void set_cmdline(char *cmd)
146 146
147 if(CHOOSE_MODE(honeypot, 0)) return; 147 if(CHOOSE_MODE(honeypot, 0)) return;
148 148
149 umid = get_umid(1); 149 umid = get_umid();
150 if(umid != NULL){ 150 if(*umid != '\0'){
151 snprintf(argv1_begin, 151 snprintf(argv1_begin,
152 (argv1_end - argv1_begin) * sizeof(*ptr), 152 (argv1_end - argv1_begin) * sizeof(*ptr),
153 "(%s) ", umid); 153 "(%s) ", umid);
diff --git a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c
index 0b21d59ba0cd..4eaee823bfd2 100644
--- a/arch/um/kernel/umid.c
+++ b/arch/um/kernel/umid.c
@@ -3,61 +3,30 @@
3 * Licensed under the GPL 3 * Licensed under the GPL
4 */ 4 */
5 5
6#include <stdio.h> 6#include "asm/errno.h"
7#include <unistd.h>
8#include <errno.h>
9#include <string.h>
10#include <stdlib.h>
11#include <dirent.h>
12#include <signal.h>
13#include <sys/stat.h>
14#include <sys/param.h>
15#include "user.h"
16#include "umid.h"
17#include "init.h" 7#include "init.h"
18#include "os.h" 8#include "os.h"
19#include "user_util.h" 9#include "kern.h"
20#include "choose-mode.h" 10#include "linux/kernel.h"
21 11
22#define UMID_LEN 64 12/* Changed by set_umid_arg */
23#define UML_DIR "~/.uml/"
24
25/* Changed by set_umid and make_umid, which are run early in boot */
26static char umid[UMID_LEN] = { 0 };
27
28/* Changed by set_uml_dir and make_uml_dir, which are run early in boot */
29static char *uml_dir = UML_DIR;
30
31/* Changed by set_umid */
32static int umid_is_random = 1;
33static int umid_inited = 0; 13static int umid_inited = 0;
34/* Have we created the files? Should we remove them? */
35static int umid_owned = 0;
36 14
37static int make_umid(int (*printer)(const char *fmt, ...)); 15static int __init set_umid_arg(char *name, int *add)
38
39static int __init set_umid(char *name, int is_random,
40 int (*printer)(const char *fmt, ...))
41{ 16{
42 if(umid_inited){ 17 int err;
43 (*printer)("Unique machine name can't be set twice\n");
44 return(-1);
45 }
46 18
47 if(strlen(name) > UMID_LEN - 1) 19 if(umid_inited)
48 (*printer)("Unique machine name is being truncated to %d " 20 return 0;
49 "characters\n", UMID_LEN);
50 strlcpy(umid, name, sizeof(umid));
51 21
52 umid_is_random = is_random;
53 umid_inited = 1;
54 return 0;
55}
56
57static int __init set_umid_arg(char *name, int *add)
58{
59 *add = 0; 22 *add = 0;
60 return(set_umid(name, 0, printf)); 23 err = set_umid(name);
24 if(err == -EEXIST)
25 printf("umid '%s' already in use\n", name);
26 else if(!err)
27 umid_inited = 1;
28
29 return 0;
61} 30}
62 31
63__uml_setup("umid=", set_umid_arg, 32__uml_setup("umid=", set_umid_arg,
@@ -66,265 +35,3 @@ __uml_setup("umid=", set_umid_arg,
66" is used for naming the pid file and management console socket.\n\n" 35" is used for naming the pid file and management console socket.\n\n"
67); 36);
68 37
69int __init umid_file_name(char *name, char *buf, int len)
70{
71 int n;
72
73 if(!umid_inited && make_umid(printk)) return(-1);
74
75 n = strlen(uml_dir) + strlen(umid) + strlen(name) + 1;
76 if(n > len){
77 printk("umid_file_name : buffer too short\n");
78 return(-1);
79 }
80
81 sprintf(buf, "%s%s/%s", uml_dir, umid, name);
82 return(0);
83}
84
85extern int tracing_pid;
86
87static void __init create_pid_file(void)
88{
89 char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
90 char pid[sizeof("nnnnn\0")];
91 int fd, n;
92
93 if(umid_file_name("pid", file, sizeof(file)))
94 return;
95
96 fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))),
97 0644);
98 if(fd < 0){
99 printf("Open of machine pid file \"%s\" failed: %s\n",
100 file, strerror(-fd));
101 return;
102 }
103
104 sprintf(pid, "%d\n", os_getpid());
105 n = os_write_file(fd, pid, strlen(pid));
106 if(n != strlen(pid))
107 printf("Write of pid file failed - err = %d\n", -n);
108 os_close_file(fd);
109}
110
111static int actually_do_remove(char *dir)
112{
113 DIR *directory;
114 struct dirent *ent;
115 int len;
116 char file[256];
117
118 directory = opendir(dir);
119 if(directory == NULL){
120 printk("actually_do_remove : couldn't open directory '%s', "
121 "errno = %d\n", dir, errno);
122 return(1);
123 }
124 while((ent = readdir(directory)) != NULL){
125 if(!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
126 continue;
127 len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1;
128 if(len > sizeof(file)){
129 printk("Not deleting '%s' from '%s' - name too long\n",
130 ent->d_name, dir);
131 continue;
132 }
133 sprintf(file, "%s/%s", dir, ent->d_name);
134 if(unlink(file) < 0){
135 printk("actually_do_remove : couldn't remove '%s' "
136 "from '%s', errno = %d\n", ent->d_name, dir,
137 errno);
138 return(1);
139 }
140 }
141 if(rmdir(dir) < 0){
142 printk("actually_do_remove : couldn't rmdir '%s', "
143 "errno = %d\n", dir, errno);
144 return(1);
145 }
146 return(0);
147}
148
149void remove_umid_dir(void)
150{
151 char dir[strlen(uml_dir) + UMID_LEN + 1];
152 if (!umid_owned)
153 return;
154
155 sprintf(dir, "%s%s", uml_dir, umid);
156 actually_do_remove(dir);
157}
158
159char *get_umid(int only_if_set)
160{
161 if(only_if_set && umid_is_random)
162 return NULL;
163 return umid;
164}
165
166static int not_dead_yet(char *dir)
167{
168 char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
169 char pid[sizeof("nnnnn\0")], *end;
170 int dead, fd, p, n;
171
172 sprintf(file, "%s/pid", dir);
173 dead = 0;
174 fd = os_open_file(file, of_read(OPENFLAGS()), 0);
175 if(fd < 0){
176 if(fd != -ENOENT){
177 printk("not_dead_yet : couldn't open pid file '%s', "
178 "err = %d\n", file, -fd);
179 return(1);
180 }
181 dead = 1;
182 }
183 if(fd > 0){
184 n = os_read_file(fd, pid, sizeof(pid));
185 if(n < 0){
186 printk("not_dead_yet : couldn't read pid file '%s', "
187 "err = %d\n", file, -n);
188 return(1);
189 }
190 p = strtoul(pid, &end, 0);
191 if(end == pid){
192 printk("not_dead_yet : couldn't parse pid file '%s', "
193 "errno = %d\n", file, errno);
194 dead = 1;
195 }
196 if(((kill(p, 0) < 0) && (errno == ESRCH)) ||
197 (p == CHOOSE_MODE(tracing_pid, os_getpid())))
198 dead = 1;
199 }
200 if(!dead)
201 return(1);
202 return(actually_do_remove(dir));
203}
204
205static int __init set_uml_dir(char *name, int *add)
206{
207 if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){
208 uml_dir = malloc(strlen(name) + 2);
209 if(uml_dir == NULL){
210 printf("Failed to malloc uml_dir - error = %d\n",
211 errno);
212 uml_dir = name;
213 /* Return 0 here because do_initcalls doesn't look at
214 * the return value.
215 */
216 return(0);
217 }
218 sprintf(uml_dir, "%s/", name);
219 }
220 else uml_dir = name;
221 return(0);
222}
223
224static int __init make_uml_dir(void)
225{
226 char dir[MAXPATHLEN + 1] = { '\0' };
227 int len;
228
229 if(*uml_dir == '~'){
230 char *home = getenv("HOME");
231
232 if(home == NULL){
233 printf("make_uml_dir : no value in environment for "
234 "$HOME\n");
235 exit(1);
236 }
237 strlcpy(dir, home, sizeof(dir));
238 uml_dir++;
239 }
240 strlcat(dir, uml_dir, sizeof(dir));
241 len = strlen(dir);
242 if (len > 0 && dir[len - 1] != '/')
243 strlcat(dir, "/", sizeof(dir));
244
245 uml_dir = malloc(strlen(dir) + 1);
246 if (uml_dir == NULL) {
247 printf("make_uml_dir : malloc failed, errno = %d\n", errno);
248 exit(1);
249 }
250 strcpy(uml_dir, dir);
251
252 if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){
253 printf("Failed to mkdir %s: %s\n", uml_dir, strerror(errno));
254 return(-1);
255 }
256 return 0;
257}
258
259static int __init make_umid(int (*printer)(const char *fmt, ...))
260{
261 int fd, err;
262 char tmp[strlen(uml_dir) + UMID_LEN + 1];
263
264 strlcpy(tmp, uml_dir, sizeof(tmp));
265
266 if(!umid_inited){
267 strcat(tmp, "XXXXXX");
268 fd = mkstemp(tmp);
269 if(fd < 0){
270 (*printer)("make_umid - mkstemp(%s) failed: %s\n",
271 tmp,strerror(errno));
272 return(1);
273 }
274
275 os_close_file(fd);
276 /* There's a nice tiny little race between this unlink and
277 * the mkdir below. It'd be nice if there were a mkstemp
278 * for directories.
279 */
280 unlink(tmp);
281 set_umid(&tmp[strlen(uml_dir)], 1, printer);
282 }
283
284 sprintf(tmp, "%s%s", uml_dir, umid);
285
286 err = mkdir(tmp, 0777);
287 if(err < 0){
288 if(errno == EEXIST){
289 if(not_dead_yet(tmp)){
290 (*printer)("umid '%s' is in use\n", umid);
291 umid_owned = 0;
292 return(-1);
293 }
294 err = mkdir(tmp, 0777);
295 }
296 }
297 if(err < 0){
298 (*printer)("Failed to create %s - errno = %d\n", umid, errno);
299 return(-1);
300 }
301
302 umid_owned = 1;
303 return 0;
304}
305
306__uml_setup("uml_dir=", set_uml_dir,
307"uml_dir=<directory>\n"
308" The location to place the pid and umid files.\n\n"
309);
310
311static int __init make_umid_setup(void)
312{
313 /* one function with the ordering we need ... */
314 make_uml_dir();
315 make_umid(printf);
316 create_pid_file();
317 return 0;
318}
319__uml_postsetup(make_umid_setup);
320
321/*
322 * Overrides for Emacs so that we follow Linus's tabbing style.
323 * Emacs will notice this stuff at the end of the file and automatically
324 * adjust the settings for this buffer only. This must remain at the end
325 * of the file.
326 * ---------------------------------------------------------------------------
327 * Local variables:
328 * c-file-style: "linux"
329 * End:
330 */
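
The umid code is split along the kernel/userspace boundary: everything that needs libc (mkstemp, opendir, kill, getenv) moves to a new arch/um/os-Linux/umid.c (see the Makefile hunks adding umid.o there and dropping it from the kernel USER_OBJS list), while this file keeps only the kernel-side "umid=" option parsing. set_umid(), umid_file_name() and get_umid() are now declared in os.h, and set_umid() reports a name collision as -EEXIST rather than printing from inside the helper. Consumers are unchanged in spirit; a typical one builds paths under the umid directory like this (sketch, based on the removed create_pid_file()):

    char file[256];
    int fd;

    if (umid_file_name("pid", file, sizeof(file)))
            return -1;

    /* file is now "<uml_dir><umid>/pid" */
    fd = os_open_file(file, of_rdwr(OPENFLAGS()), 0);
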
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile
index b83ac8e21c35..11e30b13e318 100644
--- a/arch/um/os-Linux/Makefile
+++ b/arch/um/os-Linux/Makefile
@@ -4,11 +4,11 @@
4# 4#
5 5
6obj-y = aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \ 6obj-y = aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \
7 start_up.o time.o tt.o tty.o uaccess.o user_syms.o drivers/ \ 7 start_up.o time.o tt.o tty.o uaccess.o umid.o user_syms.o drivers/ \
8 sys-$(SUBARCH)/ 8 sys-$(SUBARCH)/
9 9
10USER_OBJS := aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \ 10USER_OBJS := aio.o elf_aux.o file.o helper.o main.o mem.o process.o signal.o \
11 start_up.o time.o tt.o tty.o uaccess.o 11 start_up.o time.o tt.o tty.o uaccess.o umid.o
12 12
13elf_aux.o: $(ARCH_DIR)/kernel-offsets.h 13elf_aux.o: $(ARCH_DIR)/kernel-offsets.h
14CFLAGS_elf_aux.o += -I$(objtree)/arch/um 14CFLAGS_elf_aux.o += -I$(objtree)/arch/um
diff --git a/arch/um/os-Linux/aio.c b/arch/um/os-Linux/aio.c
index ffa759addd3c..f897140cc4ae 100644
--- a/arch/um/os-Linux/aio.c
+++ b/arch/um/os-Linux/aio.c
@@ -16,12 +16,12 @@
16#include "mode.h" 16#include "mode.h"
17 17
18struct aio_thread_req { 18struct aio_thread_req {
19 enum aio_type type; 19 enum aio_type type;
20 int io_fd; 20 int io_fd;
21 unsigned long long offset; 21 unsigned long long offset;
22 char *buf; 22 char *buf;
23 int len; 23 int len;
24 struct aio_context *aio; 24 struct aio_context *aio;
25}; 25};
26 26
27static int aio_req_fd_r = -1; 27static int aio_req_fd_r = -1;
@@ -38,18 +38,18 @@ static int aio_req_fd_w = -1;
38 38
39static long io_setup(int n, aio_context_t *ctxp) 39static long io_setup(int n, aio_context_t *ctxp)
40{ 40{
41 return syscall(__NR_io_setup, n, ctxp); 41 return syscall(__NR_io_setup, n, ctxp);
42} 42}
43 43
44static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp) 44static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
45{ 45{
46 return syscall(__NR_io_submit, ctx, nr, iocbpp); 46 return syscall(__NR_io_submit, ctx, nr, iocbpp);
47} 47}
48 48
49static long io_getevents(aio_context_t ctx_id, long min_nr, long nr, 49static long io_getevents(aio_context_t ctx_id, long min_nr, long nr,
50 struct io_event *events, struct timespec *timeout) 50 struct io_event *events, struct timespec *timeout)
51{ 51{
52 return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout); 52 return syscall(__NR_io_getevents, ctx_id, min_nr, nr, events, timeout);
53} 53}
54 54
55#endif 55#endif
@@ -66,243 +66,245 @@ static long io_getevents(aio_context_t ctx_id, long min_nr, long nr,
66 */ 66 */
67 67
68static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf, 68static int do_aio(aio_context_t ctx, enum aio_type type, int fd, char *buf,
69 int len, unsigned long long offset, struct aio_context *aio) 69 int len, unsigned long long offset, struct aio_context *aio)
70{ 70{
71 struct iocb iocb, *iocbp = &iocb; 71 struct iocb iocb, *iocbp = &iocb;
72 char c; 72 char c;
73 int err; 73 int err;
74 74
75 iocb = ((struct iocb) { .aio_data = (unsigned long) aio, 75 iocb = ((struct iocb) { .aio_data = (unsigned long) aio,
76 .aio_reqprio = 0, 76 .aio_reqprio = 0,
77 .aio_fildes = fd, 77 .aio_fildes = fd,
78 .aio_buf = (unsigned long) buf, 78 .aio_buf = (unsigned long) buf,
79 .aio_nbytes = len, 79 .aio_nbytes = len,
80 .aio_offset = offset, 80 .aio_offset = offset,
81 .aio_reserved1 = 0, 81 .aio_reserved1 = 0,
82 .aio_reserved2 = 0, 82 .aio_reserved2 = 0,
83 .aio_reserved3 = 0 }); 83 .aio_reserved3 = 0 });
84 84
85 switch(type){ 85 switch(type){
86 case AIO_READ: 86 case AIO_READ:
87 iocb.aio_lio_opcode = IOCB_CMD_PREAD; 87 iocb.aio_lio_opcode = IOCB_CMD_PREAD;
88 err = io_submit(ctx, 1, &iocbp); 88 err = io_submit(ctx, 1, &iocbp);
89 break; 89 break;
90 case AIO_WRITE: 90 case AIO_WRITE:
91 iocb.aio_lio_opcode = IOCB_CMD_PWRITE; 91 iocb.aio_lio_opcode = IOCB_CMD_PWRITE;
92 err = io_submit(ctx, 1, &iocbp); 92 err = io_submit(ctx, 1, &iocbp);
93 break; 93 break;
94 case AIO_MMAP: 94 case AIO_MMAP:
95 iocb.aio_lio_opcode = IOCB_CMD_PREAD; 95 iocb.aio_lio_opcode = IOCB_CMD_PREAD;
96 iocb.aio_buf = (unsigned long) &c; 96 iocb.aio_buf = (unsigned long) &c;
97 iocb.aio_nbytes = sizeof(c); 97 iocb.aio_nbytes = sizeof(c);
98 err = io_submit(ctx, 1, &iocbp); 98 err = io_submit(ctx, 1, &iocbp);
99 break; 99 break;
100 default: 100 default:
101 printk("Bogus op in do_aio - %d\n", type); 101 printk("Bogus op in do_aio - %d\n", type);
102 err = -EINVAL; 102 err = -EINVAL;
103 break; 103 break;
104 } 104 }
105 105
106 if(err > 0) 106 if(err > 0)
107 err = 0; 107 err = 0;
108 else 108 else
109 err = -errno; 109 err = -errno;
110 110
111 return err; 111 return err;
112} 112}
113 113
114static aio_context_t ctx = 0; 114static aio_context_t ctx = 0;
115 115
116static int aio_thread(void *arg) 116static int aio_thread(void *arg)
117{ 117{
118 struct aio_thread_reply reply; 118 struct aio_thread_reply reply;
119 struct io_event event; 119 struct io_event event;
120 int err, n, reply_fd; 120 int err, n, reply_fd;
121 121
122 signal(SIGWINCH, SIG_IGN); 122 signal(SIGWINCH, SIG_IGN);
123 123
124 while(1){ 124 while(1){
125 n = io_getevents(ctx, 1, 1, &event, NULL); 125 n = io_getevents(ctx, 1, 1, &event, NULL);
126 if(n < 0){ 126 if(n < 0){
127 if(errno == EINTR) 127 if(errno == EINTR)
128 continue; 128 continue;
129 printk("aio_thread - io_getevents failed, " 129 printk("aio_thread - io_getevents failed, "
130 "errno = %d\n", errno); 130 "errno = %d\n", errno);
131 } 131 }
132 else { 132 else {
133 reply = ((struct aio_thread_reply) 133 reply = ((struct aio_thread_reply)
134 { .data = (void *) (long) event.data, 134 { .data = (void *) (long) event.data,
135 .err = event.res }); 135 .err = event.res });
136 reply_fd = ((struct aio_context *) reply.data)->reply_fd; 136 reply_fd = ((struct aio_context *) reply.data)->reply_fd;
137 err = os_write_file(reply_fd, &reply, sizeof(reply)); 137 err = os_write_file(reply_fd, &reply, sizeof(reply));
138 if(err != sizeof(reply)) 138 if(err != sizeof(reply))
139 printk("aio_thread - write failed, fd = %d, " 139 printk("aio_thread - write failed, fd = %d, "
140 "err = %d\n", aio_req_fd_r, -err); 140 "err = %d\n", aio_req_fd_r, -err);
141 } 141 }
142 } 142 }
143 return 0; 143 return 0;
144} 144}
145 145
146#endif 146#endif
147 147
148static int do_not_aio(struct aio_thread_req *req) 148static int do_not_aio(struct aio_thread_req *req)
149{ 149{
150 char c; 150 char c;
151 int err; 151 int err;
152 152
153 switch(req->type){ 153 switch(req->type){
154 case AIO_READ: 154 case AIO_READ:
155 err = os_seek_file(req->io_fd, req->offset); 155 err = os_seek_file(req->io_fd, req->offset);
156 if(err) 156 if(err)
157 goto out; 157 goto out;
158 158
159 err = os_read_file(req->io_fd, req->buf, req->len); 159 err = os_read_file(req->io_fd, req->buf, req->len);
160 break; 160 break;
161 case AIO_WRITE: 161 case AIO_WRITE:
162 err = os_seek_file(req->io_fd, req->offset); 162 err = os_seek_file(req->io_fd, req->offset);
163 if(err) 163 if(err)
164 goto out; 164 goto out;
165 165
166 err = os_write_file(req->io_fd, req->buf, req->len); 166 err = os_write_file(req->io_fd, req->buf, req->len);
167 break; 167 break;
168 case AIO_MMAP: 168 case AIO_MMAP:
169 err = os_seek_file(req->io_fd, req->offset); 169 err = os_seek_file(req->io_fd, req->offset);
170 if(err) 170 if(err)
171 goto out; 171 goto out;
172 172
173 err = os_read_file(req->io_fd, &c, sizeof(c)); 173 err = os_read_file(req->io_fd, &c, sizeof(c));
174 break; 174 break;
175 default: 175 default:
176 printk("do_not_aio - bad request type : %d\n", req->type); 176 printk("do_not_aio - bad request type : %d\n", req->type);
177 err = -EINVAL; 177 err = -EINVAL;
178 break; 178 break;
179 } 179 }
180 180
181 out: 181out:
182 return err; 182 return err;
183} 183}
184 184
185static int not_aio_thread(void *arg) 185static int not_aio_thread(void *arg)
186{ 186{
187 struct aio_thread_req req; 187 struct aio_thread_req req;
188 struct aio_thread_reply reply; 188 struct aio_thread_reply reply;
189 int err; 189 int err;
190 190
191 signal(SIGWINCH, SIG_IGN); 191 signal(SIGWINCH, SIG_IGN);
192 while(1){ 192 while(1){
193 err = os_read_file(aio_req_fd_r, &req, sizeof(req)); 193 err = os_read_file(aio_req_fd_r, &req, sizeof(req));
194 if(err != sizeof(req)){ 194 if(err != sizeof(req)){
195 if(err < 0) 195 if(err < 0)
196 printk("not_aio_thread - read failed, " 196 printk("not_aio_thread - read failed, "
197 "fd = %d, err = %d\n", aio_req_fd_r, 197 "fd = %d, err = %d\n", aio_req_fd_r,
198 -err); 198 -err);
199 else { 199 else {
200 printk("not_aio_thread - short read, fd = %d, " 200 printk("not_aio_thread - short read, fd = %d, "
201 "length = %d\n", aio_req_fd_r, err); 201 "length = %d\n", aio_req_fd_r, err);
202 } 202 }
203 continue; 203 continue;
204 } 204 }
205 err = do_not_aio(&req); 205 err = do_not_aio(&req);
206 reply = ((struct aio_thread_reply) { .data = req.aio, 206 reply = ((struct aio_thread_reply) { .data = req.aio,
207 .err = err }); 207 .err = err });
208 err = os_write_file(req.aio->reply_fd, &reply, sizeof(reply)); 208 err = os_write_file(req.aio->reply_fd, &reply, sizeof(reply));
209 if(err != sizeof(reply)) 209 if(err != sizeof(reply))
210			printk("not_aio_thread - write failed, fd = %d, " 210			printk("not_aio_thread - write failed, fd = %d, "
211			       "err = %d\n", req.aio->reply_fd, -err); 211			       "err = %d\n", req.aio->reply_fd, -err);
212 } 212 }
213
214 return 0;
213} 215}
214 216
215static int aio_pid = -1; 217static int aio_pid = -1;
216 218
217static int init_aio_24(void) 219static int init_aio_24(void)
218{ 220{
219 unsigned long stack; 221 unsigned long stack;
220 int fds[2], err; 222 int fds[2], err;
221 223
222 err = os_pipe(fds, 1, 1); 224 err = os_pipe(fds, 1, 1);
223 if(err) 225 if(err)
224 goto out; 226 goto out;
225 227
226 aio_req_fd_w = fds[0]; 228 aio_req_fd_w = fds[0];
227 aio_req_fd_r = fds[1]; 229 aio_req_fd_r = fds[1];
228 err = run_helper_thread(not_aio_thread, NULL, 230 err = run_helper_thread(not_aio_thread, NULL,
229 CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0); 231 CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0);
230 if(err < 0) 232 if(err < 0)
231 goto out_close_pipe; 233 goto out_close_pipe;
232 234
233 aio_pid = err; 235 aio_pid = err;
234 goto out; 236 goto out;
235 237
236 out_close_pipe: 238out_close_pipe:
237 os_close_file(fds[0]); 239 os_close_file(fds[0]);
238 os_close_file(fds[1]); 240 os_close_file(fds[1]);
239 aio_req_fd_w = -1; 241 aio_req_fd_w = -1;
240 aio_req_fd_r = -1; 242 aio_req_fd_r = -1;
241 out: 243out:
242#ifndef HAVE_AIO_ABI 244#ifndef HAVE_AIO_ABI
243 printk("/usr/include/linux/aio_abi.h not present during build\n"); 245 printk("/usr/include/linux/aio_abi.h not present during build\n");
244#endif 246#endif
245 printk("2.6 host AIO support not used - falling back to I/O " 247 printk("2.6 host AIO support not used - falling back to I/O "
246 "thread\n"); 248 "thread\n");
247 return 0; 249 return 0;
248} 250}
249 251
250#ifdef HAVE_AIO_ABI 252#ifdef HAVE_AIO_ABI
251#define DEFAULT_24_AIO 0 253#define DEFAULT_24_AIO 0
252static int init_aio_26(void) 254static int init_aio_26(void)
253{ 255{
254 unsigned long stack; 256 unsigned long stack;
255 int err; 257 int err;
256 258
257 if(io_setup(256, &ctx)){ 259 if(io_setup(256, &ctx)){
258 err = -errno; 260 err = -errno;
259 printk("aio_thread failed to initialize context, err = %d\n", 261 printk("aio_thread failed to initialize context, err = %d\n",
260 errno); 262 errno);
261 return err; 263 return err;
262 } 264 }
263 265
264 err = run_helper_thread(aio_thread, NULL, 266 err = run_helper_thread(aio_thread, NULL,
265 CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0); 267 CLONE_FILES | CLONE_VM | SIGCHLD, &stack, 0);
266 if(err < 0) 268 if(err < 0)
267 return err; 269 return err;
268 270
269 aio_pid = err; 271 aio_pid = err;
270 272
271 printk("Using 2.6 host AIO\n"); 273 printk("Using 2.6 host AIO\n");
272 return 0; 274 return 0;
273} 275}
274 276
275static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, 277static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
276 unsigned long long offset, struct aio_context *aio) 278 unsigned long long offset, struct aio_context *aio)
277{ 279{
278 struct aio_thread_reply reply; 280 struct aio_thread_reply reply;
279 int err; 281 int err;
280 282
281 err = do_aio(ctx, type, io_fd, buf, len, offset, aio); 283 err = do_aio(ctx, type, io_fd, buf, len, offset, aio);
282 if(err){ 284 if(err){
283 reply = ((struct aio_thread_reply) { .data = aio, 285 reply = ((struct aio_thread_reply) { .data = aio,
284 .err = err }); 286 .err = err });
285 err = os_write_file(aio->reply_fd, &reply, sizeof(reply)); 287 err = os_write_file(aio->reply_fd, &reply, sizeof(reply));
286 if(err != sizeof(reply)) 288 if(err != sizeof(reply))
287 printk("submit_aio_26 - write failed, " 289 printk("submit_aio_26 - write failed, "
288 "fd = %d, err = %d\n", aio->reply_fd, -err); 290 "fd = %d, err = %d\n", aio->reply_fd, -err);
289 else err = 0; 291 else err = 0;
290 } 292 }
291 293
292 return err; 294 return err;
293} 295}
294 296
295#else 297#else
296#define DEFAULT_24_AIO 1 298#define DEFAULT_24_AIO 1
297static int init_aio_26(void) 299static int init_aio_26(void)
298{ 300{
299 return -ENOSYS; 301 return -ENOSYS;
300} 302}
301 303
302static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len, 304static int submit_aio_26(enum aio_type type, int io_fd, char *buf, int len,
303 unsigned long long offset, struct aio_context *aio) 305 unsigned long long offset, struct aio_context *aio)
304{ 306{
305 return -ENOSYS; 307 return -ENOSYS;
306} 308}
307#endif 309#endif
308 310
@@ -310,8 +312,8 @@ static int aio_24 = DEFAULT_24_AIO;
310 312
311static int __init set_aio_24(char *name, int *add) 313static int __init set_aio_24(char *name, int *add)
312{ 314{
313 aio_24 = 1; 315 aio_24 = 1;
314 return 0; 316 return 0;
315} 317}
316 318
317__uml_setup("aio=2.4", set_aio_24, 319__uml_setup("aio=2.4", set_aio_24,
@@ -328,28 +330,27 @@ __uml_setup("aio=2.4", set_aio_24,
328 330
329static int init_aio(void) 331static int init_aio(void)
330{ 332{
331 int err; 333 int err;
332 334
333 CHOOSE_MODE(({ 335 CHOOSE_MODE(({ if(!aio_24){
334 if(!aio_24){ 336 printk("Disabling 2.6 AIO in tt mode\n");
335 printk("Disabling 2.6 AIO in tt mode\n"); 337 aio_24 = 1;
336 aio_24 = 1; 338 } }), (void) 0);
337 } }), (void) 0); 339
338 340 if(!aio_24){
339 if(!aio_24){ 341 err = init_aio_26();
340 err = init_aio_26(); 342 if(err && (errno == ENOSYS)){
341 if(err && (errno == ENOSYS)){ 343 printk("2.6 AIO not supported on the host - "
342 printk("2.6 AIO not supported on the host - " 344 "reverting to 2.4 AIO\n");
343 "reverting to 2.4 AIO\n"); 345 aio_24 = 1;
344 aio_24 = 1; 346 }
345 } 347 else return err;
346 else return err; 348 }
347 } 349
348 350 if(aio_24)
349 if(aio_24) 351 return init_aio_24();
350 return init_aio_24(); 352
351 353 return 0;
352 return 0;
353} 354}
354 355
355/* The reason for the __initcall/__uml_exitcall asymmetry is that init_aio 356/* The reason for the __initcall/__uml_exitcall asymmetry is that init_aio
@@ -362,8 +363,8 @@ __initcall(init_aio);
362 363
363static void exit_aio(void) 364static void exit_aio(void)
364{ 365{
365 if(aio_pid != -1) 366 if(aio_pid != -1)
366 os_kill_process(aio_pid, 1); 367 os_kill_process(aio_pid, 1);
367} 368}
368 369
369__uml_exitcall(exit_aio); 370__uml_exitcall(exit_aio);
@@ -371,30 +372,30 @@ __uml_exitcall(exit_aio);
371static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len, 372static int submit_aio_24(enum aio_type type, int io_fd, char *buf, int len,
372 unsigned long long offset, struct aio_context *aio) 373 unsigned long long offset, struct aio_context *aio)
373{ 374{
374 struct aio_thread_req req = { .type = type, 375 struct aio_thread_req req = { .type = type,
375 .io_fd = io_fd, 376 .io_fd = io_fd,
376 .offset = offset, 377 .offset = offset,
377 .buf = buf, 378 .buf = buf,
378 .len = len, 379 .len = len,
379 .aio = aio, 380 .aio = aio,
380 }; 381 };
381 int err; 382 int err;
382 383
383 err = os_write_file(aio_req_fd_w, &req, sizeof(req)); 384 err = os_write_file(aio_req_fd_w, &req, sizeof(req));
384 if(err == sizeof(req)) 385 if(err == sizeof(req))
385 err = 0; 386 err = 0;
386 387
387 return err; 388 return err;
388} 389}
389 390
390int submit_aio(enum aio_type type, int io_fd, char *buf, int len, 391int submit_aio(enum aio_type type, int io_fd, char *buf, int len,
391 unsigned long long offset, int reply_fd, 392 unsigned long long offset, int reply_fd,
392 struct aio_context *aio) 393 struct aio_context *aio)
393{ 394{
394 aio->reply_fd = reply_fd; 395 aio->reply_fd = reply_fd;
395 if(aio_24) 396 if(aio_24)
396 return submit_aio_24(type, io_fd, buf, len, offset, aio); 397 return submit_aio_24(type, io_fd, buf, len, offset, aio);
397 else { 398 else {
398 return submit_aio_26(type, io_fd, buf, len, offset, aio); 399 return submit_aio_26(type, io_fd, buf, len, offset, aio);
399 } 400 }
400} 401}
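
The two submission paths above pair naturally: submit_aio_26() feeds the host's native AIO ring through the raw syscall wrappers, while submit_aio_24() serializes requests over a pipe to a helper thread. For readers unfamiliar with the raw interface, here is a minimal synchronous user-space sketch of the 2.6 path; it is not part of the patch, and read_async() and its arguments are hypothetical. Error paths leak the AIO context; this is a sketch only.

#include <errno.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static long io_setup(unsigned n, aio_context_t *ctxp)
{ return syscall(__NR_io_setup, n, ctxp); }

static long io_submit(aio_context_t ctx, long nr, struct iocb **iocbpp)
{ return syscall(__NR_io_submit, ctx, nr, iocbpp); }

static long io_getevents(aio_context_t ctx, long min_nr, long nr,
			 struct io_event *events, struct timespec *timeout)
{ return syscall(__NR_io_getevents, ctx, min_nr, nr, events, timeout); }

/* Submit one read and wait for its completion event. */
static int read_async(int fd, void *buf, size_t len, long long off)
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbp = &cb;
	struct io_event ev;

	if (io_setup(1, &ctx) < 0)
		return -errno;

	memset(&cb, 0, sizeof(cb));
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes = fd;
	cb.aio_buf = (unsigned long) buf;
	cb.aio_nbytes = len;
	cb.aio_offset = off;

	if (io_submit(ctx, 1, &cbp) != 1)
		return -errno;
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return -errno;

	syscall(__NR_io_destroy, ctx);
	return ev.res;		/* bytes read, or a negative error code */
}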
diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c
new file mode 100644
index 000000000000..ecf107ae5ac8
--- /dev/null
+++ b/arch/um/os-Linux/umid.c
@@ -0,0 +1,335 @@
1#include <stdio.h>
2#include <unistd.h>
3#include <stdlib.h>
4#include <string.h>
5#include <errno.h>
6#include <signal.h>
7#include <dirent.h>
8#include <sys/fcntl.h>
9#include <sys/stat.h>
10#include <sys/param.h>
11#include "init.h"
12#include "os.h"
13#include "user.h"
14#include "mode.h"
15
16#define UML_DIR "~/.uml/"
17
18#define UMID_LEN 64
19
20/* Changed by set_umid, which is run early in boot */
21char umid[UMID_LEN] = { 0 };
22
23/* Changed by set_uml_dir and make_uml_dir, which are run early in boot */
24static char *uml_dir = UML_DIR;
25
26static int __init make_uml_dir(void)
27{
28 char dir[512] = { '\0' };
29 int len, err;
30
31 if(*uml_dir == '~'){
32 char *home = getenv("HOME");
33
34 err = -ENOENT;
35 if(home == NULL){
36 printk("make_uml_dir : no value in environment for "
37 "$HOME\n");
38 goto err;
39 }
40 strlcpy(dir, home, sizeof(dir));
41 uml_dir++;
42 }
43 strlcat(dir, uml_dir, sizeof(dir));
44 len = strlen(dir);
45 if (len > 0 && dir[len - 1] != '/')
46 strlcat(dir, "/", sizeof(dir));
47
48 err = -ENOMEM;
49 uml_dir = malloc(strlen(dir) + 1);
50 if (uml_dir == NULL) {
51 printf("make_uml_dir : malloc failed, errno = %d\n", errno);
52 goto err;
53 }
54 strcpy(uml_dir, dir);
55
56 if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){
57 printf("Failed to mkdir '%s': %s\n", uml_dir, strerror(errno));
58 err = -errno;
59 goto err_free;
60 }
61 return 0;
62
63err_free:
64 free(uml_dir);
65err:
66 uml_dir = NULL;
67 return err;
68}
69
70static int actually_do_remove(char *dir)
71{
72 DIR *directory;
73 struct dirent *ent;
74 int len;
75 char file[256];
76
77 directory = opendir(dir);
78 if(directory == NULL)
79 return -errno;
80
81 while((ent = readdir(directory)) != NULL){
82 if(!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, ".."))
83 continue;
84 len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1;
85 if(len > sizeof(file))
86 return -E2BIG;
87
88 sprintf(file, "%s/%s", dir, ent->d_name);
89 if(unlink(file) < 0)
90 return -errno;
91 }
92 if(rmdir(dir) < 0)
93 return -errno;
94
95 return 0;
96}
97
98/* This says that there isn't already a user of the specified directory even if
99 * there are errors during the checking. This is because if these errors
100 * happen, the directory is unusable by the pre-existing UML, so we might as
101 * well take it over. This could happen through any of:
102 * - the existing UML somehow corrupting its umid directory,
103 * - something other than UML sticking stuff in the directory, or
104 * - this boot racing with a shutdown of the other UML.
105 * In any of these cases, the directory isn't useful for anything else.
106 */
107
108static int not_dead_yet(char *dir)
109{
110 char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
111 char pid[sizeof("nnnnn\0")], *end;
112 int dead, fd, p, n, err;
113
114 n = snprintf(file, sizeof(file), "%s/pid", dir);
115 if(n >= sizeof(file)){
116 printk("not_dead_yet - pid filename too long\n");
117 err = -E2BIG;
118 goto out;
119 }
120
121 dead = 0;
122 fd = open(file, O_RDONLY);
123 if(fd < 0){
124		if(errno != ENOENT){
125			printk("not_dead_yet : couldn't open pid file '%s', "
126			       "errno = %d\n", file, errno);
127 }
128 goto out;
129 }
130
131 err = 0;
132 n = read(fd, pid, sizeof(pid));
133 if(n <= 0){
134 printk("not_dead_yet : couldn't read pid file '%s', "
135 "err = %d\n", file, -n);
136 goto out_close;
137 }
138
139 p = strtoul(pid, &end, 0);
140 if(end == pid){
141 printk("not_dead_yet : couldn't parse pid file '%s', "
142 "errno = %d\n", file, errno);
143 goto out_close;
144 }
145
146 if((kill(p, 0) == 0) || (errno != ESRCH))
147 return 1;
148
149 err = actually_do_remove(dir);
150 if(err)
151 printk("not_dead_yet - actually_do_remove failed with "
152 "err = %d\n", err);
153
154 return err;
155
156 out_close:
157 close(fd);
158 out:
159 return 0;
160}
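
not_dead_yet() decides liveness with kill(pid, 0), which delivers nothing but still performs the existence and permission checks. A standalone sketch of that idiom (hypothetical helper, not from the patch):

#include <errno.h>
#include <signal.h>
#include <sys/types.h>

static int pid_is_alive(pid_t p)
{
	if (kill(p, 0) == 0)
		return 1;		/* exists and we may signal it */
	return errno != ESRCH;		/* EPERM etc. still means it exists */
}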
161
162static void __init create_pid_file(void)
163{
164 char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")];
165 char pid[sizeof("nnnnn\0")];
166 int fd, n;
167
168 if(umid_file_name("pid", file, sizeof(file)))
169 return;
170
171 fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644);
172 if(fd < 0){
173 printk("Open of machine pid file \"%s\" failed: %s\n",
174 file, strerror(-fd));
175 return;
176 }
177
178 snprintf(pid, sizeof(pid), "%d\n", getpid());
179 n = write(fd, pid, strlen(pid));
180 if(n != strlen(pid))
181 printk("Write of pid file failed - err = %d\n", -n);
182
183 close(fd);
184}
185
186int __init set_umid(char *name)
187{
188 if(strlen(name) > UMID_LEN - 1)
189 return -E2BIG;
190
191 strlcpy(umid, name, sizeof(umid));
192
193 return 0;
194}
195
196static int umid_setup = 0;
197
198int __init make_umid(void)
199{
200 int fd, err;
201 char tmp[256];
202
203 if(umid_setup)
204 return 0;
205
206 make_uml_dir();
207
208 if(*umid == '\0'){
209 strlcpy(tmp, uml_dir, sizeof(tmp));
210 strlcat(tmp, "XXXXXX", sizeof(tmp));
211 fd = mkstemp(tmp);
212 if(fd < 0){
213 printk("make_umid - mkstemp(%s) failed: %s\n",
214 tmp, strerror(errno));
215 err = -errno;
216 goto err;
217 }
218
219 close(fd);
220
221 set_umid(&tmp[strlen(uml_dir)]);
222
223 /* There's a nice tiny little race between this unlink and
224 * the mkdir below. It'd be nice if there were a mkstemp
225 * for directories.
226 */
227 if(unlink(tmp)){
228 err = -errno;
229 goto err;
230 }
231 }
232
233 snprintf(tmp, sizeof(tmp), "%s%s", uml_dir, umid);
234 err = mkdir(tmp, 0777);
235 if(err < 0){
236 err = -errno;
237 if(errno != EEXIST)
238 goto err;
239
240 if(not_dead_yet(tmp) < 0)
241 goto err;
242
243 err = mkdir(tmp, 0777);
244 }
245 if(err < 0){
246 printk("Failed to create '%s' - err = %d\n", umid, err);
247 goto err_rmdir;
248 }
249
250 umid_setup = 1;
251
252 create_pid_file();
253
254 return 0;
255
256 err_rmdir:
257 rmdir(tmp);
258 err:
259 return err;
260}
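
The comment in make_umid() wishes for "a mkstemp for directories"; glibc provides exactly that as mkdtemp(3), which closes the unlink/mkdir window noted above. A sketch of the race-free alternative (not what the patch does; the template path is illustrative):

#include <stdlib.h>
#include <sys/stat.h>

static int make_umid_dir_raceless(char *tmpl)
{
	/* tmpl must end in "XXXXXX", e.g. "/home/user/.uml/XXXXXX" */
	if (mkdtemp(tmpl) == NULL)
		return -1;
	/* mkdtemp creates the directory 0700; widen to match the 0777
	 * the patch uses, if that behavior is actually wanted. */
	return chmod(tmpl, 0777);
}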
261
262static int __init make_umid_init(void)
263{
264 make_umid();
265
266 return 0;
267}
268
269__initcall(make_umid_init);
270
271int __init umid_file_name(char *name, char *buf, int len)
272{
273 int n, err;
274
275 err = make_umid();
276 if(err)
277 return err;
278
279 n = snprintf(buf, len, "%s%s/%s", uml_dir, umid, name);
280 if(n >= len){
281 printk("umid_file_name : buffer too short\n");
282 return -E2BIG;
283 }
284
285 return 0;
286}
287
288char *get_umid(void)
289{
290 return umid;
291}
292
293static int __init set_uml_dir(char *name, int *add)
294{
295 if(*name == '\0'){
296 printf("uml_dir can't be an empty string\n");
297 return 0;
298 }
299
300 if(name[strlen(name) - 1] == '/'){
301 uml_dir = name;
302 return 0;
303 }
304
305 uml_dir = malloc(strlen(name) + 2);
306 if(uml_dir == NULL){
307 printf("Failed to malloc uml_dir - error = %d\n", errno);
308
309 /* Return 0 here because do_initcalls doesn't look at
310 * the return value.
311 */
312 return 0;
313 }
314 sprintf(uml_dir, "%s/", name);
315
316 return 0;
317}
318
319__uml_setup("uml_dir=", set_uml_dir,
320"uml_dir=<directory>\n"
321" The location to place the pid and umid files.\n\n"
322);
323
324static void remove_umid_dir(void)
325{
326	char dir[strlen(uml_dir) + UMID_LEN + 1]; int err; /* int, not char: holds negative errnos */
327
328 sprintf(dir, "%s%s", uml_dir, umid);
329 err = actually_do_remove(dir);
330 if(err)
331 printf("remove_umid_dir - actually_do_remove failed with "
332 "err = %d\n", err);
333}
334
335__uml_exitcall(remove_umid_dir);
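
set_uml_dir() above goes through malloc+sprintf purely to guarantee a trailing slash on uml_dir. A compact equivalent of that normalization (hypothetical helper, sketched for illustration):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *with_trailing_slash(const char *name)
{
	size_t n = strlen(name);
	char *p = malloc(n + 2);	/* room for '/' and NUL */

	if (p != NULL)
		sprintf(p, "%s%s", name,
			(n > 0 && name[n - 1] == '/') ? "" : "/");
	return p;			/* caller frees */
}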
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index e2c6e64a85ec..fcb06a50fdd2 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -9,6 +9,16 @@ config INIT_DEBUG
9 Fill __init and __initdata at the end of boot. This helps debugging 9 Fill __init and __initdata at the end of boot. This helps debugging
10 illegal uses of __init and __initdata after initialization. 10 illegal uses of __init and __initdata after initialization.
11 11
12config DEBUG_RODATA
13 bool "Write protect kernel read-only data structures"
14 depends on DEBUG_KERNEL
15 help
16 Mark the kernel read-only data as write-protected in the pagetables,
17 in order to catch accidental (and incorrect) writes to such const data.
18 This option may have a slight performance impact because a portion
19 of the kernel code won't be covered by a 2MB TLB anymore.
20 If in doubt, say "N".
21
12config IOMMU_DEBUG 22config IOMMU_DEBUG
13 depends on GART_IOMMU && DEBUG_KERNEL 23 depends on GART_IOMMU && DEBUG_KERNEL
14 bool "Enable IOMMU debugging" 24 bool "Enable IOMMU debugging"
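
The effect of the new DEBUG_RODATA option is easy to demonstrate: once mark_rodata_ro() has run, a stray store to const kernel data faults instead of silently corrupting it. A hypothetical built-in test (sketch only; the patch protects the core image's .rodata, so this would be compiled in rather than loaded as a module, and its ordering relative to mark_rodata_ro() is an assumption here):

#include <linux/init.h>
#include <linux/kernel.h>

static const int guarded = 42;		/* placed in the image's .rodata */

static int __init rodata_write_test(void)
{
	*(int *) &guarded = 0;		/* expected to fault once
					 * mark_rodata_ro() has run */
	return 0;
}
late_initcall(rodata_write_test);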
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index e0eb0c712fe9..df0773c9bdbe 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -341,7 +341,7 @@ ENTRY(ia32_ptregs_common)
341 jmp ia32_sysret /* misbalances the return cache */ 341 jmp ia32_sysret /* misbalances the return cache */
342 CFI_ENDPROC 342 CFI_ENDPROC
343 343
344 .data 344 .section .rodata,"a"
345 .align 8 345 .align 8
346 .globl ia32_sys_call_table 346 .globl ia32_sys_call_table
347ia32_sys_call_table: 347ia32_sys_call_table:
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 7519fc520eb3..3060ed97b755 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
157DECLARE_PER_CPU(int, cpu_state); 157DECLARE_PER_CPU(int, cpu_state);
158 158
159#include <asm/nmi.h> 159#include <asm/nmi.h>
160/* We don't actually take CPU down, just spin without interrupts. */ 160/* We halt the CPU with physical CPU hotplug */
161static inline void play_dead(void) 161static inline void play_dead(void)
162{ 162{
163 idle_task_exit(); 163 idle_task_exit();
@@ -166,8 +166,9 @@ static inline void play_dead(void)
166 /* Ack it */ 166 /* Ack it */
167 __get_cpu_var(cpu_state) = CPU_DEAD; 167 __get_cpu_var(cpu_state) = CPU_DEAD;
168 168
169 local_irq_disable();
169 while (1) 170 while (1)
170 safe_halt(); 171 halt();
171} 172}
172#else 173#else
173static inline void play_dead(void) 174static inline void play_dead(void)
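
The play_dead() change above is subtle: safe_halt() is "sti; hlt", so a supposedly dead CPU could be woken by any interrupt, while local_irq_disable() plus a bare halt() parks it until NMI/INIT/reset. Sketches of the two primitives as usually defined on x86 (names prefixed to mark them as illustrative, not the kernel's own):

static inline void my_halt(void)
{
	asm volatile ("hlt" : : : "memory");
}

static inline void my_safe_halt(void)
{
	asm volatile ("sti; hlt" : : : "memory");	/* re-enables irqs first */
}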
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
index e263685f864c..7c176b3edde0 100644
--- a/arch/x86_64/kernel/syscall.c
+++ b/arch/x86_64/kernel/syscall.c
@@ -19,7 +19,7 @@ typedef void (*sys_call_ptr_t)(void);
19 19
20extern void sys_ni_syscall(void); 20extern void sys_ni_syscall(void);
21 21
22sys_call_ptr_t sys_call_table[__NR_syscall_max+1] __cacheline_aligned = { 22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
23 /* Smells like a compiler bug -- it doesn't work when the & below is removed. */ 23 /* Smells like a compiler bug -- it doesn't work when the & below is removed. */
24 [0 ... __NR_syscall_max] = &sys_ni_syscall, 24 [0 ... __NR_syscall_max] = &sys_ni_syscall,
25#include <asm-x86_64/unistd.h> 25#include <asm-x86_64/unistd.h>
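
The table declaration above relies on GCC's range-designator extension: "[0 ... __NR_syscall_max] = &sys_ni_syscall" defaults every slot, and the per-syscall designators pulled in from unistd.h override individual entries (the last initializer for a slot wins). A tiny user-space sketch of the same pattern:

typedef void (*handler_t)(void);

static void nop_handler(void)  { }
static void real_handler(void) { }

static const handler_t table[8] = {
	[0 ... 7] = &nop_handler,	/* GNU C range designator */
	[3]       = &real_handler,	/* later designator overrides slot 3 */
};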
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index c016dfe84784..1faae5fc1c01 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -498,6 +498,29 @@ void free_initmem(void)
498 printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10); 498 printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
499} 499}
500 500
501#ifdef CONFIG_DEBUG_RODATA
502
503extern char __start_rodata, __end_rodata;
504void mark_rodata_ro(void)
505{
506 unsigned long addr = (unsigned long)&__start_rodata;
507
508 for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
509 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
510
511 printk ("Write protecting the kernel read-only data: %luk\n",
512 (&__end_rodata - &__start_rodata) >> 10);
513
514 /*
515 * change_page_attr_addr() requires a global_flush_tlb() call after it.
516 * We do this after the printk so that if something went wrong in the
517 * change, the printk gets out at least to give a better debug hint
518 * of who is the culprit.
519 */
520 global_flush_tlb();
521}
522#endif
523
501#ifdef CONFIG_BLK_DEV_INITRD 524#ifdef CONFIG_BLK_DEV_INITRD
502void free_initrd_mem(unsigned long start, unsigned long end) 525void free_initrd_mem(unsigned long start, unsigned long end)
503{ 526{
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index b90e8fe9eeb0..35f1f1aab063 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -128,6 +128,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
128 pte_t *kpte; 128 pte_t *kpte;
129 struct page *kpte_page; 129 struct page *kpte_page;
130 unsigned kpte_flags; 130 unsigned kpte_flags;
131 pgprot_t ref_prot2;
131 kpte = lookup_address(address); 132 kpte = lookup_address(address);
132 if (!kpte) return 0; 133 if (!kpte) return 0;
133 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); 134 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
@@ -140,10 +141,14 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
140 * split_large_page will take the reference for this change_page_attr 141 * split_large_page will take the reference for this change_page_attr
141 * on the split page. 142 * on the split page.
142 */ 143 */
143 struct page *split = split_large_page(address, prot, ref_prot); 144
145 struct page *split;
146 ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
147
148 split = split_large_page(address, prot, ref_prot2);
144 if (!split) 149 if (!split)
145 return -ENOMEM; 150 return -ENOMEM;
146 set_pte(kpte,mk_pte(split, ref_prot)); 151 set_pte(kpte,mk_pte(split, ref_prot2));
147 kpte_page = split; 152 kpte_page = split;
148 } 153 }
149 get_page(kpte_page); 154 get_page(kpte_page);
diff --git a/block/Kconfig b/block/Kconfig
index eb48edb80c1d..377f6dd20e17 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,7 +5,7 @@
5#for instance. 5#for instance.
6config LBD 6config LBD
7 bool "Support for Large Block Devices" 7 bool "Support for Large Block Devices"
8 depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML 8 depends on X86 || (MIPS && 32BIT) || PPC32 || (S390 && !64BIT) || SUPERH || UML
9 help 9 help
10 Say Y here if you want to attach large (bigger than 2TB) discs to 10 Say Y here if you want to attach large (bigger than 2TB) discs to
11 your machine, or if you want to have a raid or loopback device 11 your machine, or if you want to have a raid or loopback device
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 43fa20495688..8da3cf66894c 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -182,6 +182,9 @@ struct as_rq {
182 182
183static kmem_cache_t *arq_pool; 183static kmem_cache_t *arq_pool;
184 184
185static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq);
186static void as_antic_stop(struct as_data *ad);
187
185/* 188/*
186 * IO Context helper functions 189 * IO Context helper functions
187 */ 190 */
@@ -370,7 +373,7 @@ static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
370 * existing request against the same sector), which can happen when using 373 * existing request against the same sector), which can happen when using
371 * direct IO, then return the alias. 374 * direct IO, then return the alias.
372 */ 375 */
373static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq) 376static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
374{ 377{
375 struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node; 378 struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
376 struct rb_node *parent = NULL; 379 struct rb_node *parent = NULL;
@@ -397,6 +400,16 @@ static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
397 return NULL; 400 return NULL;
398} 401}
399 402
403static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
404{
405 struct as_rq *alias;
406
407 while ((unlikely(alias = __as_add_arq_rb(ad, arq)))) {
408 as_move_to_dispatch(ad, alias);
409 as_antic_stop(ad);
410 }
411}
412
400static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq) 413static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
401{ 414{
402 if (!ON_RB(&arq->rb_node)) { 415 if (!ON_RB(&arq->rb_node)) {
@@ -1133,23 +1146,6 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
1133 /* 1146 /*
1134 * take it off the sort and fifo list, add to dispatch queue 1147 * take it off the sort and fifo list, add to dispatch queue
1135 */ 1148 */
1136 while (!list_empty(&rq->queuelist)) {
1137 struct request *__rq = list_entry_rq(rq->queuelist.next);
1138 struct as_rq *__arq = RQ_DATA(__rq);
1139
1140 list_del(&__rq->queuelist);
1141
1142 elv_dispatch_add_tail(ad->q, __rq);
1143
1144 if (__arq->io_context && __arq->io_context->aic)
1145 atomic_inc(&__arq->io_context->aic->nr_dispatched);
1146
1147 WARN_ON(__arq->state != AS_RQ_QUEUED);
1148 __arq->state = AS_RQ_DISPATCHED;
1149
1150 ad->nr_dispatched++;
1151 }
1152
1153 as_remove_queued_request(ad->q, rq); 1149 as_remove_queued_request(ad->q, rq);
1154 WARN_ON(arq->state != AS_RQ_QUEUED); 1150 WARN_ON(arq->state != AS_RQ_QUEUED);
1155 1151
@@ -1326,49 +1322,12 @@ fifo_expired:
1326} 1322}
1327 1323
1328/* 1324/*
1329 * Add arq to a list behind alias
1330 */
1331static inline void
1332as_add_aliased_request(struct as_data *ad, struct as_rq *arq,
1333 struct as_rq *alias)
1334{
1335 struct request *req = arq->request;
1336 struct list_head *insert = alias->request->queuelist.prev;
1337
1338 /*
1339 * Transfer list of aliases
1340 */
1341 while (!list_empty(&req->queuelist)) {
1342 struct request *__rq = list_entry_rq(req->queuelist.next);
1343 struct as_rq *__arq = RQ_DATA(__rq);
1344
1345 list_move_tail(&__rq->queuelist, &alias->request->queuelist);
1346
1347 WARN_ON(__arq->state != AS_RQ_QUEUED);
1348 }
1349
1350 /*
1351 * Another request with the same start sector on the rbtree.
1352 * Link this request to that sector. They are untangled in
1353 * as_move_to_dispatch
1354 */
1355 list_add(&arq->request->queuelist, insert);
1356
1357 /*
1358 * Don't want to have to handle merges.
1359 */
1360 as_del_arq_hash(arq);
1361 arq->request->flags |= REQ_NOMERGE;
1362}
1363
1364/*
1365 * add arq to rbtree and fifo 1325 * add arq to rbtree and fifo
1366 */ 1326 */
1367static void as_add_request(request_queue_t *q, struct request *rq) 1327static void as_add_request(request_queue_t *q, struct request *rq)
1368{ 1328{
1369 struct as_data *ad = q->elevator->elevator_data; 1329 struct as_data *ad = q->elevator->elevator_data;
1370 struct as_rq *arq = RQ_DATA(rq); 1330 struct as_rq *arq = RQ_DATA(rq);
1371 struct as_rq *alias;
1372 int data_dir; 1331 int data_dir;
1373 1332
1374 arq->state = AS_RQ_NEW; 1333 arq->state = AS_RQ_NEW;
@@ -1387,33 +1346,17 @@ static void as_add_request(request_queue_t *q, struct request *rq)
1387 atomic_inc(&arq->io_context->aic->nr_queued); 1346 atomic_inc(&arq->io_context->aic->nr_queued);
1388 } 1347 }
1389 1348
1390 alias = as_add_arq_rb(ad, arq); 1349 as_add_arq_rb(ad, arq);
1391 if (!alias) { 1350 if (rq_mergeable(arq->request))
1392 /* 1351 as_add_arq_hash(ad, arq);
1393 * set expire time (only used for reads) and add to fifo list
1394 */
1395 arq->expires = jiffies + ad->fifo_expire[data_dir];
1396 list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
1397 1352
1398 if (rq_mergeable(arq->request)) 1353 /*
1399 as_add_arq_hash(ad, arq); 1354 * set expire time (only used for reads) and add to fifo list
1400 as_update_arq(ad, arq); /* keep state machine up to date */ 1355 */
1401 1356 arq->expires = jiffies + ad->fifo_expire[data_dir];
1402 } else { 1357 list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);
1403 as_add_aliased_request(ad, arq, alias);
1404
1405 /*
1406 * have we been anticipating this request?
1407 * or does it come from the same process as the one we are
1408 * anticipating for?
1409 */
1410 if (ad->antic_status == ANTIC_WAIT_REQ
1411 || ad->antic_status == ANTIC_WAIT_NEXT) {
1412 if (as_can_break_anticipation(ad, arq))
1413 as_antic_stop(ad);
1414 }
1415 }
1416 1358
1359 as_update_arq(ad, arq); /* keep state machine up to date */
1417 arq->state = AS_RQ_QUEUED; 1360 arq->state = AS_RQ_QUEUED;
1418} 1361}
1419 1362
@@ -1536,23 +1479,8 @@ static void as_merged_request(request_queue_t *q, struct request *req)
1536 * if the merge was a front merge, we need to reposition request 1479 * if the merge was a front merge, we need to reposition request
1537 */ 1480 */
1538 if (rq_rb_key(req) != arq->rb_key) { 1481 if (rq_rb_key(req) != arq->rb_key) {
1539 struct as_rq *alias, *next_arq = NULL;
1540
1541 if (ad->next_arq[arq->is_sync] == arq)
1542 next_arq = as_find_next_arq(ad, arq);
1543
1544 /*
1545 * Note! We should really be moving any old aliased requests
1546 * off this request and try to insert them into the rbtree. We
1547 * currently don't bother. Ditto the next function.
1548 */
1549 as_del_arq_rb(ad, arq); 1482 as_del_arq_rb(ad, arq);
1550 if ((alias = as_add_arq_rb(ad, arq))) { 1483 as_add_arq_rb(ad, arq);
1551 list_del_init(&arq->fifo);
1552 as_add_aliased_request(ad, arq, alias);
1553 if (next_arq)
1554 ad->next_arq[arq->is_sync] = next_arq;
1555 }
1556 /* 1484 /*
1557 * Note! At this stage of this and the next function, our next 1485 * Note! At this stage of this and the next function, our next
1558 * request may not be optimal - eg the request may have "grown" 1486 * request may not be optimal - eg the request may have "grown"
@@ -1579,18 +1507,8 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
1579 as_add_arq_hash(ad, arq); 1507 as_add_arq_hash(ad, arq);
1580 1508
1581 if (rq_rb_key(req) != arq->rb_key) { 1509 if (rq_rb_key(req) != arq->rb_key) {
1582 struct as_rq *alias, *next_arq = NULL;
1583
1584 if (ad->next_arq[arq->is_sync] == arq)
1585 next_arq = as_find_next_arq(ad, arq);
1586
1587 as_del_arq_rb(ad, arq); 1510 as_del_arq_rb(ad, arq);
1588 if ((alias = as_add_arq_rb(ad, arq))) { 1511 as_add_arq_rb(ad, arq);
1589 list_del_init(&arq->fifo);
1590 as_add_aliased_request(ad, arq, alias);
1591 if (next_arq)
1592 ad->next_arq[arq->is_sync] = next_arq;
1593 }
1594 } 1512 }
1595 1513
1596 /* 1514 /*
@@ -1610,18 +1528,6 @@ static void as_merged_requests(request_queue_t *q, struct request *req,
1610 } 1528 }
1611 1529
1612 /* 1530 /*
1613 * Transfer list of aliases
1614 */
1615 while (!list_empty(&next->queuelist)) {
1616 struct request *__rq = list_entry_rq(next->queuelist.next);
1617 struct as_rq *__arq = RQ_DATA(__rq);
1618
1619 list_move_tail(&__rq->queuelist, &req->queuelist);
1620
1621 WARN_ON(__arq->state != AS_RQ_QUEUED);
1622 }
1623
1624 /*
1625 * kill knowledge of next, this one is a goner 1531 * kill knowledge of next, this one is a goner
1626 */ 1532 */
1627 as_remove_queued_request(q, next); 1533 as_remove_queued_request(q, next);
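
The restructuring above retires the old aliased-request lists: __as_add_arq_rb() reports an rbtree key collision (an "alias") and the new as_add_arq_rb() simply dispatches the colliding entry and retries the insert. The control-flow shape, reduced to a toy with a one-element "tree" (insert() and dispatch() stand in for the real rbtree and dispatch-queue operations; a one-slot store obviously cannot hold two distinct keys):

#include <stddef.h>

struct node { int key; };

static struct node *slot;	/* one-element stand-in for the rbtree */

static struct node *insert(struct node *n)
{
	if (slot != NULL && slot->key == n->key) {
		struct node *alias = slot;	/* key collision */
		slot = NULL;
		return alias;	/* caller must get it out of the way */
	}
	if (slot == NULL)
		slot = n;
	return NULL;		/* inserted (toy: distinct keys overflow) */
}

static void dispatch(struct node *n)
{
	(void) n;		/* real code: move to the dispatch queue */
}

static void add_or_dispatch(struct node *n)
{
	struct node *alias;

	while ((alias = insert(n)) != NULL)
		dispatch(alias);	/* drain the alias, then retry */
}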
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee0bb41694b0..74fae2daf87e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -25,15 +25,15 @@
25/* 25/*
26 * tunables 26 * tunables
27 */ 27 */
28static int cfq_quantum = 4; /* max queue in one round of service */ 28static const int cfq_quantum = 4; /* max queue in one round of service */
29static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ 29static const int cfq_queued = 8; /* minimum rq allocate limit per-queue*/
30static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 30static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
31static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ 31static const int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */
32static int cfq_back_penalty = 2; /* penalty of a backwards seek */ 32static const int cfq_back_penalty = 2; /* penalty of a backwards seek */
33 33
34static int cfq_slice_sync = HZ / 10; 34static const int cfq_slice_sync = HZ / 10;
35static int cfq_slice_async = HZ / 25; 35static int cfq_slice_async = HZ / 25;
36static int cfq_slice_async_rq = 2; 36static const int cfq_slice_async_rq = 2;
37static int cfq_slice_idle = HZ / 100; 37static int cfq_slice_idle = HZ / 100;
38 38
39#define CFQ_IDLE_GRACE (HZ / 10) 39#define CFQ_IDLE_GRACE (HZ / 10)
@@ -45,7 +45,7 @@ static int cfq_slice_idle = HZ / 100;
45/* 45/*
46 * disable queueing at the driver/hardware level 46 * disable queueing at the driver/hardware level
47 */ 47 */
48static int cfq_max_depth = 2; 48static const int cfq_max_depth = 2;
49 49
50/* 50/*
51 * for the hash of cfqq inside the cfqd 51 * for the hash of cfqq inside the cfqd
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 9cbec09e8415..27e494b1bf97 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -19,10 +19,10 @@
19/* 19/*
20 * See Documentation/block/deadline-iosched.txt 20 * See Documentation/block/deadline-iosched.txt
21 */ 21 */
22static int read_expire = HZ / 2; /* max time before a read is submitted. */ 22static const int read_expire = HZ / 2; /* max time before a read is submitted. */
23static int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ 23static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
24static int writes_starved = 2; /* max times reads can starve a write */ 24static const int writes_starved = 2; /* max times reads can starve a write */
25static int fifo_batch = 16; /* # of sequential requests treated as one 25static const int fifo_batch = 16; /* # of sequential requests treated as one
26 by the above parameters. For throughput. */ 26 by the above parameters. For throughput. */
27 27
28static const int deadline_hash_shift = 5; 28static const int deadline_hash_shift = 5;
diff --git a/block/elevator.c b/block/elevator.c
index 6c3fc8a10bf2..39dcccc82ada 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -304,15 +304,7 @@ void elv_requeue_request(request_queue_t *q, struct request *rq)
304 304
305 rq->flags &= ~REQ_STARTED; 305 rq->flags &= ~REQ_STARTED;
306 306
307 /* 307 __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE, 0);
308 * if this is the flush, requeue the original instead and drop the flush
309 */
310 if (rq->flags & REQ_BAR_FLUSH) {
311 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
312 rq = rq->end_io_data;
313 }
314
315 __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
316} 308}
317 309
318static void elv_drain_elevator(request_queue_t *q) 310static void elv_drain_elevator(request_queue_t *q)
@@ -332,8 +324,19 @@ static void elv_drain_elevator(request_queue_t *q)
332void __elv_add_request(request_queue_t *q, struct request *rq, int where, 324void __elv_add_request(request_queue_t *q, struct request *rq, int where,
333 int plug) 325 int plug)
334{ 326{
327 struct list_head *pos;
328 unsigned ordseq;
329
330 if (q->ordcolor)
331 rq->flags |= REQ_ORDERED_COLOR;
332
335 if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { 333 if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
336 /* 334 /*
335 * toggle ordered color
336 */
337 q->ordcolor ^= 1;
338
339 /*
337 * barriers implicitly indicate back insertion 340 * barriers implicitly indicate back insertion
338 */ 341 */
339 if (where == ELEVATOR_INSERT_SORT) 342 if (where == ELEVATOR_INSERT_SORT)
@@ -393,6 +396,30 @@ void __elv_add_request(request_queue_t *q, struct request *rq, int where,
393 q->elevator->ops->elevator_add_req_fn(q, rq); 396 q->elevator->ops->elevator_add_req_fn(q, rq);
394 break; 397 break;
395 398
399 case ELEVATOR_INSERT_REQUEUE:
400 /*
401 * If ordered flush isn't in progress, we do front
402 * insertion; otherwise, requests should be requeued
403 * in ordseq order.
404 */
405 rq->flags |= REQ_SOFTBARRIER;
406
407 if (q->ordseq == 0) {
408 list_add(&rq->queuelist, &q->queue_head);
409 break;
410 }
411
412 ordseq = blk_ordered_req_seq(rq);
413
414 list_for_each(pos, &q->queue_head) {
415 struct request *pos_rq = list_entry_rq(pos);
416 if (ordseq <= blk_ordered_req_seq(pos_rq))
417 break;
418 }
419
420 list_add_tail(&rq->queuelist, pos);
421 break;
422
396 default: 423 default:
397 printk(KERN_ERR "%s: bad insertion point %d\n", 424 printk(KERN_ERR "%s: bad insertion point %d\n",
398 __FUNCTION__, where); 425 __FUNCTION__, where);
@@ -422,25 +449,16 @@ static inline struct request *__elv_next_request(request_queue_t *q)
422{ 449{
423 struct request *rq; 450 struct request *rq;
424 451
425 if (unlikely(list_empty(&q->queue_head) && 452 while (1) {
426 !q->elevator->ops->elevator_dispatch_fn(q, 0))) 453 while (!list_empty(&q->queue_head)) {
427 return NULL; 454 rq = list_entry_rq(q->queue_head.next);
428 455 if (blk_do_ordered(q, &rq))
429 rq = list_entry_rq(q->queue_head.next); 456 return rq;
430 457 }
431 /*
432 * if this is a barrier write and the device has to issue a
433 * flush sequence to support it, check how far we are
434 */
435 if (blk_fs_request(rq) && blk_barrier_rq(rq)) {
436 BUG_ON(q->ordered == QUEUE_ORDERED_NONE);
437 458
438 if (q->ordered == QUEUE_ORDERED_FLUSH && 459 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
439 !blk_barrier_preflush(rq)) 460 return NULL;
440 rq = blk_start_pre_flush(q, rq);
441 } 461 }
442
443 return rq;
444} 462}
445 463
446struct request *elv_next_request(request_queue_t *q) 464struct request *elv_next_request(request_queue_t *q)
@@ -498,7 +516,7 @@ struct request *elv_next_request(request_queue_t *q)
498 blkdev_dequeue_request(rq); 516 blkdev_dequeue_request(rq);
499 rq->flags |= REQ_QUIET; 517 rq->flags |= REQ_QUIET;
500 end_that_request_chunk(rq, 0, nr_bytes); 518 end_that_request_chunk(rq, 0, nr_bytes);
501 end_that_request_last(rq); 519 end_that_request_last(rq, 0);
502 } else { 520 } else {
503 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__, 521 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
504 ret); 522 ret);
@@ -593,7 +611,21 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
593 * request is released from the driver, io must be done 611 * request is released from the driver, io must be done
594 */ 612 */
595 if (blk_account_rq(rq)) { 613 if (blk_account_rq(rq)) {
614 struct request *first_rq = list_entry_rq(q->queue_head.next);
615
596 q->in_flight--; 616 q->in_flight--;
617
618 /*
619 * Check if the queue is waiting for fs requests to be
620 * drained for flush sequence.
621 */
622 if (q->ordseq && q->in_flight == 0 &&
623 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
624 blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
625 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
626 q->request_fn(q);
627 }
628
597 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) 629 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
598 e->ops->elevator_completed_req_fn(q, rq); 630 e->ops->elevator_completed_req_fn(q, rq);
599 } 631 }
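
A note on REQ_ORDERED_COLOR, introduced at the top of this file's changes: the queue keeps a one-bit epoch that flips at every barrier, so completion code can tell whether a request was issued before or after the current barrier without per-request sequence numbers. The idea in isolation (sketch; names are hypothetical):

struct req { unsigned color : 1; };

static unsigned qcolor;			/* flips once per barrier */

static void tag_request(struct req *r, int is_barrier)
{
	r->color = qcolor;		/* inherit the current epoch */
	if (is_barrier)
		qcolor ^= 1;		/* requests after this start a new one */
}

static int issued_before_barrier(const struct req *r, const struct req *bar)
{
	return r->color == bar->color;	/* same epoch as the barrier: it must
					 * drain before the barrier runs */
}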
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index d4beb9a89ee0..91d3b4828c49 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -36,6 +36,8 @@
36static void blk_unplug_work(void *data); 36static void blk_unplug_work(void *data);
37static void blk_unplug_timeout(unsigned long data); 37static void blk_unplug_timeout(unsigned long data);
38static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); 38static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
39static void init_request_from_bio(struct request *req, struct bio *bio);
40static int __make_request(request_queue_t *q, struct bio *bio);
39 41
40/* 42/*
41 * For the allocated request tables 43 * For the allocated request tables
@@ -288,8 +290,8 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
288 290
289/** 291/**
290 * blk_queue_ordered - does this queue support ordered writes 292 * blk_queue_ordered - does this queue support ordered writes
291 * @q: the request queue 293 * @q: the request queue
292 * @flag: see below 294 * @ordered: one of QUEUE_ORDERED_*
293 * 295 *
294 * Description: 296 * Description:
295 * For journalled file systems, doing ordered writes on a commit 297 * For journalled file systems, doing ordered writes on a commit
@@ -298,28 +300,30 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
298 * feature should call this function and indicate so. 300 * feature should call this function and indicate so.
299 * 301 *
300 **/ 302 **/
301void blk_queue_ordered(request_queue_t *q, int flag) 303int blk_queue_ordered(request_queue_t *q, unsigned ordered,
302{ 304 prepare_flush_fn *prepare_flush_fn)
303 switch (flag) { 305{
304 case QUEUE_ORDERED_NONE: 306 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
305 if (q->flush_rq) 307 prepare_flush_fn == NULL) {
306 kmem_cache_free(request_cachep, q->flush_rq); 308 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
307 q->flush_rq = NULL; 309 return -EINVAL;
308 q->ordered = flag; 310 }
309 break; 311
310 case QUEUE_ORDERED_TAG: 312 if (ordered != QUEUE_ORDERED_NONE &&
311 q->ordered = flag; 313 ordered != QUEUE_ORDERED_DRAIN &&
312 break; 314 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
313 case QUEUE_ORDERED_FLUSH: 315 ordered != QUEUE_ORDERED_DRAIN_FUA &&
314 q->ordered = flag; 316 ordered != QUEUE_ORDERED_TAG &&
315 if (!q->flush_rq) 317 ordered != QUEUE_ORDERED_TAG_FLUSH &&
316 q->flush_rq = kmem_cache_alloc(request_cachep, 318 ordered != QUEUE_ORDERED_TAG_FUA) {
317 GFP_KERNEL); 319 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
318 break; 320 return -EINVAL;
319 default:
320 printk("blk_queue_ordered: bad value %d\n", flag);
321 break;
322 } 321 }
322
323 q->next_ordered = ordered;
324 q->prepare_flush_fn = prepare_flush_fn;
325
326 return 0;
323} 327}
324 328
325EXPORT_SYMBOL(blk_queue_ordered); 329EXPORT_SYMBOL(blk_queue_ordered);
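
From a driver author's point of view, the reworked blk_queue_ordered() above is declarative: pass the strongest ordering mode the hardware supports, plus a prepare_flush_fn when pre/post flushes are requested. A hypothetical caller (sketch; the opcode and function names are made up, not from the patch):

#include <linux/blkdev.h>

static void mydrv_prepare_flush(request_queue_t *q, struct request *rq)
{
	rq->cmd[0] = 0x35;	/* e.g. SCSI SYNCHRONIZE CACHE; illustrative */
}

static int mydrv_init_queue(request_queue_t *q)
{
	/* write-back cache that can flush: drain, pre-flush, write,
	 * post-flush */
	return blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
				 mydrv_prepare_flush);
}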
@@ -344,167 +348,265 @@ EXPORT_SYMBOL(blk_queue_issue_flush_fn);
344/* 348/*
345 * Cache flushing for ordered writes handling 349 * Cache flushing for ordered writes handling
346 */ 350 */
347static void blk_pre_flush_end_io(struct request *flush_rq) 351inline unsigned blk_ordered_cur_seq(request_queue_t *q)
348{ 352{
349 struct request *rq = flush_rq->end_io_data; 353 if (!q->ordseq)
350 request_queue_t *q = rq->q; 354 return 0;
351 355 return 1 << ffz(q->ordseq);
352 elv_completed_request(q, flush_rq);
353
354 rq->flags |= REQ_BAR_PREFLUSH;
355
356 if (!flush_rq->errors)
357 elv_requeue_request(q, rq);
358 else {
359 q->end_flush_fn(q, flush_rq);
360 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
361 q->request_fn(q);
362 }
363} 356}
364 357
365static void blk_post_flush_end_io(struct request *flush_rq) 358unsigned blk_ordered_req_seq(struct request *rq)
366{ 359{
367 struct request *rq = flush_rq->end_io_data;
368 request_queue_t *q = rq->q; 360 request_queue_t *q = rq->q;
369 361
370 elv_completed_request(q, flush_rq); 362 BUG_ON(q->ordseq == 0);
371 363
372 rq->flags |= REQ_BAR_POSTFLUSH; 364 if (rq == &q->pre_flush_rq)
365 return QUEUE_ORDSEQ_PREFLUSH;
366 if (rq == &q->bar_rq)
367 return QUEUE_ORDSEQ_BAR;
368 if (rq == &q->post_flush_rq)
369 return QUEUE_ORDSEQ_POSTFLUSH;
373 370
374 q->end_flush_fn(q, flush_rq); 371 if ((rq->flags & REQ_ORDERED_COLOR) ==
375 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags); 372 (q->orig_bar_rq->flags & REQ_ORDERED_COLOR))
376 q->request_fn(q); 373 return QUEUE_ORDSEQ_DRAIN;
374 else
375 return QUEUE_ORDSEQ_DONE;
377} 376}
378 377
379struct request *blk_start_pre_flush(request_queue_t *q, struct request *rq) 378void blk_ordered_complete_seq(request_queue_t *q, unsigned seq, int error)
380{ 379{
381 struct request *flush_rq = q->flush_rq; 380 struct request *rq;
382 381 int uptodate;
383 BUG_ON(!blk_barrier_rq(rq));
384 382
385 if (test_and_set_bit(QUEUE_FLAG_FLUSH, &q->queue_flags)) 383 if (error && !q->orderr)
386 return NULL; 384 q->orderr = error;
387 385
388 rq_init(q, flush_rq); 386 BUG_ON(q->ordseq & seq);
389 flush_rq->elevator_private = NULL; 387 q->ordseq |= seq;
390 flush_rq->flags = REQ_BAR_FLUSH;
391 flush_rq->rq_disk = rq->rq_disk;
392 flush_rq->rl = NULL;
393 388
394 /* 389 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
395 * prepare_flush returns 0 if no flush is needed, just mark both 390 return;
396 * pre and post flush as done in that case
397 */
398 if (!q->prepare_flush_fn(q, flush_rq)) {
399 rq->flags |= REQ_BAR_PREFLUSH | REQ_BAR_POSTFLUSH;
400 clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
401 return rq;
402 }
403 391
404 /* 392 /*
405 * some drivers dequeue requests right away, some only after io 393 * Okay, sequence complete.
406 * completion. make sure the request is dequeued.
407 */ 394 */
408 if (!list_empty(&rq->queuelist)) 395 rq = q->orig_bar_rq;
409 blkdev_dequeue_request(rq); 396 uptodate = q->orderr ? q->orderr : 1;
410 397
411 flush_rq->end_io_data = rq; 398 q->ordseq = 0;
412 flush_rq->end_io = blk_pre_flush_end_io;
413 399
414 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 400 end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
415 return flush_rq; 401 end_that_request_last(rq, uptodate);
416} 402}
417 403
418static void blk_start_post_flush(request_queue_t *q, struct request *rq) 404static void pre_flush_end_io(struct request *rq, int error)
419{ 405{
420 struct request *flush_rq = q->flush_rq; 406 elv_completed_request(rq->q, rq);
407 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
408}
421 409
422 BUG_ON(!blk_barrier_rq(rq)); 410static void bar_end_io(struct request *rq, int error)
411{
412 elv_completed_request(rq->q, rq);
413 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
414}
423 415
424 rq_init(q, flush_rq); 416static void post_flush_end_io(struct request *rq, int error)
425 flush_rq->elevator_private = NULL; 417{
426 flush_rq->flags = REQ_BAR_FLUSH; 418 elv_completed_request(rq->q, rq);
427 flush_rq->rq_disk = rq->rq_disk; 419 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
428 flush_rq->rl = NULL; 420}
429 421
430 if (q->prepare_flush_fn(q, flush_rq)) { 422static void queue_flush(request_queue_t *q, unsigned which)
431 flush_rq->end_io_data = rq; 423{
432 flush_rq->end_io = blk_post_flush_end_io; 424 struct request *rq;
425 rq_end_io_fn *end_io;
433 426
434 __elv_add_request(q, flush_rq, ELEVATOR_INSERT_FRONT, 0); 427 if (which == QUEUE_ORDERED_PREFLUSH) {
435 q->request_fn(q); 428 rq = &q->pre_flush_rq;
429 end_io = pre_flush_end_io;
430 } else {
431 rq = &q->post_flush_rq;
432 end_io = post_flush_end_io;
436 } 433 }
434
435 rq_init(q, rq);
436 rq->flags = REQ_HARDBARRIER;
437 rq->elevator_private = NULL;
438 rq->rq_disk = q->bar_rq.rq_disk;
439 rq->rl = NULL;
440 rq->end_io = end_io;
441 q->prepare_flush_fn(q, rq);
442
443 __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
437} 444}
438 445
439static inline int blk_check_end_barrier(request_queue_t *q, struct request *rq, 446static inline struct request *start_ordered(request_queue_t *q,
440 int sectors) 447 struct request *rq)
441{ 448{
442 if (sectors > rq->nr_sectors) 449 q->bi_size = 0;
443 sectors = rq->nr_sectors; 450 q->orderr = 0;
451 q->ordered = q->next_ordered;
452 q->ordseq |= QUEUE_ORDSEQ_STARTED;
453
454 /*
455 * Prep proxy barrier request.
456 */
457 blkdev_dequeue_request(rq);
458 q->orig_bar_rq = rq;
459 rq = &q->bar_rq;
460 rq_init(q, rq);
461 rq->flags = bio_data_dir(q->orig_bar_rq->bio);
462 rq->flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
463 rq->elevator_private = NULL;
464 rq->rl = NULL;
465 init_request_from_bio(rq, q->orig_bar_rq->bio);
466 rq->end_io = bar_end_io;
467
468 /*
469 * Queue ordered sequence. As we stack them at the head, we
470 * need to queue in reverse order. Note that we rely on the fact
471 * that no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
472 * request gets in between the ordered sequence.
473 */
474 if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
475 queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
476 else
477 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
478
479 __elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
480
481 if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
482 queue_flush(q, QUEUE_ORDERED_PREFLUSH);
483 rq = &q->pre_flush_rq;
484 } else
485 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
444 486
445 rq->nr_sectors -= sectors; 487 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
446 return rq->nr_sectors; 488 q->ordseq |= QUEUE_ORDSEQ_DRAIN;
489 else
490 rq = NULL;
491
492 return rq;
447} 493}
448 494
449static int __blk_complete_barrier_rq(request_queue_t *q, struct request *rq, 495int blk_do_ordered(request_queue_t *q, struct request **rqp)
450 int sectors, int queue_locked)
451{ 496{
452 if (q->ordered != QUEUE_ORDERED_FLUSH) 497 struct request *rq = *rqp, *allowed_rq;
453 return 0; 498 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
454 if (!blk_fs_request(rq) || !blk_barrier_rq(rq))
455 return 0;
456 if (blk_barrier_postflush(rq))
457 return 0;
458 499
459 if (!blk_check_end_barrier(q, rq, sectors)) { 500 if (!q->ordseq) {
460 unsigned long flags = 0; 501 if (!is_barrier)
502 return 1;
461 503
462 if (!queue_locked) 504 if (q->next_ordered != QUEUE_ORDERED_NONE) {
463 spin_lock_irqsave(q->queue_lock, flags); 505 *rqp = start_ordered(q, rq);
506 return 1;
507 } else {
508 /*
509 * This can happen when the queue switches to
510 * ORDERED_NONE while this request is on it.
511 */
512 blkdev_dequeue_request(rq);
513 end_that_request_first(rq, -EOPNOTSUPP,
514 rq->hard_nr_sectors);
515 end_that_request_last(rq, -EOPNOTSUPP);
516 *rqp = NULL;
517 return 0;
518 }
519 }
464 520
465 blk_start_post_flush(q, rq); 521 if (q->ordered & QUEUE_ORDERED_TAG) {
522 if (is_barrier && rq != &q->bar_rq)
523 *rqp = NULL;
524 return 1;
525 }
466 526
467 if (!queue_locked) 527 switch (blk_ordered_cur_seq(q)) {
468 spin_unlock_irqrestore(q->queue_lock, flags); 528 case QUEUE_ORDSEQ_PREFLUSH:
529 allowed_rq = &q->pre_flush_rq;
530 break;
531 case QUEUE_ORDSEQ_BAR:
532 allowed_rq = &q->bar_rq;
533 break;
534 case QUEUE_ORDSEQ_POSTFLUSH:
535 allowed_rq = &q->post_flush_rq;
536 break;
537 default:
538 allowed_rq = NULL;
539 break;
469 } 540 }
470 541
542 if (rq != allowed_rq &&
543 (blk_fs_request(rq) || rq == &q->pre_flush_rq ||
544 rq == &q->post_flush_rq))
545 *rqp = NULL;
546
471 return 1; 547 return 1;
472} 548}
473 549
474/** 550static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error)
475 * blk_complete_barrier_rq - complete possible barrier request
476 * @q: the request queue for the device
477 * @rq: the request
478 * @sectors: number of sectors to complete
479 *
480 * Description:
481 * Used in driver end_io handling to determine whether to postpone
482 * completion of a barrier request until a post flush has been done. This
483 * is the unlocked variant, used if the caller doesn't already hold the
484 * queue lock.
485 **/
486int blk_complete_barrier_rq(request_queue_t *q, struct request *rq, int sectors)
487{ 551{
488 return __blk_complete_barrier_rq(q, rq, sectors, 0); 552 request_queue_t *q = bio->bi_private;
553 struct bio_vec *bvec;
554 int i;
555
556 /*
557 * This is a dry run; restore bi_sector and bi_size. We'll finish
558 * this request again with the original bi_end_io after an
559 * error occurs or post flush is complete.
560 */
561 q->bi_size += bytes;
562
563 if (bio->bi_size)
564 return 1;
565
566 /* Rewind bvec's */
567 bio->bi_idx = 0;
568 bio_for_each_segment(bvec, bio, i) {
569 bvec->bv_len += bvec->bv_offset;
570 bvec->bv_offset = 0;
571 }
572
573 /* Reset bio */
574 set_bit(BIO_UPTODATE, &bio->bi_flags);
575 bio->bi_size = q->bi_size;
576 bio->bi_sector -= (q->bi_size >> 9);
577 q->bi_size = 0;
578
579 return 0;
489} 580}
490EXPORT_SYMBOL(blk_complete_barrier_rq);
491 581
492/** 582static inline int ordered_bio_endio(struct request *rq, struct bio *bio,
493 * blk_complete_barrier_rq_locked - complete possible barrier request 583 unsigned int nbytes, int error)
494 * @q: the request queue for the device
495 * @rq: the request
496 * @sectors: number of sectors to complete
497 *
498 * Description:
499 * See blk_complete_barrier_rq(). This variant must be used if the caller
500 * holds the queue lock.
501 **/
502int blk_complete_barrier_rq_locked(request_queue_t *q, struct request *rq,
503 int sectors)
504{ 584{
505 return __blk_complete_barrier_rq(q, rq, sectors, 1); 585 request_queue_t *q = rq->q;
586 bio_end_io_t *endio;
587 void *private;
588
589 if (&q->bar_rq != rq)
590 return 0;
591
592 /*
593 * Okay, this is the barrier request in progress, dry finish it.
594 */
595 if (error && !q->orderr)
596 q->orderr = error;
597
598 endio = bio->bi_end_io;
599 private = bio->bi_private;
600 bio->bi_end_io = flush_dry_bio_endio;
601 bio->bi_private = q;
602
603 bio_endio(bio, nbytes, error);
604
605 bio->bi_end_io = endio;
606 bio->bi_private = private;
607
608 return 1;
506} 609}
507EXPORT_SYMBOL(blk_complete_barrier_rq_locked);
508 610
509/** 611/**
510 * blk_queue_bounce_limit - set bounce buffer limit for queue 612 * blk_queue_bounce_limit - set bounce buffer limit for queue
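
ordered_bio_endio() above uses a small trampoline: it saves the bio's real bi_end_io/bi_private, swaps in flush_dry_bio_endio for a "dry" completion, then restores the originals so the request can still be finished for real later. The save/override/restore shape in miniature (hypothetical types; bio_endio() dispatches through bi_end_io, hence the indirection):

typedef void (*endio_fn)(int error);

struct completion_obj {
	endio_fn end_io;	/* the real completion handler */
};

static void dry_end_io(int error)
{
	(void) error;		/* record the outcome, complete nothing */
}

static void fire(struct completion_obj *o, int error)
{
	o->end_io(error);	/* like bio_endio(): always goes via end_io */
}

static void dry_complete(struct completion_obj *o, int error)
{
	endio_fn saved = o->end_io;	/* stash the real handler */

	o->end_io = dry_end_io;
	fire(o, error);			/* dry run */
	o->end_io = saved;		/* restore for the real completion */
}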
@@ -1039,12 +1141,13 @@ void blk_queue_invalidate_tags(request_queue_t *q)
1039 1141
1040EXPORT_SYMBOL(blk_queue_invalidate_tags); 1142EXPORT_SYMBOL(blk_queue_invalidate_tags);
1041 1143
1042static char *rq_flags[] = { 1144static const char * const rq_flags[] = {
1043 "REQ_RW", 1145 "REQ_RW",
1044 "REQ_FAILFAST", 1146 "REQ_FAILFAST",
1045 "REQ_SORTED", 1147 "REQ_SORTED",
1046 "REQ_SOFTBARRIER", 1148 "REQ_SOFTBARRIER",
1047 "REQ_HARDBARRIER", 1149 "REQ_HARDBARRIER",
1150 "REQ_FUA",
1048 "REQ_CMD", 1151 "REQ_CMD",
1049 "REQ_NOMERGE", 1152 "REQ_NOMERGE",
1050 "REQ_STARTED", 1153 "REQ_STARTED",
@@ -1064,6 +1167,7 @@ static char *rq_flags[] = {
1064 "REQ_PM_SUSPEND", 1167 "REQ_PM_SUSPEND",
1065 "REQ_PM_RESUME", 1168 "REQ_PM_RESUME",
1066 "REQ_PM_SHUTDOWN", 1169 "REQ_PM_SHUTDOWN",
1170 "REQ_ORDERED_COLOR",
1067}; 1171};
1068 1172
1069void blk_dump_rq_flags(struct request *rq, char *msg) 1173void blk_dump_rq_flags(struct request *rq, char *msg)
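
rq_flags[] exists solely for blk_dump_rq_flags(): bit n of rq->flags selects entry n of the table, which is why REQ_FUA and REQ_ORDERED_COLOR must be inserted at exactly the positions of their flag bits. A compilable sketch of that walk (table abbreviated; only the bit-to-index pattern matters):

#include <stdio.h>

static const char * const rq_flags[] = {
	"REQ_RW", "REQ_FAILFAST", "REQ_SORTED", "REQ_SOFTBARRIER",
	"REQ_HARDBARRIER", "REQ_FUA", "REQ_CMD", "REQ_NOMERGE",
};

static void dump_rq_flags(unsigned long flags, const char *msg)
{
	unsigned int i;

	printf("%s: flags =", msg);
	for (i = 0; i < sizeof(rq_flags) / sizeof(rq_flags[0]); i++)
		if (flags & (1UL << i))
			printf(" %s", rq_flags[i]);
	printf("\n");
}

int main(void)
{
	dump_rq_flags((1UL << 4) | (1UL << 5), "demo"); /* HARDBARRIER|FUA */
	return 0;
}
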
@@ -1641,8 +1745,6 @@ void blk_cleanup_queue(request_queue_t * q)
1641 if (q->queue_tags) 1745 if (q->queue_tags)
1642 __blk_queue_free_tags(q); 1746 __blk_queue_free_tags(q);
1643 1747
1644 blk_queue_ordered(q, QUEUE_ORDERED_NONE);
1645
1646 kmem_cache_free(requestq_cachep, q); 1748 kmem_cache_free(requestq_cachep, q);
1647} 1749}
1648 1750
@@ -1667,8 +1769,6 @@ static int blk_init_free_list(request_queue_t *q)
1667 return 0; 1769 return 0;
1668} 1770}
1669 1771
1670static int __make_request(request_queue_t *, struct bio *);
1671
1672request_queue_t *blk_alloc_queue(gfp_t gfp_mask) 1772request_queue_t *blk_alloc_queue(gfp_t gfp_mask)
1673{ 1773{
1674 return blk_alloc_queue_node(gfp_mask, -1); 1774 return blk_alloc_queue_node(gfp_mask, -1);
@@ -1908,40 +2008,40 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
1908{ 2008{
1909 struct request *rq = NULL; 2009 struct request *rq = NULL;
1910 struct request_list *rl = &q->rq; 2010 struct request_list *rl = &q->rq;
1911 struct io_context *ioc = current_io_context(GFP_ATOMIC); 2011 struct io_context *ioc = NULL;
1912 int priv; 2012 int may_queue, priv;
1913 2013
1914 if (rl->count[rw]+1 >= q->nr_requests) { 2014 may_queue = elv_may_queue(q, rw, bio);
1915 /* 2015 if (may_queue == ELV_MQUEUE_NO)
1916 * The queue will fill after this allocation, so set it as 2016 goto rq_starved;
1917 * full, and mark this process as "batching". This process
1918 * will be allowed to complete a batch of requests, others
1919 * will be blocked.
1920 */
1921 if (!blk_queue_full(q, rw)) {
1922 ioc_set_batching(q, ioc);
1923 blk_set_queue_full(q, rw);
1924 }
1925 }
1926 2017
1927 switch (elv_may_queue(q, rw, bio)) { 2018 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
1928 case ELV_MQUEUE_NO: 2019 if (rl->count[rw]+1 >= q->nr_requests) {
1929 goto rq_starved; 2020 ioc = current_io_context(GFP_ATOMIC);
1930 case ELV_MQUEUE_MAY: 2021 /*
1931 break; 2022 * The queue will fill after this allocation, so set
1932 case ELV_MQUEUE_MUST: 2023 * it as full, and mark this process as "batching".
1933 goto get_rq; 2024 * This process will be allowed to complete a batch of
1934 } 2025 * requests, others will be blocked.
1935 2026 */
1936 if (blk_queue_full(q, rw) && !ioc_batching(q, ioc)) { 2027 if (!blk_queue_full(q, rw)) {
1937 /* 2028 ioc_set_batching(q, ioc);
1938 * The queue is full and the allocating process is not a 2029 blk_set_queue_full(q, rw);
1939 * "batcher", and not exempted by the IO scheduler 2030 } else {
1940 */ 2031 if (may_queue != ELV_MQUEUE_MUST
1941 goto out; 2032 && !ioc_batching(q, ioc)) {
2033 /*
2034 * The queue is full and the allocating
2035 * process is not a "batcher", and not
2036 * exempted by the IO scheduler
2037 */
2038 goto out;
2039 }
2040 }
2041 }
2042 set_queue_congested(q, rw);
1942 } 2043 }
1943 2044
1944get_rq:
1945 /* 2045 /*
1946 * Only allow batching queuers to allocate up to 50% over the defined 2046 * Only allow batching queuers to allocate up to 50% over the defined
1947 * limit of requests, otherwise we could have thousands of requests 2047 * limit of requests, otherwise we could have thousands of requests
@@ -1952,8 +2052,6 @@ get_rq:
1952 2052
1953 rl->count[rw]++; 2053 rl->count[rw]++;
1954 rl->starved[rw] = 0; 2054 rl->starved[rw] = 0;
1955 if (rl->count[rw] >= queue_congestion_on_threshold(q))
1956 set_queue_congested(q, rw);
1957 2055
1958 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 2056 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1959 if (priv) 2057 if (priv)
@@ -1962,7 +2060,7 @@ get_rq:
1962 spin_unlock_irq(q->queue_lock); 2060 spin_unlock_irq(q->queue_lock);
1963 2061
1964 rq = blk_alloc_request(q, rw, bio, priv, gfp_mask); 2062 rq = blk_alloc_request(q, rw, bio, priv, gfp_mask);
1965 if (!rq) { 2063 if (unlikely(!rq)) {
1966 /* 2064 /*
1967 * Allocation failed presumably due to memory. Undo anything 2065 * Allocation failed presumably due to memory. Undo anything
1968 * we might have messed up. 2066 * we might have messed up.
@@ -1987,6 +2085,12 @@ rq_starved:
1987 goto out; 2085 goto out;
1988 } 2086 }
1989 2087
2088 /*
2089 * ioc may be NULL here, and ioc_batching will be false. That's
2090 * OK: if the queue is under the request limit, requests need
2091 * not count toward the nr_batch_requests limit. There will always
2092 * be some limit enforced by BLK_BATCH_TIME.
2093 */
1990 if (ioc_batching(q, ioc)) 2094 if (ioc_batching(q, ioc))
1991 ioc->nr_batch_requests--; 2095 ioc->nr_batch_requests--;
1992 2096
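
To summarize the reordering in this hunk: elv_may_queue() is consulted before anything else, the congestion flag is set on the way into the slow path rather than after a successful allocation, and current_io_context() is looked up only once the queue is actually near its limit. A self-contained model of the resulting admission decision (field names and thresholds are illustrative):

#include <stdio.h>

enum { ELV_MQUEUE_MAY, ELV_MQUEUE_NO, ELV_MQUEUE_MUST };

struct queue_model {
	int count;		/* requests currently allocated */
	int nr_requests;	/* hard limit */
	int congestion_on;	/* congestion threshold */
	int full;		/* queue marked full */
	int congested;
};

/* Returns 1 if this caller may allocate a request now. */
static int may_allocate(struct queue_model *q, int may_queue, int batching)
{
	if (may_queue == ELV_MQUEUE_NO)
		return 0;

	if (q->count + 1 >= q->congestion_on) {
		if (q->count + 1 >= q->nr_requests) {
			if (!q->full)
				q->full = 1;	/* first one in marks it full */
			else if (may_queue != ELV_MQUEUE_MUST && !batching)
				return 0;	/* full, not exempt, not a batcher */
		}
		q->congested = 1;	/* now set before the allocation */
	}
	return 1;
}

int main(void)
{
	struct queue_model q = { 127, 128, 113, 0, 0 };

	printf("%d\n", may_allocate(&q, ELV_MQUEUE_MAY, 0));	/* 1: marks full */
	q.count++;
	printf("%d\n", may_allocate(&q, ELV_MQUEUE_MAY, 0));	/* 0: queue full */
	printf("%d\n", may_allocate(&q, ELV_MQUEUE_MAY, 1));	/* 1: batcher */
	return 0;
}
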
@@ -2313,7 +2417,7 @@ EXPORT_SYMBOL(blk_rq_map_kern);
2313 */ 2417 */
2314void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk, 2418void blk_execute_rq_nowait(request_queue_t *q, struct gendisk *bd_disk,
2315 struct request *rq, int at_head, 2419 struct request *rq, int at_head,
2316 void (*done)(struct request *)) 2420 rq_end_io_fn *done)
2317{ 2421{
2318 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2422 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2319 2423
@@ -2517,7 +2621,7 @@ EXPORT_SYMBOL(blk_put_request);
2517 * blk_end_sync_rq - executes a completion event on a request 2621 * blk_end_sync_rq - executes a completion event on a request
2518 * @rq: request to complete 2622 * @rq: request to complete
2519 */ 2623 */
2520void blk_end_sync_rq(struct request *rq) 2624void blk_end_sync_rq(struct request *rq, int error)
2521{ 2625{
2522 struct completion *waiting = rq->waiting; 2626 struct completion *waiting = rq->waiting;
2523 2627
@@ -2655,6 +2759,36 @@ void blk_attempt_remerge(request_queue_t *q, struct request *rq)
2655 2759
2656EXPORT_SYMBOL(blk_attempt_remerge); 2760EXPORT_SYMBOL(blk_attempt_remerge);
2657 2761
2762static void init_request_from_bio(struct request *req, struct bio *bio)
2763{
2764 req->flags |= REQ_CMD;
2765
2766 /*
2767 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2768 */
2769 if (bio_rw_ahead(bio) || bio_failfast(bio))
2770 req->flags |= REQ_FAILFAST;
2771
2772 /*
2773 * REQ_BARRIER implies no merging, but let's make it explicit
2774 */
2775 if (unlikely(bio_barrier(bio)))
2776 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2777
2778 req->errors = 0;
2779 req->hard_sector = req->sector = bio->bi_sector;
2780 req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
2781 req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
2782 req->nr_phys_segments = bio_phys_segments(req->q, bio);
2783 req->nr_hw_segments = bio_hw_segments(req->q, bio);
2784 req->buffer = bio_data(bio); /* see ->buffer comment above */
2785 req->waiting = NULL;
2786 req->bio = req->biotail = bio;
2787 req->ioprio = bio_prio(bio);
2788 req->rq_disk = bio->bi_bdev->bd_disk;
2789 req->start_time = jiffies;
2790}
2791
2658static int __make_request(request_queue_t *q, struct bio *bio) 2792static int __make_request(request_queue_t *q, struct bio *bio)
2659{ 2793{
2660 struct request *req; 2794 struct request *req;
@@ -2680,7 +2814,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
2680 spin_lock_prefetch(q->queue_lock); 2814 spin_lock_prefetch(q->queue_lock);
2681 2815
2682 barrier = bio_barrier(bio); 2816 barrier = bio_barrier(bio);
2683 if (unlikely(barrier) && (q->ordered == QUEUE_ORDERED_NONE)) { 2817 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
2684 err = -EOPNOTSUPP; 2818 err = -EOPNOTSUPP;
2685 goto end_io; 2819 goto end_io;
2686 } 2820 }
@@ -2750,33 +2884,7 @@ get_rq:
2750 * We don't worry about that case for efficiency. It won't happen 2884 * We don't worry about that case for efficiency. It won't happen
2751 * often, and the elevators are able to handle it. 2885 * often, and the elevators are able to handle it.
2752 */ 2886 */
2753 2887 init_request_from_bio(req, bio);
2754 req->flags |= REQ_CMD;
2755
2756 /*
2757 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2758 */
2759 if (bio_rw_ahead(bio) || bio_failfast(bio))
2760 req->flags |= REQ_FAILFAST;
2761
2762 /*
2763 * REQ_BARRIER implies no merging, but let's make it explicit
2764 */
2765 if (unlikely(barrier))
2766 req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2767
2768 req->errors = 0;
2769 req->hard_sector = req->sector = sector;
2770 req->hard_nr_sectors = req->nr_sectors = nr_sectors;
2771 req->current_nr_sectors = req->hard_cur_sectors = cur_nr_sectors;
2772 req->nr_phys_segments = bio_phys_segments(q, bio);
2773 req->nr_hw_segments = bio_hw_segments(q, bio);
2774 req->buffer = bio_data(bio); /* see ->buffer comment above */
2775 req->waiting = NULL;
2776 req->bio = req->biotail = bio;
2777 req->ioprio = prio;
2778 req->rq_disk = bio->bi_bdev->bd_disk;
2779 req->start_time = jiffies;
2780 2888
2781 spin_lock_irq(q->queue_lock); 2889 spin_lock_irq(q->queue_lock);
2782 if (elv_queue_empty(q)) 2890 if (elv_queue_empty(q))
@@ -3067,7 +3175,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
3067 if (nr_bytes >= bio->bi_size) { 3175 if (nr_bytes >= bio->bi_size) {
3068 req->bio = bio->bi_next; 3176 req->bio = bio->bi_next;
3069 nbytes = bio->bi_size; 3177 nbytes = bio->bi_size;
3070 bio_endio(bio, nbytes, error); 3178 if (!ordered_bio_endio(req, bio, nbytes, error))
3179 bio_endio(bio, nbytes, error);
3071 next_idx = 0; 3180 next_idx = 0;
3072 bio_nbytes = 0; 3181 bio_nbytes = 0;
3073 } else { 3182 } else {
@@ -3122,7 +3231,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
3122 * if the request wasn't completed, update state 3231 * if the request wasn't completed, update state
3123 */ 3232 */
3124 if (bio_nbytes) { 3233 if (bio_nbytes) {
3125 bio_endio(bio, bio_nbytes, error); 3234 if (!ordered_bio_endio(req, bio, bio_nbytes, error))
3235 bio_endio(bio, bio_nbytes, error);
3126 bio->bi_idx += next_idx; 3236 bio->bi_idx += next_idx;
3127 bio_iovec(bio)->bv_offset += nr_bytes; 3237 bio_iovec(bio)->bv_offset += nr_bytes;
3128 bio_iovec(bio)->bv_len -= nr_bytes; 3238 bio_iovec(bio)->bv_len -= nr_bytes;
@@ -3179,9 +3289,17 @@ EXPORT_SYMBOL(end_that_request_chunk);
3179/* 3289/*
3180 * queue lock must be held 3290 * queue lock must be held
3181 */ 3291 */
3182void end_that_request_last(struct request *req) 3292void end_that_request_last(struct request *req, int uptodate)
3183{ 3293{
3184 struct gendisk *disk = req->rq_disk; 3294 struct gendisk *disk = req->rq_disk;
3295 int error;
3296
3297 /*
3298 * extend the uptodate bool to allow a < 0 value to carry a direct I/O error
3299 */
3300 error = 0;
3301 if (end_io_error(uptodate))
3302 error = !uptodate ? -EIO : uptodate;
3185 3303
3186 if (unlikely(laptop_mode) && blk_fs_request(req)) 3304 if (unlikely(laptop_mode) && blk_fs_request(req))
3187 laptop_io_completion(); 3305 laptop_io_completion();
@@ -3196,7 +3314,7 @@ void end_that_request_last(struct request *req)
3196 disk->in_flight--; 3314 disk->in_flight--;
3197 } 3315 }
3198 if (req->end_io) 3316 if (req->end_io)
3199 req->end_io(req); 3317 req->end_io(req, error);
3200 else 3318 else
3201 __blk_put_request(req->q, req); 3319 __blk_put_request(req->q, req);
3202} 3320}
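
end_that_request_last() keeps its historical int uptodate argument but now decodes it for ->end_io(): positive means success, zero a generic failure, and a negative value carries a specific errno from direct I/O. Assuming end_io_error() is the usual (uptodate <= 0) test of this kernel generation, the translation reduces to:

#include <assert.h>
#include <errno.h>

/* Sketch of the uptodate -> error mapping in end_that_request_last(). */
static int uptodate_to_error(int uptodate)
{
	if (uptodate <= 0)	/* what end_io_error() is assumed to test */
		return uptodate ? uptodate : -EIO;
	return 0;
}

int main(void)
{
	assert(uptodate_to_error(1) == 0);		/* success */
	assert(uptodate_to_error(0) == -EIO);		/* legacy "not uptodate" */
	assert(uptodate_to_error(-EINVAL) == -EINVAL);	/* errno passed through */
	return 0;
}
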
@@ -3208,7 +3326,7 @@ void end_request(struct request *req, int uptodate)
3208 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { 3326 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
3209 add_disk_randomness(req->rq_disk); 3327 add_disk_randomness(req->rq_disk);
3210 blkdev_dequeue_request(req); 3328 blkdev_dequeue_request(req);
3211 end_that_request_last(req); 3329 end_that_request_last(req, uptodate);
3212 } 3330 }
3213} 3331}
3214 3332
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 1d8852f7bbff..c2ac36dfe4f3 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -46,7 +46,7 @@ EXPORT_SYMBOL(scsi_command_size);
46 46
47static int sg_get_version(int __user *p) 47static int sg_get_version(int __user *p)
48{ 48{
49 static int sg_version_num = 30527; 49 static const int sg_version_num = 30527;
50 return put_user(sg_version_num, p); 50 return put_user(sg_version_num, p);
51} 51}
52 52
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 89299f4ffe12..52e1d4108a99 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -40,10 +40,11 @@ config CRYPTO_SHA1
40 help 40 help
41 SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2). 41 SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
42 42
43config CRYPTO_SHA1_Z990 43config CRYPTO_SHA1_S390
44 tristate "SHA1 digest algorithm for IBM zSeries z990" 44 tristate "SHA1 digest algorithm (s390)"
45 depends on CRYPTO && ARCH_S390 45 depends on CRYPTO && S390
46 help 46 help
47 This is the s390 hardware accelerated implementation of the
47 SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2). 48 SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2).
48 49
49config CRYPTO_SHA256 50config CRYPTO_SHA256
@@ -55,6 +56,16 @@ config CRYPTO_SHA256
55 This version of SHA implements a 256 bit hash with 128 bits of 56 This version of SHA implements a 256 bit hash with 128 bits of
56 security against collision attacks. 57 security against collision attacks.
57 58
59config CRYPTO_SHA256_S390
60 tristate "SHA256 digest algorithm (s390)"
61 depends on CRYPTO && S390
62 help
63 This is the s390 hardware accelerated implementation of the
64 SHA256 secure hash standard (DFIPS 180-2).
65
66 This version of SHA implements a 256 bit hash with 128 bits of
67 security against collision attacks.
68
58config CRYPTO_SHA512 69config CRYPTO_SHA512
59 tristate "SHA384 and SHA512 digest algorithms" 70 tristate "SHA384 and SHA512 digest algorithms"
60 depends on CRYPTO 71 depends on CRYPTO
@@ -98,9 +109,9 @@ config CRYPTO_DES
98 help 109 help
99 DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3). 110 DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3).
100 111
101config CRYPTO_DES_Z990 112config CRYPTO_DES_S390
102 tristate "DES and Triple DES cipher algorithms for IBM zSeries z990" 113 tristate "DES and Triple DES cipher algorithms (s390)"
103 depends on CRYPTO && ARCH_S390 114 depends on CRYPTO && S390
104 help 115 help
105 DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3). 116 DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3).
106 117
@@ -204,6 +215,26 @@ config CRYPTO_AES_X86_64
204 215
205 See <http://csrc.nist.gov/encryption/aes/> for more information. 216 See <http://csrc.nist.gov/encryption/aes/> for more information.
206 217
218config CRYPTO_AES_S390
219 tristate "AES cipher algorithms (s390)"
220 depends on CRYPTO && S390
221 help
222 This is the s390 hardware accelerated implementation of the
223 AES cipher algorithms (FIPS-197). AES uses the Rijndael
224 algorithm.
225
226 Rijndael appears to be consistently a very good performer in
227 both hardware and software across a wide range of computing
228 environments regardless of its use in feedback or non-feedback
229 modes. Its key setup time is excellent, and its key agility is
230 good. Rijndael's very low memory requirements make it very well
231 suited for restricted-space environments, in which it also
232 demonstrates excellent performance. Rijndael's operations are
233 among the easiest to defend against power and timing attacks.
234
235 On s390 the System z9-109 currently only supports a key size
236 of 128 bits.
237
207config CRYPTO_CAST5 238config CRYPTO_CAST5
208 tristate "CAST5 (CAST-128) cipher algorithm" 239 tristate "CAST5 (CAST-128) cipher algorithm"
209 depends on CRYPTO 240 depends on CRYPTO
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 53f4ee804bdb..49e344f00806 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -805,6 +805,8 @@ static void do_test(void)
805 //AES 805 //AES
806 test_cipher ("aes", MODE_ECB, ENCRYPT, aes_enc_tv_template, AES_ENC_TEST_VECTORS); 806 test_cipher ("aes", MODE_ECB, ENCRYPT, aes_enc_tv_template, AES_ENC_TEST_VECTORS);
807 test_cipher ("aes", MODE_ECB, DECRYPT, aes_dec_tv_template, AES_DEC_TEST_VECTORS); 807 test_cipher ("aes", MODE_ECB, DECRYPT, aes_dec_tv_template, AES_DEC_TEST_VECTORS);
808 test_cipher ("aes", MODE_CBC, ENCRYPT, aes_cbc_enc_tv_template, AES_CBC_ENC_TEST_VECTORS);
809 test_cipher ("aes", MODE_CBC, DECRYPT, aes_cbc_dec_tv_template, AES_CBC_DEC_TEST_VECTORS);
808 810
809 //CAST5 811 //CAST5
810 test_cipher ("cast5", MODE_ECB, ENCRYPT, cast5_enc_tv_template, CAST5_ENC_TEST_VECTORS); 812 test_cipher ("cast5", MODE_ECB, ENCRYPT, cast5_enc_tv_template, CAST5_ENC_TEST_VECTORS);
@@ -910,6 +912,8 @@ static void do_test(void)
910 case 10: 912 case 10:
911 test_cipher ("aes", MODE_ECB, ENCRYPT, aes_enc_tv_template, AES_ENC_TEST_VECTORS); 913 test_cipher ("aes", MODE_ECB, ENCRYPT, aes_enc_tv_template, AES_ENC_TEST_VECTORS);
912 test_cipher ("aes", MODE_ECB, DECRYPT, aes_dec_tv_template, AES_DEC_TEST_VECTORS); 914 test_cipher ("aes", MODE_ECB, DECRYPT, aes_dec_tv_template, AES_DEC_TEST_VECTORS);
915 test_cipher ("aes", MODE_CBC, ENCRYPT, aes_cbc_enc_tv_template, AES_CBC_ENC_TEST_VECTORS);
916 test_cipher ("aes", MODE_CBC, DECRYPT, aes_cbc_dec_tv_template, AES_CBC_DEC_TEST_VECTORS);
913 break; 917 break;
914 918
915 case 11: 919 case 11:
diff --git a/crypto/tcrypt.h b/crypto/tcrypt.h
index 522ffd4b6f43..733d07ed75e9 100644
--- a/crypto/tcrypt.h
+++ b/crypto/tcrypt.h
@@ -1836,6 +1836,8 @@ static struct cipher_testvec cast6_dec_tv_template[] = {
1836 */ 1836 */
1837#define AES_ENC_TEST_VECTORS 3 1837#define AES_ENC_TEST_VECTORS 3
1838#define AES_DEC_TEST_VECTORS 3 1838#define AES_DEC_TEST_VECTORS 3
1839#define AES_CBC_ENC_TEST_VECTORS 2
1840#define AES_CBC_DEC_TEST_VECTORS 2
1839 1841
1840static struct cipher_testvec aes_enc_tv_template[] = { 1842static struct cipher_testvec aes_enc_tv_template[] = {
1841 { /* From FIPS-197 */ 1843 { /* From FIPS-197 */
@@ -1911,6 +1913,68 @@ static struct cipher_testvec aes_dec_tv_template[] = {
1911 }, 1913 },
1912}; 1914};
1913 1915
1916static struct cipher_testvec aes_cbc_enc_tv_template[] = {
1917 { /* From RFC 3602 */
1918 .key = { 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b,
1919 0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06 },
1920 .klen = 16,
1921 .iv = { 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30,
1922 0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41 },
1923 .input = { "Single block msg" },
1924 .ilen = 16,
1925 .result = { 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8,
1926 0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a },
1927 .rlen = 16,
1928 }, {
1929 .key = { 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0,
1930 0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a },
1931 .klen = 16,
1932 .iv = { 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28,
1933 0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58 },
1934 .input = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1935 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1936 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1937 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
1938 .ilen = 32,
1939 .result = { 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a,
1940 0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1, 0xdc, 0x0a,
1941 0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9,
1942 0x1b, 0x82, 0x66, 0xbe, 0xa6, 0xd6, 0x1a, 0xb1 },
1943 .rlen = 32,
1944 },
1945};
1946
1947static struct cipher_testvec aes_cbc_dec_tv_template[] = {
1948 { /* From RFC 3602 */
1949 .key = { 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b,
1950 0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06 },
1951 .klen = 16,
1952 .iv = { 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30,
1953 0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41 },
1954 .input = { 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8,
1955 0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a },
1956 .ilen = 16,
1957 .result = { "Single block msg" },
1958 .rlen = 16,
1959 }, {
1960 .key = { 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0,
1961 0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a },
1962 .klen = 16,
1963 .iv = { 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28,
1964 0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58 },
1965 .input = { 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a,
1966 0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1, 0xdc, 0x0a,
1967 0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9,
1968 0x1b, 0x82, 0x66, 0xbe, 0xa6, 0xd6, 0x1a, 0xb1 },
1969 .ilen = 32,
1970 .result = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1971 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1972 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1973 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
1974 .rlen = 32,
1975 },
1976};
1977
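The vectors above are RFC 3602 test cases 1 and 4; what they exercise is the CBC chaining rule C_i = E_K(P_i xor C_{i-1}) with C_0 = IV. A toy demonstration of the chaining alone, using a stand-in "block cipher" that just XORs with the key (so the output deliberately will not match the AES vectors):

#include <stdio.h>
#include <string.h>

#define BLK 16

/* Stand-in block cipher: XOR with the key. tcrypt uses real AES here. */
static void toy_encrypt(const unsigned char *key, unsigned char *blk)
{
	int i;

	for (i = 0; i < BLK; i++)
		blk[i] ^= key[i];
}

/* CBC encrypt: each plaintext block is XORed with the previous ciphertext. */
static void cbc_encrypt(const unsigned char *key, const unsigned char *iv,
			const unsigned char *in, unsigned char *out, int len)
{
	unsigned char chain[BLK];
	int i, n;

	memcpy(chain, iv, BLK);
	for (n = 0; n < len; n += BLK) {
		for (i = 0; i < BLK; i++)
			out[n + i] = in[n + i] ^ chain[i];
		toy_encrypt(key, out + n);
		memcpy(chain, out + n, BLK);	/* ciphertext feeds forward */
	}
}

int main(void)
{
	unsigned char key[BLK] = "0123456789abcdef";
	unsigned char iv[BLK] = { 0 };
	unsigned char out[BLK];
	int i;

	cbc_encrypt(key, iv, (const unsigned char *)"Single block msg", out, BLK);
	for (i = 0; i < BLK; i++)
		printf("%02x", out[i]);
	printf("\n");
	return 0;
}
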
1914/* Cast5 test vectors from RFC 2144 */ 1978/* Cast5 test vectors from RFC 2144 */
1915#define CAST5_ENC_TEST_VECTORS 3 1979#define CAST5_ENC_TEST_VECTORS 3
1916#define CAST5_DEC_TEST_VECTORS 3 1980#define CAST5_DEC_TEST_VECTORS 3
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 7e1d077874df..58801d718cc2 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -49,12 +49,12 @@ static struct kset_uevent_ops memory_uevent_ops = {
49 49
50static struct notifier_block *memory_chain; 50static struct notifier_block *memory_chain;
51 51
52static int register_memory_notifier(struct notifier_block *nb) 52int register_memory_notifier(struct notifier_block *nb)
53{ 53{
54 return notifier_chain_register(&memory_chain, nb); 54 return notifier_chain_register(&memory_chain, nb);
55} 55}
56 56
57static void unregister_memory_notifier(struct notifier_block *nb) 57void unregister_memory_notifier(struct notifier_block *nb)
58{ 58{
59 notifier_chain_unregister(&memory_chain, nb); 59 notifier_chain_unregister(&memory_chain, nb);
60} 60}
@@ -62,8 +62,7 @@ static void unregister_memory_notifier(struct notifier_block *nb)
62/* 62/*
63 * register_memory - Setup a sysfs device for a memory block 63 * register_memory - Setup a sysfs device for a memory block
64 */ 64 */
65static int 65int register_memory(struct memory_block *memory, struct mem_section *section,
66register_memory(struct memory_block *memory, struct mem_section *section,
67 struct node *root) 66 struct node *root)
68{ 67{
69 int error; 68 int error;
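
With register_memory_notifier() and unregister_memory_notifier() no longer static, other kernel code can subscribe to memory block events. A hedged, module-style sketch of a consumer (the extern prototypes stand in for a header this hunk does not show, and the action codes are whatever the chain's sender passes):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);

static int memwatch_event(struct notifier_block *nb,
			  unsigned long action, void *data)
{
	printk(KERN_INFO "memwatch: memory event, action=%lu\n", action);
	return NOTIFY_OK;
}

static struct notifier_block memwatch_nb = {
	.notifier_call	= memwatch_event,
};

static int __init memwatch_init(void)
{
	return register_memory_notifier(&memwatch_nb);
}

static void __exit memwatch_exit(void)
{
	unregister_memory_notifier(&memwatch_nb);
}

module_init(memwatch_init);
module_exit(memwatch_exit);
MODULE_LICENSE("GPL");
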
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 70eaa5c7ac08..21097a39a057 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3471,7 +3471,7 @@ static inline boolean DAC960_ProcessCompletedRequest(DAC960_Command_T *Command,
3471 3471
3472 if (!end_that_request_first(Request, UpToDate, Command->BlockCount)) { 3472 if (!end_that_request_first(Request, UpToDate, Command->BlockCount)) {
3473 3473
3474 end_that_request_last(Request); 3474 end_that_request_last(Request, UpToDate);
3475 3475
3476 if (Command->Completion) { 3476 if (Command->Completion) {
3477 complete(Command->Completion); 3477 complete(Command->Completion);
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index c4b9d2adfc08..139cbba76180 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -117,7 +117,7 @@ config BLK_DEV_XD
117 117
118config PARIDE 118config PARIDE
119 tristate "Parallel port IDE device support" 119 tristate "Parallel port IDE device support"
120 depends on PARPORT 120 depends on PARPORT_PC
121 ---help--- 121 ---help---
122 There are many external CD-ROM and disk devices that connect through 122 There are many external CD-ROM and disk devices that connect through
123 your computer's parallel port. Most of them are actually IDE devices 123 your computer's parallel port. Most of them are actually IDE devices
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index c3441b3f086e..d2815b7a9150 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2310,7 +2310,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
2310 printk("Done with %p\n", cmd->rq); 2310 printk("Done with %p\n", cmd->rq);
2311#endif /* CCISS_DEBUG */ 2311#endif /* CCISS_DEBUG */
2312 2312
2313 end_that_request_last(cmd->rq); 2313 end_that_request_last(cmd->rq, status ? 1 : -EIO);
2314 cmd_free(h,cmd,1); 2314 cmd_free(h,cmd,1);
2315} 2315}
2316 2316
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index cf1822a6361c..9bddb6874873 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -1036,7 +1036,7 @@ static inline void complete_command(cmdlist_t *cmd, int timeout)
1036 complete_buffers(cmd->rq->bio, ok); 1036 complete_buffers(cmd->rq->bio, ok);
1037 1037
1038 DBGPX(printk("Done with %p\n", cmd->rq);); 1038 DBGPX(printk("Done with %p\n", cmd->rq););
1039 end_that_request_last(cmd->rq); 1039 end_that_request_last(cmd->rq, ok ? 1 : -EIO);
1040} 1040}
1041 1041
1042/* 1042/*
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index f7e765a1d313..a5b857c5c4b8 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2301,7 +2301,7 @@ static void floppy_end_request(struct request *req, int uptodate)
2301 add_disk_randomness(req->rq_disk); 2301 add_disk_randomness(req->rq_disk);
2302 floppy_off((long)req->rq_disk->private_data); 2302 floppy_off((long)req->rq_disk->private_data);
2303 blkdev_dequeue_request(req); 2303 blkdev_dequeue_request(req);
2304 end_that_request_last(req); 2304 end_that_request_last(req, uptodate);
2305 2305
2306 /* We're done with the request */ 2306 /* We're done with the request */
2307 current_req = NULL; 2307 current_req = NULL;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 96c664af8d06..a452b13620a2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
213 struct address_space_operations *aops = mapping->a_ops; 213 struct address_space_operations *aops = mapping->a_ops;
214 pgoff_t index; 214 pgoff_t index;
215 unsigned offset, bv_offs; 215 unsigned offset, bv_offs;
216 int len, ret = 0; 216 int len, ret;
217 217
218 down(&mapping->host->i_sem); 218 down(&mapping->host->i_sem);
219 index = pos >> PAGE_CACHE_SHIFT; 219 index = pos >> PAGE_CACHE_SHIFT;
@@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
232 page = grab_cache_page(mapping, index); 232 page = grab_cache_page(mapping, index);
233 if (unlikely(!page)) 233 if (unlikely(!page))
234 goto fail; 234 goto fail;
235 if (unlikely(aops->prepare_write(file, page, offset, 235 ret = aops->prepare_write(file, page, offset,
236 offset + size))) 236 offset + size);
237 if (unlikely(ret)) {
238 if (ret == AOP_TRUNCATED_PAGE) {
239 page_cache_release(page);
240 continue;
241 }
237 goto unlock; 242 goto unlock;
243 }
238 transfer_result = lo_do_transfer(lo, WRITE, page, offset, 244 transfer_result = lo_do_transfer(lo, WRITE, page, offset,
239 bvec->bv_page, bv_offs, size, IV); 245 bvec->bv_page, bv_offs, size, IV);
240 if (unlikely(transfer_result)) { 246 if (unlikely(transfer_result)) {
@@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
251 kunmap_atomic(kaddr, KM_USER0); 257 kunmap_atomic(kaddr, KM_USER0);
252 } 258 }
253 flush_dcache_page(page); 259 flush_dcache_page(page);
254 if (unlikely(aops->commit_write(file, page, offset, 260 ret = aops->commit_write(file, page, offset,
255 offset + size))) 261 offset + size);
262 if (unlikely(ret)) {
263 if (ret == AOP_TRUNCATED_PAGE) {
264 page_cache_release(page);
265 continue;
266 }
256 goto unlock; 267 goto unlock;
268 }
257 if (unlikely(transfer_result)) 269 if (unlikely(transfer_result))
258 goto unlock; 270 goto unlock;
259 bv_offs += size; 271 bv_offs += size;
@@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
264 unlock_page(page); 276 unlock_page(page);
265 page_cache_release(page); 277 page_cache_release(page);
266 } 278 }
279 ret = 0;
267out: 280out:
268 up(&mapping->host->i_sem); 281 up(&mapping->host->i_sem);
269 return ret; 282 return ret;
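
Both hunks follow the same new contract: AOP_TRUNCATED_PAGE means the aop has already unlocked the page, so the caller must drop its page reference and retry the whole grab/prepare (or commit) step rather than fail the write. The shape of that retry loop as a stand-alone model (the constant's value and the stub are invented for the demo):

#include <stdio.h>

#define AOP_TRUNCATED_PAGE 1	/* stand-in; the real value lives in fs.h */

static int attempts;

/* Fails with AOP_TRUNCATED_PAGE once, then succeeds. */
static int prepare_write_stub(void)
{
	return attempts++ == 0 ? AOP_TRUNCATED_PAGE : 0;
}

int main(void)
{
	int ret;

	for (;;) {
		/* grab_cache_page() would pin the page here */
		ret = prepare_write_stub();
		if (ret == AOP_TRUNCATED_PAGE) {
			/* page_cache_release(); the page went away, retry */
			printf("page truncated under us, retrying\n");
			continue;
		}
		if (ret)
			return ret;	/* hard error: give up */
		break;			/* prepared OK, continue the copy */
	}
	printf("succeeded after %d attempt(s)\n", attempts);
	return 0;
}
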
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9e268ddedfbd..33d6f237b2ed 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -54,11 +54,15 @@
54#include <linux/errno.h> 54#include <linux/errno.h>
55#include <linux/file.h> 55#include <linux/file.h>
56#include <linux/ioctl.h> 56#include <linux/ioctl.h>
57#include <linux/compiler.h>
58#include <linux/err.h>
59#include <linux/kernel.h>
57#include <net/sock.h> 60#include <net/sock.h>
58 61
59#include <linux/devfs_fs_kernel.h> 62#include <linux/devfs_fs_kernel.h>
60 63
61#include <asm/uaccess.h> 64#include <asm/uaccess.h>
65#include <asm/system.h>
62#include <asm/types.h> 66#include <asm/types.h>
63 67
64#include <linux/nbd.h> 68#include <linux/nbd.h>
@@ -136,7 +140,7 @@ static void nbd_end_request(struct request *req)
136 140
137 spin_lock_irqsave(q->queue_lock, flags); 141 spin_lock_irqsave(q->queue_lock, flags);
138 if (!end_that_request_first(req, uptodate, req->nr_sectors)) { 142 if (!end_that_request_first(req, uptodate, req->nr_sectors)) {
139 end_that_request_last(req); 143 end_that_request_last(req, uptodate);
140 } 144 }
141 spin_unlock_irqrestore(q->queue_lock, flags); 145 spin_unlock_irqrestore(q->queue_lock, flags);
142} 146}
@@ -230,14 +234,6 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
230 request.len = htonl(size); 234 request.len = htonl(size);
231 memcpy(request.handle, &req, sizeof(req)); 235 memcpy(request.handle, &req, sizeof(req));
232 236
233 down(&lo->tx_lock);
234
235 if (!sock || !lo->sock) {
236 printk(KERN_ERR "%s: Attempted send on closed socket\n",
237 lo->disk->disk_name);
238 goto error_out;
239 }
240
241 dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n", 237 dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n",
242 lo->disk->disk_name, req, 238 lo->disk->disk_name, req,
243 nbdcmd_to_ascii(nbd_cmd(req)), 239 nbdcmd_to_ascii(nbd_cmd(req)),
@@ -276,11 +272,9 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
276 } 272 }
277 } 273 }
278 } 274 }
279 up(&lo->tx_lock);
280 return 0; 275 return 0;
281 276
282error_out: 277error_out:
283 up(&lo->tx_lock);
284 return 1; 278 return 1;
285} 279}
286 280
@@ -289,9 +283,14 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
289 struct request *req; 283 struct request *req;
290 struct list_head *tmp; 284 struct list_head *tmp;
291 struct request *xreq; 285 struct request *xreq;
286 int err;
292 287
293 memcpy(&xreq, handle, sizeof(xreq)); 288 memcpy(&xreq, handle, sizeof(xreq));
294 289
290 err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
291 if (unlikely(err))
292 goto out;
293
295 spin_lock(&lo->queue_lock); 294 spin_lock(&lo->queue_lock);
296 list_for_each(tmp, &lo->queue_head) { 295 list_for_each(tmp, &lo->queue_head) {
297 req = list_entry(tmp, struct request, queuelist); 296 req = list_entry(tmp, struct request, queuelist);
@@ -302,7 +301,11 @@ static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
302 return req; 301 return req;
303 } 302 }
304 spin_unlock(&lo->queue_lock); 303 spin_unlock(&lo->queue_lock);
305 return NULL; 304
305 err = -ENOENT;
306
307out:
308 return ERR_PTR(err);
306} 309}
307 310
308static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec) 311static inline int sock_recv_bvec(struct socket *sock, struct bio_vec *bvec)
@@ -331,7 +334,11 @@ static struct request *nbd_read_stat(struct nbd_device *lo)
331 goto harderror; 334 goto harderror;
332 } 335 }
333 req = nbd_find_request(lo, reply.handle); 336 req = nbd_find_request(lo, reply.handle);
334 if (req == NULL) { 337 if (unlikely(IS_ERR(req))) {
338 result = PTR_ERR(req);
339 if (result != -ENOENT)
340 goto harderror;
341
335 printk(KERN_ERR "%s: Unexpected reply (%p)\n", 342 printk(KERN_ERR "%s: Unexpected reply (%p)\n",
336 lo->disk->disk_name, reply.handle); 343 lo->disk->disk_name, reply.handle);
337 result = -EBADR; 344 result = -EBADR;
@@ -395,19 +402,24 @@ static void nbd_clear_que(struct nbd_device *lo)
395 402
396 BUG_ON(lo->magic != LO_MAGIC); 403 BUG_ON(lo->magic != LO_MAGIC);
397 404
398 do { 405 /*
399 req = NULL; 406 * Because we have set lo->sock to NULL under the tx_lock, all
400 spin_lock(&lo->queue_lock); 407 * modifications to the list must have completed by now. For
401 if (!list_empty(&lo->queue_head)) { 408 * the same reason, the active_req must be NULL.
402 req = list_entry(lo->queue_head.next, struct request, queuelist); 409 *
403 list_del_init(&req->queuelist); 410 * As a consequence, we don't need to take the spin lock while
404 } 411 * purging the list here.
405 spin_unlock(&lo->queue_lock); 412 */
406 if (req) { 413 BUG_ON(lo->sock);
407 req->errors++; 414 BUG_ON(lo->active_req);
408 nbd_end_request(req); 415
409 } 416 while (!list_empty(&lo->queue_head)) {
410 } while (req); 417 req = list_entry(lo->queue_head.next, struct request,
418 queuelist);
419 list_del_init(&req->queuelist);
420 req->errors++;
421 nbd_end_request(req);
422 }
411} 423}
412 424
413/* 425/*
@@ -435,11 +447,6 @@ static void do_nbd_request(request_queue_t * q)
435 447
436 BUG_ON(lo->magic != LO_MAGIC); 448 BUG_ON(lo->magic != LO_MAGIC);
437 449
438 if (!lo->file) {
439 printk(KERN_ERR "%s: Request when not-ready\n",
440 lo->disk->disk_name);
441 goto error_out;
442 }
443 nbd_cmd(req) = NBD_CMD_READ; 450 nbd_cmd(req) = NBD_CMD_READ;
444 if (rq_data_dir(req) == WRITE) { 451 if (rq_data_dir(req) == WRITE) {
445 nbd_cmd(req) = NBD_CMD_WRITE; 452 nbd_cmd(req) = NBD_CMD_WRITE;
@@ -453,32 +460,34 @@ static void do_nbd_request(request_queue_t * q)
453 req->errors = 0; 460 req->errors = 0;
454 spin_unlock_irq(q->queue_lock); 461 spin_unlock_irq(q->queue_lock);
455 462
456 spin_lock(&lo->queue_lock); 463 down(&lo->tx_lock);
457 464 if (unlikely(!lo->sock)) {
458 if (!lo->file) { 465 up(&lo->tx_lock);
459 spin_unlock(&lo->queue_lock); 466 printk(KERN_ERR "%s: Attempted send on closed socket\n",
460 printk(KERN_ERR "%s: failed between accept and semaphore, file lost\n", 467 lo->disk->disk_name);
461 lo->disk->disk_name);
462 req->errors++; 468 req->errors++;
463 nbd_end_request(req); 469 nbd_end_request(req);
464 spin_lock_irq(q->queue_lock); 470 spin_lock_irq(q->queue_lock);
465 continue; 471 continue;
466 } 472 }
467 473
468 list_add(&req->queuelist, &lo->queue_head); 474 lo->active_req = req;
469 spin_unlock(&lo->queue_lock);
470 475
471 if (nbd_send_req(lo, req) != 0) { 476 if (nbd_send_req(lo, req) != 0) {
472 printk(KERN_ERR "%s: Request send failed\n", 477 printk(KERN_ERR "%s: Request send failed\n",
473 lo->disk->disk_name); 478 lo->disk->disk_name);
474 if (nbd_find_request(lo, (char *)&req) != NULL) { 479 req->errors++;
475 /* we still own req */ 480 nbd_end_request(req);
476 req->errors++; 481 } else {
477 nbd_end_request(req); 482 spin_lock(&lo->queue_lock);
478 } else /* we're racing with nbd_clear_que */ 483 list_add(&req->queuelist, &lo->queue_head);
479 printk(KERN_DEBUG "nbd: can't find req\n"); 484 spin_unlock(&lo->queue_lock);
480 } 485 }
481 486
487 lo->active_req = NULL;
488 up(&lo->tx_lock);
489 wake_up_all(&lo->active_wq);
490
482 spin_lock_irq(q->queue_lock); 491 spin_lock_irq(q->queue_lock);
483 continue; 492 continue;
484 493
@@ -529,17 +538,10 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
529 down(&lo->tx_lock); 538 down(&lo->tx_lock);
530 lo->sock = NULL; 539 lo->sock = NULL;
531 up(&lo->tx_lock); 540 up(&lo->tx_lock);
532 spin_lock(&lo->queue_lock);
533 file = lo->file; 541 file = lo->file;
534 lo->file = NULL; 542 lo->file = NULL;
535 spin_unlock(&lo->queue_lock);
536 nbd_clear_que(lo); 543 nbd_clear_que(lo);
537 spin_lock(&lo->queue_lock); 544 BUG_ON(!list_empty(&lo->queue_head));
538 if (!list_empty(&lo->queue_head)) {
539 printk(KERN_ERR "nbd: disconnect: some requests are in progress -> please try again.\n");
540 error = -EBUSY;
541 }
542 spin_unlock(&lo->queue_lock);
543 if (file) 545 if (file)
544 fput(file); 546 fput(file);
545 return error; 547 return error;
@@ -598,24 +600,19 @@ static int nbd_ioctl(struct inode *inode, struct file *file,
598 lo->sock = NULL; 600 lo->sock = NULL;
599 } 601 }
600 up(&lo->tx_lock); 602 up(&lo->tx_lock);
601 spin_lock(&lo->queue_lock);
602 file = lo->file; 603 file = lo->file;
603 lo->file = NULL; 604 lo->file = NULL;
604 spin_unlock(&lo->queue_lock);
605 nbd_clear_que(lo); 605 nbd_clear_que(lo);
606 printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name); 606 printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name);
607 if (file) 607 if (file)
608 fput(file); 608 fput(file);
609 return lo->harderror; 609 return lo->harderror;
610 case NBD_CLEAR_QUE: 610 case NBD_CLEAR_QUE:
611 down(&lo->tx_lock); 611 /*
612 if (lo->sock) { 612 * This is for compatibility only. The queue is always cleared
613 up(&lo->tx_lock); 613 * by NBD_DO_IT or NBD_CLEAR_SOCK.
614 return 0; /* probably should be error, but that would 614 */
615 * break "nbd-client -d", so just return 0 */ 615 BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
616 }
617 up(&lo->tx_lock);
618 nbd_clear_que(lo);
619 return 0; 616 return 0;
620 case NBD_PRINT_DEBUG: 617 case NBD_PRINT_DEBUG:
621 printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n", 618 printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n",
@@ -688,6 +685,7 @@ static int __init nbd_init(void)
688 spin_lock_init(&nbd_dev[i].queue_lock); 685 spin_lock_init(&nbd_dev[i].queue_lock);
689 INIT_LIST_HEAD(&nbd_dev[i].queue_head); 686 INIT_LIST_HEAD(&nbd_dev[i].queue_head);
690 init_MUTEX(&nbd_dev[i].tx_lock); 687 init_MUTEX(&nbd_dev[i].tx_lock);
688 init_waitqueue_head(&nbd_dev[i].active_wq);
691 nbd_dev[i].blksize = 1024; 689 nbd_dev[i].blksize = 1024;
692 nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */ 690 nbd_dev[i].bytesize = 0x7ffffc00ULL << 10; /* 2TB */
693 disk->major = NBD_MAJOR; 691 disk->major = NBD_MAJOR;
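
The locking rework above boils down to one handshake: the sender publishes the request in lo->active_req for as long as its bytes are on the wire (all under tx_lock), and the reply path sleeps on active_wq until the request it is looking for is no longer active. A pthreads model of that handshake (the condition variable stands in for the kernel waitqueue, and its mutex doubles as tx_lock, which is a simplification):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t tx_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t active_wq = PTHREAD_COND_INITIALIZER;
static void *active_req;	/* request whose bytes are on the wire */

static void *sender(void *req)
{
	pthread_mutex_lock(&tx_lock);
	active_req = req;	/* publish: this request is in flight */
	usleep(1000);		/* pretend to transmit it */
	active_req = NULL;	/* done sending */
	pthread_cond_broadcast(&active_wq);	/* wake_up_all(&lo->active_wq) */
	pthread_mutex_unlock(&tx_lock);
	return NULL;
}

/* Reply path: never touch a request that is still being transmitted. */
static void wait_not_active(void *xreq)
{
	pthread_mutex_lock(&tx_lock);
	while (active_req == xreq)	/* wait_event(lo->active_wq, ...) */
		pthread_cond_wait(&active_wq, &tx_lock);
	pthread_mutex_unlock(&tx_lock);
}

int main(void)
{
	pthread_t tx;
	int req;

	pthread_create(&tx, NULL, sender, &req);
	usleep(100);		/* give the sender a chance to publish */
	wait_not_active(&req);
	printf("reply path proceeds: request no longer in flight\n");
	pthread_join(tx, NULL);
	return 0;
}
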
diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig
index 17ff40561257..c0d2854dd097 100644
--- a/drivers/block/paride/Kconfig
+++ b/drivers/block/paride/Kconfig
@@ -4,11 +4,12 @@
4# PARIDE doesn't need PARPORT, but if PARPORT is configured as a module, 4# PARIDE doesn't need PARPORT, but if PARPORT is configured as a module,
5# PARIDE must also be a module. The bogus CONFIG_PARIDE_PARPORT option 5# PARIDE must also be a module. The bogus CONFIG_PARIDE_PARPORT option
6# controls the choices given to the user ... 6# controls the choices given to the user ...
7# PARIDE only supports PC style parports. Tough for USB or other parports...
7config PARIDE_PARPORT 8config PARIDE_PARPORT
8 tristate 9 tristate
9 depends on PARIDE!=n 10 depends on PARIDE!=n
10 default m if PARPORT=m 11 default m if PARPORT_PC=m
11 default y if PARPORT!=m 12 default y if PARPORT_PC!=m
12 13
13comment "Parallel IDE high-level drivers" 14comment "Parallel IDE high-level drivers"
14 depends on PARIDE 15 depends on PARIDE
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 68c60a5bcdab..ffd6abd6d5a0 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
154 154
155/* 155/*
156 * ->writepage to the the blockdev's mapping has to redirty the page so that the 156 * ->writepage to the the blockdev's mapping has to redirty the page so that the
157 * VM doesn't go and steal it. We return WRITEPAGE_ACTIVATE so that the VM 157 * VM doesn't go and steal it. We return AOP_WRITEPAGE_ACTIVATE so that the VM
158 * won't try to (pointlessly) write the page again for a while. 158 * won't try to (pointlessly) write the page again for a while.
159 * 159 *
160 * Really, these pages should not be on the LRU at all. 160 * Really, these pages should not be on the LRU at all.
@@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
165 make_page_uptodate(page); 165 make_page_uptodate(page);
166 SetPageDirty(page); 166 SetPageDirty(page);
167 if (wbc->for_reclaim) 167 if (wbc->for_reclaim)
168 return WRITEPAGE_ACTIVATE; 168 return AOP_WRITEPAGE_ACTIVATE;
169 unlock_page(page); 169 unlock_page(page);
170 return 0; 170 return 0;
171} 171}
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 1ded3b433459..9251f4131b53 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -770,7 +770,7 @@ static inline void carm_end_request_queued(struct carm_host *host,
770 rc = end_that_request_first(req, uptodate, req->hard_nr_sectors); 770 rc = end_that_request_first(req, uptodate, req->hard_nr_sectors);
771 assert(rc == 0); 771 assert(rc == 0);
772 772
773 end_that_request_last(req); 773 end_that_request_last(req, uptodate);
774 774
775 rc = carm_put_request(host, crq); 775 rc = carm_put_request(host, crq);
776 assert(rc == 0); 776 assert(rc == 0);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 10740a065088..a05fe5843e6c 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -951,7 +951,7 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
951static void ub_end_rq(struct request *rq, int uptodate) 951static void ub_end_rq(struct request *rq, int uptodate)
952{ 952{
953 end_that_request_first(rq, uptodate, rq->hard_nr_sectors); 953 end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
954 end_that_request_last(rq); 954 end_that_request_last(rq, uptodate);
955} 955}
956 956
957static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun, 957static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 2d518aa2720a..063f0304a163 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -305,7 +305,7 @@ static void viodasd_end_request(struct request *req, int uptodate,
305 if (end_that_request_first(req, uptodate, num_sectors)) 305 if (end_that_request_first(req, uptodate, num_sectors))
306 return; 306 return;
307 add_disk_randomness(req->rq_disk); 307 add_disk_randomness(req->rq_disk);
308 end_that_request_last(req); 308 end_that_request_last(req, uptodate);
309} 309}
310 310
311/* 311/*
diff --git a/drivers/cdrom/cdu31a.c b/drivers/cdrom/cdu31a.c
index ac96de15d833..378e88d20757 100644
--- a/drivers/cdrom/cdu31a.c
+++ b/drivers/cdrom/cdu31a.c
@@ -1402,7 +1402,7 @@ static void do_cdu31a_request(request_queue_t * q)
1402 if (!end_that_request_first(req, 1, nblock)) { 1402 if (!end_that_request_first(req, 1, nblock)) {
1403 spin_lock_irq(q->queue_lock); 1403 spin_lock_irq(q->queue_lock);
1404 blkdev_dequeue_request(req); 1404 blkdev_dequeue_request(req);
1405 end_that_request_last(req); 1405 end_that_request_last(req, 1);
1406 spin_unlock_irq(q->queue_lock); 1406 spin_unlock_irq(q->queue_lock);
1407 } 1407 }
1408 continue; 1408 continue;
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 84e68cdd451b..5ebd06b1b4ca 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -985,7 +985,7 @@ config HPET_MMAP
985 985
986config HANGCHECK_TIMER 986config HANGCHECK_TIMER
987 tristate "Hangcheck timer" 987 tristate "Hangcheck timer"
988 depends on X86 || IA64 || PPC64 || ARCH_S390 988 depends on X86 || IA64 || PPC64 || S390
989 help 989 help
990 The hangcheck-timer module detects when the system has gone 990 The hangcheck-timer module detects when the system has gone
991 out to lunch past a certain margin. It can reboot the system 991 out to lunch past a certain margin. It can reboot the system
diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 66e53dd450ff..40a67c86420c 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -120,7 +120,7 @@ __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
120#if defined(CONFIG_X86) 120#if defined(CONFIG_X86)
121# define HAVE_MONOTONIC 121# define HAVE_MONOTONIC
122# define TIMER_FREQ 1000000000ULL 122# define TIMER_FREQ 1000000000ULL
123#elif defined(CONFIG_ARCH_S390) 123#elif defined(CONFIG_S390)
124/* FA240000 is 1 Second in the IBM time universe (Page 4-38 Principles of Op for zSeries) */ 124/* FA240000 is 1 Second in the IBM time universe (Page 4-38 Principles of Op for zSeries) */
125# define TIMER_FREQ 0xFA240000ULL 125# define TIMER_FREQ 0xFA240000ULL
126#elif defined(CONFIG_IA64) 126#elif defined(CONFIG_IA64)
diff --git a/drivers/char/hw_random.c b/drivers/char/hw_random.c
index 6f673d2de0b1..49769f59ea1b 100644
--- a/drivers/char/hw_random.c
+++ b/drivers/char/hw_random.c
@@ -1,4 +1,9 @@
1/* 1/*
2 Added support for the AMD Geode LX RNG
3 (c) Copyright 2004-2005 Advanced Micro Devices, Inc.
4
5 derived from
6
2 Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG) 7 Hardware driver for the Intel/AMD/VIA Random Number Generators (RNG)
3 (c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com> 8 (c) Copyright 2003 Red Hat Inc <jgarzik@redhat.com>
4 9
@@ -95,6 +100,11 @@ static unsigned int via_data_present (void);
95static u32 via_data_read (void); 100static u32 via_data_read (void);
96#endif 101#endif
97 102
103static int __init geode_init(struct pci_dev *dev);
104static void geode_cleanup(void);
105static unsigned int geode_data_present (void);
106static u32 geode_data_read (void);
107
98struct rng_operations { 108struct rng_operations {
99 int (*init) (struct pci_dev *dev); 109 int (*init) (struct pci_dev *dev);
100 void (*cleanup) (void); 110 void (*cleanup) (void);
@@ -122,6 +132,7 @@ enum {
122 rng_hw_intel, 132 rng_hw_intel,
123 rng_hw_amd, 133 rng_hw_amd,
124 rng_hw_via, 134 rng_hw_via,
135 rng_hw_geode,
125}; 136};
126 137
127static struct rng_operations rng_vendor_ops[] = { 138static struct rng_operations rng_vendor_ops[] = {
@@ -139,6 +150,9 @@ static struct rng_operations rng_vendor_ops[] = {
139 /* rng_hw_via */ 150 /* rng_hw_via */
140 { via_init, via_cleanup, via_data_present, via_data_read, 1 }, 151 { via_init, via_cleanup, via_data_present, via_data_read, 1 },
141#endif 152#endif
153
154 /* rng_hw_geode */
155 { geode_init, geode_cleanup, geode_data_present, geode_data_read, 4 }
142}; 156};
143 157
144/* 158/*
@@ -159,6 +173,9 @@ static struct pci_device_id rng_pci_tbl[] = {
159 { 0x8086, 0x244e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, 173 { 0x8086, 0x244e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel },
160 { 0x8086, 0x245e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel }, 174 { 0x8086, 0x245e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_intel },
161 175
176 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_LX_AES,
177 PCI_ANY_ID, PCI_ANY_ID, 0, 0, rng_hw_geode },
178
162 { 0, }, /* terminate list */ 179 { 0, }, /* terminate list */
163}; 180};
164MODULE_DEVICE_TABLE (pci, rng_pci_tbl); 181MODULE_DEVICE_TABLE (pci, rng_pci_tbl);
@@ -460,6 +477,57 @@ static void via_cleanup(void)
460} 477}
461#endif 478#endif
462 479
480/***********************************************************************
481 *
482 * AMD Geode RNG operations
483 *
484 */
485
486static void __iomem *geode_rng_base = NULL;
487
488#define GEODE_RNG_DATA_REG 0x50
489#define GEODE_RNG_STATUS_REG 0x54
490
491static u32 geode_data_read(void)
492{
493 u32 val;
494
495 assert(geode_rng_base != NULL);
496 val = readl(geode_rng_base + GEODE_RNG_DATA_REG);
497 return val;
498}
499
500static unsigned int geode_data_present(void)
501{
502 u32 val;
503
504 assert(geode_rng_base != NULL);
505 val = readl(geode_rng_base + GEODE_RNG_STATUS_REG);
506 return val;
507}
508
509static void geode_cleanup(void)
510{
511 iounmap(geode_rng_base);
512 geode_rng_base = NULL;
513}
514
515static int geode_init(struct pci_dev *dev)
516{
517 unsigned long rng_base = pci_resource_start(dev, 0);
518
519 if (rng_base == 0)
520 return 1;
521
522 geode_rng_base = ioremap(rng_base, 0x58);
523
524 if (geode_rng_base == NULL) {
525 printk(KERN_ERR PFX "Cannot ioremap RNG memory\n");
526 return -EBUSY;
527 }
528
529 return 0;
530}
463 531
464/*********************************************************************** 532/***********************************************************************
465 * 533 *
@@ -574,7 +642,7 @@ static int __init rng_init (void)
574 642
575 DPRINTK ("ENTER\n"); 643 DPRINTK ("ENTER\n");
576 644
577 /* Probe for Intel, AMD RNGs */ 645 /* Probe for Intel, AMD, Geode RNGs */
578 for_each_pci_dev(pdev) { 646 for_each_pci_dev(pdev) {
579 ent = pci_match_id(rng_pci_tbl, pdev); 647 ent = pci_match_id(rng_pci_tbl, pdev);
580 if (ent) { 648 if (ent) {
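
The Geode hookup is purely table-driven: the PCI entry's driver_data is an index (rng_hw_geode) into rng_vendor_ops[], and after probing the core only ever calls through the four ops. That dispatch pattern, reduced to a stand-alone sketch with generic names and no PCI probing:

#include <stdio.h>

struct rng_ops_model {
	int (*init)(void);
	unsigned int (*data_present)(void);
	unsigned int (*data_read)(void);
	unsigned int n_bytes;	/* bytes produced per data_read() */
};

static int geode_like_init(void) { return 0; }
static unsigned int geode_like_present(void) { return 1; }
static unsigned int geode_like_read(void) { return 0xdeadbeef; }

enum { rng_hw_none, rng_hw_geode_like };

static struct rng_ops_model vendor_ops[] = {
	[rng_hw_geode_like] = {
		geode_like_init, geode_like_present, geode_like_read, 4,
	},
};

int main(void)
{
	/* driver_data from the id table selects the ops slot */
	struct rng_ops_model *ops = &vendor_ops[rng_hw_geode_like];

	if (ops->init() == 0 && ops->data_present())
		printf("read %u bytes: %08x\n", ops->n_bytes, ops->data_read());
	return 0;
}
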
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 1f56b4cf0f58..561430ed94af 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -787,7 +787,6 @@ int ipmi_destroy_user(ipmi_user_t user)
787 int i; 787 int i;
788 unsigned long flags; 788 unsigned long flags;
789 struct cmd_rcvr *rcvr; 789 struct cmd_rcvr *rcvr;
790 struct list_head *entry1, *entry2;
791 struct cmd_rcvr *rcvrs = NULL; 790 struct cmd_rcvr *rcvrs = NULL;
792 791
793 user->valid = 1; 792 user->valid = 1;
@@ -812,8 +811,7 @@ int ipmi_destroy_user(ipmi_user_t user)
812 * synchronize_rcu()) then free everything in that list. 811 * synchronize_rcu()) then free everything in that list.
813 */ 812 */
814 down(&intf->cmd_rcvrs_lock); 813 down(&intf->cmd_rcvrs_lock);
815 list_for_each_safe_rcu(entry1, entry2, &intf->cmd_rcvrs) { 814 list_for_each_entry_rcu(rcvr, &intf->cmd_rcvrs, link) {
816 rcvr = list_entry(entry1, struct cmd_rcvr, link);
817 if (rcvr->user == user) { 815 if (rcvr->user == user) {
818 list_del_rcu(&rcvr->link); 816 list_del_rcu(&rcvr->link);
819 rcvr->next = rcvrs; 817 rcvr->next = rcvrs;
diff --git a/drivers/char/watchdog/Kconfig b/drivers/char/watchdog/Kconfig
index 344001b45af9..a6544790af60 100644
--- a/drivers/char/watchdog/Kconfig
+++ b/drivers/char/watchdog/Kconfig
@@ -438,7 +438,7 @@ config INDYDOG
438 438
439config ZVM_WATCHDOG 439config ZVM_WATCHDOG
440 tristate "z/VM Watchdog Timer" 440 tristate "z/VM Watchdog Timer"
441 depends on WATCHDOG && ARCH_S390 441 depends on WATCHDOG && S390
442 help 442 help
443 IBM s/390 and zSeries machines running under z/VM 5.1 or later 443 IBM s/390 and zSeries machines running under z/VM 5.1 or later
444 provide a virtual watchdog timer to their guest that causes a 444 provide a virtual watchdog timer to their guest that causes a
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 70aeb3a60120..d31117eb95aa 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -614,7 +614,7 @@ static void cdrom_end_request (ide_drive_t *drive, int uptodate)
614 */ 614 */
615 spin_lock_irqsave(&ide_lock, flags); 615 spin_lock_irqsave(&ide_lock, flags);
616 end_that_request_chunk(failed, 0, failed->data_len); 616 end_that_request_chunk(failed, 0, failed->data_len);
617 end_that_request_last(failed); 617 end_that_request_last(failed, 0);
618 spin_unlock_irqrestore(&ide_lock, flags); 618 spin_unlock_irqrestore(&ide_lock, flags);
619 } 619 }
620 620
@@ -1735,7 +1735,7 @@ end_request:
1735 1735
1736 spin_lock_irqsave(&ide_lock, flags); 1736 spin_lock_irqsave(&ide_lock, flags);
1737 blkdev_dequeue_request(rq); 1737 blkdev_dequeue_request(rq);
1738 end_that_request_last(rq); 1738 end_that_request_last(rq, 1);
1739 HWGROUP(drive)->rq = NULL; 1739 HWGROUP(drive)->rq = NULL;
1740 spin_unlock_irqrestore(&ide_lock, flags); 1740 spin_unlock_irqrestore(&ide_lock, flags);
1741 return ide_stopped; 1741 return ide_stopped;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 4e5767968d7f..4b441720b6ba 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -681,50 +681,9 @@ static ide_proc_entry_t idedisk_proc[] = {
681 681
682#endif /* CONFIG_PROC_FS */ 682#endif /* CONFIG_PROC_FS */
683 683
684static void idedisk_end_flush(request_queue_t *q, struct request *flush_rq) 684static void idedisk_prepare_flush(request_queue_t *q, struct request *rq)
685{ 685{
686 ide_drive_t *drive = q->queuedata; 686 ide_drive_t *drive = q->queuedata;
687 struct request *rq = flush_rq->end_io_data;
688 int good_sectors = rq->hard_nr_sectors;
689 int bad_sectors;
690 sector_t sector;
691
692 if (flush_rq->errors & ABRT_ERR) {
693 printk(KERN_ERR "%s: barrier support doesn't work\n", drive->name);
694 blk_queue_ordered(drive->queue, QUEUE_ORDERED_NONE);
695 blk_queue_issue_flush_fn(drive->queue, NULL);
696 good_sectors = 0;
697 } else if (flush_rq->errors) {
698 good_sectors = 0;
699 if (blk_barrier_preflush(rq)) {
700 sector = ide_get_error_location(drive,flush_rq->buffer);
701 if ((sector >= rq->hard_sector) &&
702 (sector < rq->hard_sector + rq->hard_nr_sectors))
703 good_sectors = sector - rq->hard_sector;
704 }
705 }
706
707 if (flush_rq->errors)
708 printk(KERN_ERR "%s: failed barrier write: "
709 "sector=%Lx(good=%d/bad=%d)\n",
710 drive->name, (unsigned long long)rq->sector,
711 good_sectors,
712 (int) (rq->hard_nr_sectors-good_sectors));
713
714 bad_sectors = rq->hard_nr_sectors - good_sectors;
715
716 if (good_sectors)
717 __ide_end_request(drive, rq, 1, good_sectors);
718 if (bad_sectors)
719 __ide_end_request(drive, rq, 0, bad_sectors);
720}
721
722static int idedisk_prepare_flush(request_queue_t *q, struct request *rq)
723{
724 ide_drive_t *drive = q->queuedata;
725
726 if (!drive->wcache)
727 return 0;
728 687
729 memset(rq->cmd, 0, sizeof(rq->cmd)); 688 memset(rq->cmd, 0, sizeof(rq->cmd));
730 689
@@ -735,9 +694,8 @@ static int idedisk_prepare_flush(request_queue_t *q, struct request *rq)
735 rq->cmd[0] = WIN_FLUSH_CACHE; 694 rq->cmd[0] = WIN_FLUSH_CACHE;
736 695
737 696
738 rq->flags |= REQ_DRIVE_TASK | REQ_SOFTBARRIER; 697 rq->flags |= REQ_DRIVE_TASK;
739 rq->buffer = rq->cmd; 698 rq->buffer = rq->cmd;
740 return 1;
741} 699}
742 700
743static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk, 701static int idedisk_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -794,27 +752,64 @@ static int set_nowerr(ide_drive_t *drive, int arg)
794 return 0; 752 return 0;
795} 753}
796 754
755static void update_ordered(ide_drive_t *drive)
756{
757 struct hd_driveid *id = drive->id;
758 unsigned ordered = QUEUE_ORDERED_NONE;
759 prepare_flush_fn *prep_fn = NULL;
760 issue_flush_fn *issue_fn = NULL;
761
762 if (drive->wcache) {
763 unsigned long long capacity;
764 int barrier;
765 /*
766 * We must avoid issuing commands a drive does not
767 * understand or we may crash it. We check flush cache
768 * is supported. We also check we have the LBA48 flush
769 * cache if the drive capacity is too large. By this
770 * time we have trimmed the drive capacity if LBA48 is
771 * not available so we don't need to recheck that.
772 */
773 capacity = idedisk_capacity(drive);
774 barrier = ide_id_has_flush_cache(id) &&
775 (drive->addressing == 0 || capacity <= (1ULL << 28) ||
776 ide_id_has_flush_cache_ext(id));
777
778 printk(KERN_INFO "%s: cache flushes %ssupported\n",
779 drive->name, barrier ? "" : "not ");
780
781 if (barrier) {
782 ordered = QUEUE_ORDERED_DRAIN_FLUSH;
783 prep_fn = idedisk_prepare_flush;
784 issue_fn = idedisk_issue_flush;
785 }
786 } else
787 ordered = QUEUE_ORDERED_DRAIN;
788
789 blk_queue_ordered(drive->queue, ordered, prep_fn);
790 blk_queue_issue_flush_fn(drive->queue, issue_fn);
791}
792
797static int write_cache(ide_drive_t *drive, int arg) 793static int write_cache(ide_drive_t *drive, int arg)
798{ 794{
799 ide_task_t args; 795 ide_task_t args;
800 int err; 796 int err = 1;
801
802 if (!ide_id_has_flush_cache(drive->id))
803 return 1;
804 797
805 memset(&args, 0, sizeof(ide_task_t)); 798 if (ide_id_has_flush_cache(drive->id)) {
806 args.tfRegister[IDE_FEATURE_OFFSET] = (arg) ? 799 memset(&args, 0, sizeof(ide_task_t));
800 args.tfRegister[IDE_FEATURE_OFFSET] = (arg) ?
807 SETFEATURES_EN_WCACHE : SETFEATURES_DIS_WCACHE; 801 SETFEATURES_EN_WCACHE : SETFEATURES_DIS_WCACHE;
808 args.tfRegister[IDE_COMMAND_OFFSET] = WIN_SETFEATURES; 802 args.tfRegister[IDE_COMMAND_OFFSET] = WIN_SETFEATURES;
809 args.command_type = IDE_DRIVE_TASK_NO_DATA; 803 args.command_type = IDE_DRIVE_TASK_NO_DATA;
810 args.handler = &task_no_data_intr; 804 args.handler = &task_no_data_intr;
805 err = ide_raw_taskfile(drive, &args, NULL);
806 if (err == 0)
807 drive->wcache = arg;
808 }
811 809
812 err = ide_raw_taskfile(drive, &args, NULL); 810 update_ordered(drive);
813 if (err)
814 return err;
815 811
816 drive->wcache = arg; 812 return err;
817 return 0;
818} 813}
819 814
820static int do_idedisk_flushcache (ide_drive_t *drive) 815static int do_idedisk_flushcache (ide_drive_t *drive)
@@ -888,7 +883,6 @@ static void idedisk_setup (ide_drive_t *drive)
888{ 883{
889 struct hd_driveid *id = drive->id; 884 struct hd_driveid *id = drive->id;
890 unsigned long long capacity; 885 unsigned long long capacity;
891 int barrier;
892 886
893 idedisk_add_settings(drive); 887 idedisk_add_settings(drive);
894 888
@@ -992,31 +986,6 @@ static void idedisk_setup (ide_drive_t *drive)
992 drive->wcache = 1; 986 drive->wcache = 1;
993 987
994 write_cache(drive, 1); 988 write_cache(drive, 1);
995
996 /*
997 * We must avoid issuing commands a drive does not understand
998 * or we may crash it. We check flush cache is supported. We also
999 * check we have the LBA48 flush cache if the drive capacity is
1000 * too large. By this time we have trimmed the drive capacity if
1001 * LBA48 is not available so we don't need to recheck that.
1002 */
1003 barrier = 0;
1004 if (ide_id_has_flush_cache(id))
1005 barrier = 1;
1006 if (drive->addressing == 1) {
1007 /* Can't issue the correct flush ? */
1008 if (capacity > (1ULL << 28) && !ide_id_has_flush_cache_ext(id))
1009 barrier = 0;
1010 }
1011
1012 printk(KERN_INFO "%s: cache flushes %ssupported\n",
1013 drive->name, barrier ? "" : "not ");
1014 if (barrier) {
1015 blk_queue_ordered(drive->queue, QUEUE_ORDERED_FLUSH);
1016 drive->queue->prepare_flush_fn = idedisk_prepare_flush;
1017 drive->queue->end_flush_fn = idedisk_end_flush;
1018 blk_queue_issue_flush_fn(drive->queue, idedisk_issue_flush);
1019 }
1020} 989}
1021 990
1022static void ide_cacheflush_p(ide_drive_t *drive) 991static void ide_cacheflush_p(ide_drive_t *drive)
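The update_ordered() hunk above concentrates the barrier policy in one place. A condensed sketch of the decision it encodes, using the QUEUE_ORDERED_* constants and ide_id_has_flush_cache*() helpers shown in the diff; choose_ordered() is a hypothetical name used only for illustration:

    static unsigned choose_ordered(ide_drive_t *drive)
    {
            /* Write-through cache: draining the queue is ordering enough. */
            if (!drive->wcache)
                    return QUEUE_ORDERED_DRAIN;

            /* Write-back cache: flush-based ordering needs FLUSH CACHE, and
             * FLUSH CACHE EXT once an LBA48 drive exceeds 2^28 sectors. */
            if (ide_id_has_flush_cache(drive->id) &&
                (drive->addressing == 0 ||
                 idedisk_capacity(drive) <= (1ULL << 28) ||
                 ide_id_has_flush_cache_ext(drive->id)))
                    return QUEUE_ORDERED_DRAIN_FLUSH;

            /* Cached but not safely flushable: no ordering guarantee. */
            return QUEUE_ORDERED_NONE;
    }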
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ecfafcdafea4..b5dc6df8e67d 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -89,7 +89,7 @@ int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
89 89
90 blkdev_dequeue_request(rq); 90 blkdev_dequeue_request(rq);
91 HWGROUP(drive)->rq = NULL; 91 HWGROUP(drive)->rq = NULL;
92 end_that_request_last(rq); 92 end_that_request_last(rq, uptodate);
93 ret = 0; 93 ret = 0;
94 } 94 }
95 return ret; 95 return ret;
@@ -119,10 +119,7 @@ int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors)
119 if (!nr_sectors) 119 if (!nr_sectors)
120 nr_sectors = rq->hard_cur_sectors; 120 nr_sectors = rq->hard_cur_sectors;
121 121
122 if (blk_complete_barrier_rq_locked(drive->queue, rq, nr_sectors)) 122 ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
123 ret = rq->nr_sectors != 0;
124 else
125 ret = __ide_end_request(drive, rq, uptodate, nr_sectors);
126 123
127 spin_unlock_irqrestore(&ide_lock, flags); 124 spin_unlock_irqrestore(&ide_lock, flags);
128 return ret; 125 return ret;
@@ -247,7 +244,7 @@ static void ide_complete_pm_request (ide_drive_t *drive, struct request *rq)
247 } 244 }
248 blkdev_dequeue_request(rq); 245 blkdev_dequeue_request(rq);
249 HWGROUP(drive)->rq = NULL; 246 HWGROUP(drive)->rq = NULL;
250 end_that_request_last(rq); 247 end_that_request_last(rq, 1);
251 spin_unlock_irqrestore(&ide_lock, flags); 248 spin_unlock_irqrestore(&ide_lock, flags);
252} 249}
253 250
@@ -379,7 +376,7 @@ void ide_end_drive_cmd (ide_drive_t *drive, u8 stat, u8 err)
379 blkdev_dequeue_request(rq); 376 blkdev_dequeue_request(rq);
380 HWGROUP(drive)->rq = NULL; 377 HWGROUP(drive)->rq = NULL;
381 rq->errors = err; 378 rq->errors = err;
382 end_that_request_last(rq); 379 end_that_request_last(rq, !rq->errors);
383 spin_unlock_irqrestore(&ide_lock, flags); 380 spin_unlock_irqrestore(&ide_lock, flags);
384} 381}
385 382
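Each ide-io.c hunk above makes the same change: end_that_request_last() now takes the completion status, so the block layer learns whether the request succeeded when it is retired. A minimal sketch of the resulting completion sequence, with locking and accounting elided; complete_rq() is a hypothetical wrapper:

    static void complete_rq(ide_drive_t *drive, struct request *rq, int uptodate)
    {
            blkdev_dequeue_request(rq);
            HWGROUP(drive)->rq = NULL;
            end_that_request_last(rq, uptodate);    /* nonzero means success */
    }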
diff --git a/drivers/ieee1394/Kconfig b/drivers/ieee1394/Kconfig
index 25103a0ef9b3..39142e2f804b 100644
--- a/drivers/ieee1394/Kconfig
+++ b/drivers/ieee1394/Kconfig
@@ -169,27 +169,4 @@ config IEEE1394_RAWIO
169 To compile this driver as a module, say M here: the 169 To compile this driver as a module, say M here: the
170 module will be called raw1394. 170 module will be called raw1394.
171 171
172config IEEE1394_CMP
173 tristate "IEC61883-1 Plug support"
174 depends on IEEE1394
175 help
176 This option enables the Connection Management Procedures
177 (IEC61883-1) driver, which implements input and output plugs.
178
179 To compile this driver as a module, say M here: the
180 module will be called cmp.
181
182config IEEE1394_AMDTP
183 tristate "IEC61883-6 (Audio transmission) support"
184 depends on IEEE1394 && IEEE1394_OHCI1394 && IEEE1394_CMP
185 help
186 This option enables the Audio & Music Data Transmission Protocol
187 (IEC61883-6) driver, which implements audio transmission over
188 IEEE1394.
189
190 The userspace interface is documented in amdtp.h.
191
192 To compile this driver as a module, say M here: the
193 module will be called amdtp.
194
195endmenu 172endmenu
diff --git a/drivers/ieee1394/Makefile b/drivers/ieee1394/Makefile
index e8b4d48d376e..6f53611fe255 100644
--- a/drivers/ieee1394/Makefile
+++ b/drivers/ieee1394/Makefile
@@ -14,8 +14,6 @@ obj-$(CONFIG_IEEE1394_RAWIO) += raw1394.o
14obj-$(CONFIG_IEEE1394_SBP2) += sbp2.o 14obj-$(CONFIG_IEEE1394_SBP2) += sbp2.o
15obj-$(CONFIG_IEEE1394_DV1394) += dv1394.o 15obj-$(CONFIG_IEEE1394_DV1394) += dv1394.o
16obj-$(CONFIG_IEEE1394_ETH1394) += eth1394.o 16obj-$(CONFIG_IEEE1394_ETH1394) += eth1394.o
17obj-$(CONFIG_IEEE1394_AMDTP) += amdtp.o
18obj-$(CONFIG_IEEE1394_CMP) += cmp.o
19 17
20quiet_cmd_oui2c = OUI2C $@ 18quiet_cmd_oui2c = OUI2C $@
21 cmd_oui2c = $(CONFIG_SHELL) $(srctree)/$(src)/oui2c.sh < $< > $@ 19 cmd_oui2c = $(CONFIG_SHELL) $(srctree)/$(src)/oui2c.sh < $< > $@
diff --git a/drivers/ieee1394/csr1212.c b/drivers/ieee1394/csr1212.c
index 61ddd5d37eff..15773544234b 100644
--- a/drivers/ieee1394/csr1212.c
+++ b/drivers/ieee1394/csr1212.c
@@ -1261,7 +1261,7 @@ static int csr1212_parse_bus_info_block(struct csr1212_csr *csr)
1261 return CSR1212_EINVAL; 1261 return CSR1212_EINVAL;
1262#endif 1262#endif
1263 1263
1264 cr = CSR1212_MALLOC(sizeof(struct csr1212_cache_region)); 1264 cr = CSR1212_MALLOC(sizeof(*cr));
1265 if (!cr) 1265 if (!cr)
1266 return CSR1212_ENOMEM; 1266 return CSR1212_ENOMEM;
1267 1267
@@ -1393,8 +1393,7 @@ int csr1212_parse_keyval(struct csr1212_keyval *kv,
1393 case CSR1212_KV_TYPE_LEAF: 1393 case CSR1212_KV_TYPE_LEAF:
1394 if (kv->key.id != CSR1212_KV_ID_EXTENDED_ROM) { 1394 if (kv->key.id != CSR1212_KV_ID_EXTENDED_ROM) {
1395 kv->value.leaf.data = CSR1212_MALLOC(quads_to_bytes(kvi_len)); 1395 kv->value.leaf.data = CSR1212_MALLOC(quads_to_bytes(kvi_len));
1396 if (!kv->value.leaf.data) 1396 if (!kv->value.leaf.data) {
1397 {
1398 ret = CSR1212_ENOMEM; 1397 ret = CSR1212_ENOMEM;
1399 goto fail; 1398 goto fail;
1400 } 1399 }
@@ -1462,7 +1461,7 @@ int _csr1212_read_keyval(struct csr1212_csr *csr, struct csr1212_keyval *kv)
1462 cache->next = NULL; 1461 cache->next = NULL;
1463 csr->cache_tail = cache; 1462 csr->cache_tail = cache;
1464 cache->filled_head = 1463 cache->filled_head =
1465 CSR1212_MALLOC(sizeof(struct csr1212_cache_region)); 1464 CSR1212_MALLOC(sizeof(*cache->filled_head));
1466 if (!cache->filled_head) { 1465 if (!cache->filled_head) {
1467 return CSR1212_ENOMEM; 1466 return CSR1212_ENOMEM;
1468 } 1467 }
@@ -1484,7 +1483,7 @@ int _csr1212_read_keyval(struct csr1212_csr *csr, struct csr1212_keyval *kv)
 1484 /* Now search read portions of the cache to see if it is there. */ 1483 /* Now search read portions of the cache to see if it is there. */
1485 for (cr = cache->filled_head; cr; cr = cr->next) { 1484 for (cr = cache->filled_head; cr; cr = cr->next) {
1486 if (cache_index < cr->offset_start) { 1485 if (cache_index < cr->offset_start) {
1487 newcr = CSR1212_MALLOC(sizeof(struct csr1212_cache_region)); 1486 newcr = CSR1212_MALLOC(sizeof(*newcr));
1488 if (!newcr) 1487 if (!newcr)
1489 return CSR1212_ENOMEM; 1488 return CSR1212_ENOMEM;
1490 1489
@@ -1508,7 +1507,7 @@ int _csr1212_read_keyval(struct csr1212_csr *csr, struct csr1212_keyval *kv)
1508 1507
1509 if (!cr) { 1508 if (!cr) {
1510 cr = cache->filled_tail; 1509 cr = cache->filled_tail;
1511 newcr = CSR1212_MALLOC(sizeof(struct csr1212_cache_region)); 1510 newcr = CSR1212_MALLOC(sizeof(*newcr));
1512 if (!newcr) 1511 if (!newcr)
1513 return CSR1212_ENOMEM; 1512 return CSR1212_ENOMEM;
1514 1513
@@ -1611,15 +1610,17 @@ int csr1212_parse_csr(struct csr1212_csr *csr)
1611 csr->root_kv->valid = 0; 1610 csr->root_kv->valid = 0;
1612 csr->root_kv->next = csr->root_kv; 1611 csr->root_kv->next = csr->root_kv;
1613 csr->root_kv->prev = csr->root_kv; 1612 csr->root_kv->prev = csr->root_kv;
1614 csr1212_get_keyval(csr, csr->root_kv); 1613 ret = _csr1212_read_keyval(csr, csr->root_kv);
1614 if (ret != CSR1212_SUCCESS)
1615 return ret;
1615 1616
1616 /* Scan through the Root directory finding all extended ROM regions 1617 /* Scan through the Root directory finding all extended ROM regions
1617 * and make cache regions for them */ 1618 * and make cache regions for them */
1618 for (dentry = csr->root_kv->value.directory.dentries_head; 1619 for (dentry = csr->root_kv->value.directory.dentries_head;
1619 dentry; dentry = dentry->next) { 1620 dentry; dentry = dentry->next) {
1620 if (dentry->kv->key.id == CSR1212_KV_ID_EXTENDED_ROM) { 1621 if (dentry->kv->key.id == CSR1212_KV_ID_EXTENDED_ROM &&
1621 csr1212_get_keyval(csr, dentry->kv); 1622 !dentry->kv->valid) {
1622 1623 ret = _csr1212_read_keyval(csr, dentry->kv);
1623 if (ret != CSR1212_SUCCESS) 1624 if (ret != CSR1212_SUCCESS)
1624 return ret; 1625 return ret;
1625 } 1626 }
diff --git a/drivers/ieee1394/csr1212.h b/drivers/ieee1394/csr1212.h
index 28c5f4b726e2..cecd5871f2de 100644
--- a/drivers/ieee1394/csr1212.h
+++ b/drivers/ieee1394/csr1212.h
@@ -646,7 +646,7 @@ static inline struct csr1212_csr_rom_cache *csr1212_rom_cache_malloc(u_int32_t o
646{ 646{
647 struct csr1212_csr_rom_cache *cache; 647 struct csr1212_csr_rom_cache *cache;
648 648
649 cache = CSR1212_MALLOC(sizeof(struct csr1212_csr_rom_cache) + size); 649 cache = CSR1212_MALLOC(sizeof(*cache) + size);
650 if (!cache) 650 if (!cache)
651 return NULL; 651 return NULL;
652 652
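The recurring csr1212 change is an allocation idiom: sizing with sizeof(*ptr) instead of sizeof(struct type), so the expression stays correct if the pointer's type is ever changed. In the shape used above:

    struct csr1212_cache_region *cr;

    cr = CSR1212_MALLOC(sizeof(*cr));       /* sized from cr itself */
    if (!cr)
            return CSR1212_ENOMEM;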
diff --git a/drivers/ieee1394/dma.c b/drivers/ieee1394/dma.c
index b79ddb43e746..9fb2769d9abc 100644
--- a/drivers/ieee1394/dma.c
+++ b/drivers/ieee1394/dma.c
@@ -23,7 +23,8 @@ void dma_prog_region_init(struct dma_prog_region *prog)
23 prog->bus_addr = 0; 23 prog->bus_addr = 0;
24} 24}
25 25
26int dma_prog_region_alloc(struct dma_prog_region *prog, unsigned long n_bytes, struct pci_dev *dev) 26int dma_prog_region_alloc(struct dma_prog_region *prog, unsigned long n_bytes,
27 struct pci_dev *dev)
27{ 28{
28 /* round up to page size */ 29 /* round up to page size */
29 n_bytes = PAGE_ALIGN(n_bytes); 30 n_bytes = PAGE_ALIGN(n_bytes);
@@ -32,7 +33,8 @@ int dma_prog_region_alloc(struct dma_prog_region *prog, unsigned long n_bytes,
32 33
33 prog->kvirt = pci_alloc_consistent(dev, n_bytes, &prog->bus_addr); 34 prog->kvirt = pci_alloc_consistent(dev, n_bytes, &prog->bus_addr);
34 if (!prog->kvirt) { 35 if (!prog->kvirt) {
35 printk(KERN_ERR "dma_prog_region_alloc: pci_alloc_consistent() failed\n"); 36 printk(KERN_ERR
37 "dma_prog_region_alloc: pci_alloc_consistent() failed\n");
36 dma_prog_region_free(prog); 38 dma_prog_region_free(prog);
37 return -ENOMEM; 39 return -ENOMEM;
38 } 40 }
@@ -45,7 +47,8 @@ int dma_prog_region_alloc(struct dma_prog_region *prog, unsigned long n_bytes,
45void dma_prog_region_free(struct dma_prog_region *prog) 47void dma_prog_region_free(struct dma_prog_region *prog)
46{ 48{
47 if (prog->kvirt) { 49 if (prog->kvirt) {
48 pci_free_consistent(prog->dev, prog->n_pages << PAGE_SHIFT, prog->kvirt, prog->bus_addr); 50 pci_free_consistent(prog->dev, prog->n_pages << PAGE_SHIFT,
51 prog->kvirt, prog->bus_addr);
49 } 52 }
50 53
51 prog->kvirt = NULL; 54 prog->kvirt = NULL;
@@ -65,7 +68,8 @@ void dma_region_init(struct dma_region *dma)
65 dma->sglist = NULL; 68 dma->sglist = NULL;
66} 69}
67 70
68int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes, struct pci_dev *dev, int direction) 71int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes,
72 struct pci_dev *dev, int direction)
69{ 73{
70 unsigned int i; 74 unsigned int i;
71 75
@@ -95,14 +99,16 @@ int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes, struct pci_d
95 99
96 /* fill scatter/gather list with pages */ 100 /* fill scatter/gather list with pages */
97 for (i = 0; i < dma->n_pages; i++) { 101 for (i = 0; i < dma->n_pages; i++) {
98 unsigned long va = (unsigned long) dma->kvirt + (i << PAGE_SHIFT); 102 unsigned long va =
103 (unsigned long)dma->kvirt + (i << PAGE_SHIFT);
99 104
100 dma->sglist[i].page = vmalloc_to_page((void *)va); 105 dma->sglist[i].page = vmalloc_to_page((void *)va);
101 dma->sglist[i].length = PAGE_SIZE; 106 dma->sglist[i].length = PAGE_SIZE;
102 } 107 }
103 108
104 /* map sglist to the IOMMU */ 109 /* map sglist to the IOMMU */
105 dma->n_dma_pages = pci_map_sg(dev, dma->sglist, dma->n_pages, direction); 110 dma->n_dma_pages =
111 pci_map_sg(dev, dma->sglist, dma->n_pages, direction);
106 112
107 if (dma->n_dma_pages == 0) { 113 if (dma->n_dma_pages == 0) {
108 printk(KERN_ERR "dma_region_alloc: pci_map_sg() failed\n"); 114 printk(KERN_ERR "dma_region_alloc: pci_map_sg() failed\n");
@@ -114,7 +120,7 @@ int dma_region_alloc(struct dma_region *dma, unsigned long n_bytes, struct pci_d
114 120
115 return 0; 121 return 0;
116 122
117err: 123 err:
118 dma_region_free(dma); 124 dma_region_free(dma);
119 return -ENOMEM; 125 return -ENOMEM;
120} 126}
@@ -122,7 +128,8 @@ err:
122void dma_region_free(struct dma_region *dma) 128void dma_region_free(struct dma_region *dma)
123{ 129{
124 if (dma->n_dma_pages) { 130 if (dma->n_dma_pages) {
125 pci_unmap_sg(dma->dev, dma->sglist, dma->n_pages, dma->direction); 131 pci_unmap_sg(dma->dev, dma->sglist, dma->n_pages,
132 dma->direction);
126 dma->n_dma_pages = 0; 133 dma->n_dma_pages = 0;
127 dma->dev = NULL; 134 dma->dev = NULL;
128 } 135 }
@@ -137,7 +144,8 @@ void dma_region_free(struct dma_region *dma)
137 144
138/* find the scatterlist index and remaining offset corresponding to a 145/* find the scatterlist index and remaining offset corresponding to a
139 given offset from the beginning of the buffer */ 146 given offset from the beginning of the buffer */
140static inline int dma_region_find(struct dma_region *dma, unsigned long offset, unsigned long *rem) 147static inline int dma_region_find(struct dma_region *dma, unsigned long offset,
148 unsigned long *rem)
141{ 149{
142 int i; 150 int i;
143 unsigned long off = offset; 151 unsigned long off = offset;
@@ -156,15 +164,18 @@ static inline int dma_region_find(struct dma_region *dma, unsigned long offset,
156 return i; 164 return i;
157} 165}
158 166
159dma_addr_t dma_region_offset_to_bus(struct dma_region *dma, unsigned long offset) 167dma_addr_t dma_region_offset_to_bus(struct dma_region * dma,
168 unsigned long offset)
160{ 169{
161 unsigned long rem = 0; 170 unsigned long rem = 0;
162 171
163 struct scatterlist *sg = &dma->sglist[dma_region_find(dma, offset, &rem)]; 172 struct scatterlist *sg =
173 &dma->sglist[dma_region_find(dma, offset, &rem)];
164 return sg_dma_address(sg) + rem; 174 return sg_dma_address(sg) + rem;
165} 175}
166 176
167void dma_region_sync_for_cpu(struct dma_region *dma, unsigned long offset, unsigned long len) 177void dma_region_sync_for_cpu(struct dma_region *dma, unsigned long offset,
178 unsigned long len)
168{ 179{
169 int first, last; 180 int first, last;
170 unsigned long rem; 181 unsigned long rem;
@@ -175,10 +186,12 @@ void dma_region_sync_for_cpu(struct dma_region *dma, unsigned long offset, unsig
175 first = dma_region_find(dma, offset, &rem); 186 first = dma_region_find(dma, offset, &rem);
176 last = dma_region_find(dma, offset + len - 1, &rem); 187 last = dma_region_find(dma, offset + len - 1, &rem);
177 188
178 pci_dma_sync_sg_for_cpu(dma->dev, &dma->sglist[first], last - first + 1, dma->direction); 189 pci_dma_sync_sg_for_cpu(dma->dev, &dma->sglist[first], last - first + 1,
190 dma->direction);
179} 191}
180 192
181void dma_region_sync_for_device(struct dma_region *dma, unsigned long offset, unsigned long len) 193void dma_region_sync_for_device(struct dma_region *dma, unsigned long offset,
194 unsigned long len)
182{ 195{
183 int first, last; 196 int first, last;
184 unsigned long rem; 197 unsigned long rem;
@@ -189,44 +202,47 @@ void dma_region_sync_for_device(struct dma_region *dma, unsigned long offset, un
189 first = dma_region_find(dma, offset, &rem); 202 first = dma_region_find(dma, offset, &rem);
190 last = dma_region_find(dma, offset + len - 1, &rem); 203 last = dma_region_find(dma, offset + len - 1, &rem);
191 204
192 pci_dma_sync_sg_for_device(dma->dev, &dma->sglist[first], last - first + 1, dma->direction); 205 pci_dma_sync_sg_for_device(dma->dev, &dma->sglist[first],
206 last - first + 1, dma->direction);
193} 207}
194 208
195#ifdef CONFIG_MMU 209#ifdef CONFIG_MMU
196 210
197/* nopage() handler for mmap access */ 211/* nopage() handler for mmap access */
198 212
199static struct page* 213static struct page *dma_region_pagefault(struct vm_area_struct *area,
200dma_region_pagefault(struct vm_area_struct *area, unsigned long address, int *type) 214 unsigned long address, int *type)
201{ 215{
202 unsigned long offset; 216 unsigned long offset;
203 unsigned long kernel_virt_addr; 217 unsigned long kernel_virt_addr;
204 struct page *ret = NOPAGE_SIGBUS; 218 struct page *ret = NOPAGE_SIGBUS;
205 219
206 struct dma_region *dma = (struct dma_region*) area->vm_private_data; 220 struct dma_region *dma = (struct dma_region *)area->vm_private_data;
207 221
208 if (!dma->kvirt) 222 if (!dma->kvirt)
209 goto out; 223 goto out;
210 224
211 if ( (address < (unsigned long) area->vm_start) || 225 if ((address < (unsigned long)area->vm_start) ||
212 (address > (unsigned long) area->vm_start + (dma->n_pages << PAGE_SHIFT)) ) 226 (address >
227 (unsigned long)area->vm_start + (dma->n_pages << PAGE_SHIFT)))
213 goto out; 228 goto out;
214 229
215 if (type) 230 if (type)
216 *type = VM_FAULT_MINOR; 231 *type = VM_FAULT_MINOR;
217 offset = address - area->vm_start; 232 offset = address - area->vm_start;
218 kernel_virt_addr = (unsigned long) dma->kvirt + offset; 233 kernel_virt_addr = (unsigned long)dma->kvirt + offset;
219 ret = vmalloc_to_page((void*) kernel_virt_addr); 234 ret = vmalloc_to_page((void *)kernel_virt_addr);
220 get_page(ret); 235 get_page(ret);
221out: 236 out:
222 return ret; 237 return ret;
223} 238}
224 239
225static struct vm_operations_struct dma_region_vm_ops = { 240static struct vm_operations_struct dma_region_vm_ops = {
226 .nopage = dma_region_pagefault, 241 .nopage = dma_region_pagefault,
227}; 242};
228 243
229int dma_region_mmap(struct dma_region *dma, struct file *file, struct vm_area_struct *vma) 244int dma_region_mmap(struct dma_region *dma, struct file *file,
245 struct vm_area_struct *vma)
230{ 246{
231 unsigned long size; 247 unsigned long size;
232 248
@@ -250,11 +266,12 @@ int dma_region_mmap(struct dma_region *dma, struct file *file, struct vm_area_st
250 return 0; 266 return 0;
251} 267}
252 268
253#else /* CONFIG_MMU */ 269#else /* CONFIG_MMU */
254 270
255int dma_region_mmap(struct dma_region *dma, struct file *file, struct vm_area_struct *vma) 271int dma_region_mmap(struct dma_region *dma, struct file *file,
272 struct vm_area_struct *vma)
256{ 273{
257 return -EINVAL; 274 return -EINVAL;
258} 275}
259 276
260#endif /* CONFIG_MMU */ 277#endif /* CONFIG_MMU */
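dma_region_offset_to_bus() resolves a byte offset within the vmalloc'd buffer to a bus address through dma_region_find(), which yields a scatterlist index plus a remainder. A simplified, hypothetical version of that walk (the in-tree helper also copes with entries the IOMMU may have coalesced):

    static int find_sg_entry(struct dma_region *dma, unsigned long offset,
                             unsigned long *rem)
    {
            int i;

            for (i = 0; i < dma->n_dma_pages; i++) {
                    unsigned long len = sg_dma_len(&dma->sglist[i]);

                    if (offset < len) {
                            *rem = offset;  /* sg_dma_address() + *rem = bus address */
                            return i;
                    }
                    offset -= len;
            }
            return -1;      /* offset past the end of the region */
    }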
diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c
index cbbbe14b8849..196db7439272 100644
--- a/drivers/ieee1394/dv1394.c
+++ b/drivers/ieee1394/dv1394.c
@@ -123,15 +123,6 @@
123 123
124#include "ohci1394.h" 124#include "ohci1394.h"
125 125
126#ifndef virt_to_page
127#define virt_to_page(x) MAP_NR(x)
128#endif
129
130#ifndef vmalloc_32
131#define vmalloc_32(x) vmalloc(x)
132#endif
133
134
135/* DEBUG LEVELS: 126/* DEBUG LEVELS:
136 0 - no debugging messages 127 0 - no debugging messages
137 1 - some debugging messages, but none during DMA frame transmission 128 1 - some debugging messages, but none during DMA frame transmission
@@ -2218,14 +2209,12 @@ static int dv1394_init(struct ti_ohci *ohci, enum pal_or_ntsc format, enum modes
2218 unsigned long flags; 2209 unsigned long flags;
2219 int i; 2210 int i;
2220 2211
2221 video = kmalloc(sizeof(struct video_card), GFP_KERNEL); 2212 video = kzalloc(sizeof(*video), GFP_KERNEL);
2222 if (!video) { 2213 if (!video) {
2223 printk(KERN_ERR "dv1394: cannot allocate video_card\n"); 2214 printk(KERN_ERR "dv1394: cannot allocate video_card\n");
2224 goto err; 2215 goto err;
2225 } 2216 }
2226 2217
2227 memset(video, 0, sizeof(struct video_card));
2228
2229 video->ohci = ohci; 2218 video->ohci = ohci;
2230 /* lower 2 bits of id indicate which of four "plugs" 2219 /* lower 2 bits of id indicate which of four "plugs"
2231 per host */ 2220 per host */
diff --git a/drivers/ieee1394/eth1394.c b/drivers/ieee1394/eth1394.c
index c9e92d85c893..30fa0d43a43a 100644
--- a/drivers/ieee1394/eth1394.c
+++ b/drivers/ieee1394/eth1394.c
@@ -88,9 +88,6 @@
88 printk(KERN_ERR "%s:%s[%d]: " fmt "\n", driver_name, __FUNCTION__, __LINE__, ## args) 88 printk(KERN_ERR "%s:%s[%d]: " fmt "\n", driver_name, __FUNCTION__, __LINE__, ## args)
89#define TRACE() printk(KERN_ERR "%s:%s[%d] ---- TRACE\n", driver_name, __FUNCTION__, __LINE__) 89#define TRACE() printk(KERN_ERR "%s:%s[%d] ---- TRACE\n", driver_name, __FUNCTION__, __LINE__)
90 90
91static char version[] __devinitdata =
92 "$Rev: 1312 $ Ben Collins <bcollins@debian.org>";
93
94struct fragment_info { 91struct fragment_info {
95 struct list_head list; 92 struct list_head list;
96 int offset; 93 int offset;
@@ -355,12 +352,12 @@ static int eth1394_probe(struct device *dev)
355 if (!hi) 352 if (!hi)
356 return -ENOENT; 353 return -ENOENT;
357 354
358 new_node = kmalloc(sizeof(struct eth1394_node_ref), 355 new_node = kmalloc(sizeof(*new_node),
359 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL); 356 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
360 if (!new_node) 357 if (!new_node)
361 return -ENOMEM; 358 return -ENOMEM;
362 359
363 node_info = kmalloc(sizeof(struct eth1394_node_info), 360 node_info = kmalloc(sizeof(*node_info),
364 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL); 361 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
365 if (!node_info) { 362 if (!node_info) {
366 kfree(new_node); 363 kfree(new_node);
@@ -436,12 +433,12 @@ static int eth1394_update(struct unit_directory *ud)
436 node = eth1394_find_node(&priv->ip_node_list, ud); 433 node = eth1394_find_node(&priv->ip_node_list, ud);
437 434
438 if (!node) { 435 if (!node) {
439 node = kmalloc(sizeof(struct eth1394_node_ref), 436 node = kmalloc(sizeof(*node),
440 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL); 437 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
441 if (!node) 438 if (!node)
442 return -ENOMEM; 439 return -ENOMEM;
443 440
444 node_info = kmalloc(sizeof(struct eth1394_node_info), 441 node_info = kmalloc(sizeof(*node_info),
445 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL); 442 in_interrupt() ? GFP_ATOMIC : GFP_KERNEL);
446 if (!node_info) { 443 if (!node_info) {
447 kfree(node); 444 kfree(node);
@@ -566,7 +563,6 @@ static void ether1394_add_host (struct hpsb_host *host)
566 struct eth1394_host_info *hi = NULL; 563 struct eth1394_host_info *hi = NULL;
567 struct net_device *dev = NULL; 564 struct net_device *dev = NULL;
568 struct eth1394_priv *priv; 565 struct eth1394_priv *priv;
569 static int version_printed = 0;
570 u64 fifo_addr; 566 u64 fifo_addr;
571 567
572 if (!(host->config_roms & HPSB_CONFIG_ROM_ENTRY_IP1394)) 568 if (!(host->config_roms & HPSB_CONFIG_ROM_ENTRY_IP1394))
@@ -581,9 +577,6 @@ static void ether1394_add_host (struct hpsb_host *host)
581 if (fifo_addr == ~0ULL) 577 if (fifo_addr == ~0ULL)
582 goto out; 578 goto out;
583 579
584 if (version_printed++ == 0)
585 ETH1394_PRINT_G (KERN_INFO, "%s\n", version);
586
587 /* We should really have our own alloc_hpsbdev() function in 580 /* We should really have our own alloc_hpsbdev() function in
588 * net_init.c instead of calling the one for ethernet then hijacking 581 * net_init.c instead of calling the one for ethernet then hijacking
589 * it for ourselves. That way we'd be a real networking device. */ 582 * it for ourselves. That way we'd be a real networking device. */
@@ -1021,7 +1014,7 @@ static inline int new_fragment(struct list_head *frag_info, int offset, int len)
1021 } 1014 }
1022 } 1015 }
1023 1016
1024 new = kmalloc(sizeof(struct fragment_info), GFP_ATOMIC); 1017 new = kmalloc(sizeof(*new), GFP_ATOMIC);
1025 if (!new) 1018 if (!new)
1026 return -ENOMEM; 1019 return -ENOMEM;
1027 1020
@@ -1040,7 +1033,7 @@ static inline int new_partial_datagram(struct net_device *dev,
1040{ 1033{
1041 struct partial_datagram *new; 1034 struct partial_datagram *new;
1042 1035
1043 new = kmalloc(sizeof(struct partial_datagram), GFP_ATOMIC); 1036 new = kmalloc(sizeof(*new), GFP_ATOMIC);
1044 if (!new) 1037 if (!new)
1045 return -ENOMEM; 1038 return -ENOMEM;
1046 1039
@@ -1768,7 +1761,6 @@ fail:
1768static void ether1394_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 1761static void ether1394_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
1769{ 1762{
1770 strcpy (info->driver, driver_name); 1763 strcpy (info->driver, driver_name);
1771 strcpy (info->version, "$Rev: 1312 $");
1772 /* FIXME XXX provide sane businfo */ 1764 /* FIXME XXX provide sane businfo */
1773 strcpy (info->bus_info, "ieee1394"); 1765 strcpy (info->bus_info, "ieee1394");
1774} 1766}
diff --git a/drivers/ieee1394/highlevel.c b/drivers/ieee1394/highlevel.c
index 997e1bf6297f..734b121a0554 100644
--- a/drivers/ieee1394/highlevel.c
+++ b/drivers/ieee1394/highlevel.c
@@ -101,12 +101,10 @@ void *hpsb_create_hostinfo(struct hpsb_highlevel *hl, struct hpsb_host *host,
101 return NULL; 101 return NULL;
102 } 102 }
103 103
104 hi = kmalloc(sizeof(*hi) + data_size, GFP_ATOMIC); 104 hi = kzalloc(sizeof(*hi) + data_size, GFP_ATOMIC);
105 if (!hi) 105 if (!hi)
106 return NULL; 106 return NULL;
107 107
108 memset(hi, 0, sizeof(*hi) + data_size);
109
110 if (data_size) { 108 if (data_size) {
111 data = hi->data = hi + 1; 109 data = hi->data = hi + 1;
112 hi->size = data_size; 110 hi->size = data_size;
@@ -326,11 +324,9 @@ u64 hpsb_allocate_and_register_addrspace(struct hpsb_highlevel *hl,
326 return retval; 324 return retval;
327 } 325 }
328 326
329 as = (struct hpsb_address_serve *) 327 as = kmalloc(sizeof(*as), GFP_KERNEL);
330 kmalloc(sizeof(struct hpsb_address_serve), GFP_KERNEL); 328 if (!as)
331 if (as == NULL) {
332 return retval; 329 return retval;
333 }
334 330
335 INIT_LIST_HEAD(&as->host_list); 331 INIT_LIST_HEAD(&as->host_list);
336 INIT_LIST_HEAD(&as->hl_list); 332 INIT_LIST_HEAD(&as->hl_list);
@@ -383,11 +379,9 @@ int hpsb_register_addrspace(struct hpsb_highlevel *hl, struct hpsb_host *host,
383 return 0; 379 return 0;
384 } 380 }
385 381
386 as = (struct hpsb_address_serve *) 382 as = kmalloc(sizeof(*as), GFP_ATOMIC);
387 kmalloc(sizeof(struct hpsb_address_serve), GFP_ATOMIC); 383 if (!as)
388 if (as == NULL) { 384 return 0;
389 return 0;
390 }
391 385
392 INIT_LIST_HEAD(&as->host_list); 386 INIT_LIST_HEAD(&as->host_list);
393 INIT_LIST_HEAD(&as->hl_list); 387 INIT_LIST_HEAD(&as->hl_list);
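hpsb_create_hostinfo() above trades kmalloc() plus memset() for one kzalloc(), keeping the trick of carving the payload out of the same allocation. A minimal sketch of the pattern, with field names as in the hunk and the surrounding declarations assumed:

    hi = kzalloc(sizeof(*hi) + data_size, GFP_ATOMIC);  /* struct + payload, zeroed */
    if (!hi)
            return NULL;

    if (data_size) {
            data = hi->data = hi + 1;   /* payload begins right after the struct */
            hi->size = data_size;
    }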
diff --git a/drivers/ieee1394/hosts.c b/drivers/ieee1394/hosts.c
index aeeaeb670d03..ba09741fc826 100644
--- a/drivers/ieee1394/hosts.c
+++ b/drivers/ieee1394/hosts.c
@@ -61,12 +61,12 @@ static void delayed_reset_bus(void * __reset_info)
61 61
62static int dummy_transmit_packet(struct hpsb_host *h, struct hpsb_packet *p) 62static int dummy_transmit_packet(struct hpsb_host *h, struct hpsb_packet *p)
63{ 63{
64 return 0; 64 return 0;
65} 65}
66 66
67static int dummy_devctl(struct hpsb_host *h, enum devctl_cmd c, int arg) 67static int dummy_devctl(struct hpsb_host *h, enum devctl_cmd c, int arg)
68{ 68{
69 return -1; 69 return -1;
70} 70}
71 71
72static int dummy_isoctl(struct hpsb_iso *iso, enum isoctl_cmd command, unsigned long arg) 72static int dummy_isoctl(struct hpsb_iso *iso, enum isoctl_cmd command, unsigned long arg)
@@ -75,9 +75,9 @@ static int dummy_isoctl(struct hpsb_iso *iso, enum isoctl_cmd command, unsigned
75} 75}
76 76
77static struct hpsb_host_driver dummy_driver = { 77static struct hpsb_host_driver dummy_driver = {
78 .transmit_packet = dummy_transmit_packet, 78 .transmit_packet = dummy_transmit_packet,
79 .devctl = dummy_devctl, 79 .devctl = dummy_devctl,
80 .isoctl = dummy_isoctl 80 .isoctl = dummy_isoctl
81}; 81};
82 82
83static int alloc_hostnum_cb(struct hpsb_host *host, void *__data) 83static int alloc_hostnum_cb(struct hpsb_host *host, void *__data)
@@ -110,13 +110,13 @@ static DECLARE_MUTEX(host_num_alloc);
110struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra, 110struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra,
111 struct device *dev) 111 struct device *dev)
112{ 112{
113 struct hpsb_host *h; 113 struct hpsb_host *h;
114 int i; 114 int i;
115 int hostnum = 0; 115 int hostnum = 0;
116 116
117 h = kmalloc(sizeof(struct hpsb_host) + extra, SLAB_KERNEL); 117 h = kzalloc(sizeof(*h) + extra, SLAB_KERNEL);
118 if (!h) return NULL; 118 if (!h)
119 memset(h, 0, sizeof(struct hpsb_host) + extra); 119 return NULL;
120 120
121 h->csr.rom = csr1212_create_csr(&csr_bus_ops, CSR_BUS_INFO_SIZE, h); 121 h->csr.rom = csr1212_create_csr(&csr_bus_ops, CSR_BUS_INFO_SIZE, h);
122 if (!h->csr.rom) { 122 if (!h->csr.rom) {
@@ -125,7 +125,7 @@ struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra,
125 } 125 }
126 126
127 h->hostdata = h + 1; 127 h->hostdata = h + 1;
128 h->driver = drv; 128 h->driver = drv;
129 129
130 skb_queue_head_init(&h->pending_packet_queue); 130 skb_queue_head_init(&h->pending_packet_queue);
131 INIT_LIST_HEAD(&h->addr_space); 131 INIT_LIST_HEAD(&h->addr_space);
@@ -145,8 +145,8 @@ struct hpsb_host *hpsb_alloc_host(struct hpsb_host_driver *drv, size_t extra,
145 h->timeout.function = abort_timedouts; 145 h->timeout.function = abort_timedouts;
146 h->timeout_interval = HZ / 20; // 50ms by default 146 h->timeout_interval = HZ / 20; // 50ms by default
147 147
148 h->topology_map = h->csr.topology_map + 3; 148 h->topology_map = h->csr.topology_map + 3;
149 h->speed_map = (u8 *)(h->csr.speed_map + 2); 149 h->speed_map = (u8 *)(h->csr.speed_map + 2);
150 150
151 down(&host_num_alloc); 151 down(&host_num_alloc);
152 152
@@ -186,14 +186,14 @@ int hpsb_add_host(struct hpsb_host *host)
186 186
187void hpsb_remove_host(struct hpsb_host *host) 187void hpsb_remove_host(struct hpsb_host *host)
188{ 188{
189 host->is_shutdown = 1; 189 host->is_shutdown = 1;
190 190
191 cancel_delayed_work(&host->delayed_reset); 191 cancel_delayed_work(&host->delayed_reset);
192 flush_scheduled_work(); 192 flush_scheduled_work();
193 193
194 host->driver = &dummy_driver; 194 host->driver = &dummy_driver;
195 195
196 highlevel_remove_host(host); 196 highlevel_remove_host(host);
197 197
198 hpsb_remove_extra_config_roms(host); 198 hpsb_remove_extra_config_roms(host);
199 199
diff --git a/drivers/ieee1394/hosts.h b/drivers/ieee1394/hosts.h
index ae9b02cc013f..07d188ca8495 100644
--- a/drivers/ieee1394/hosts.h
+++ b/drivers/ieee1394/hosts.h
@@ -17,47 +17,47 @@ struct hpsb_packet;
17struct hpsb_iso; 17struct hpsb_iso;
18 18
19struct hpsb_host { 19struct hpsb_host {
20 struct list_head host_list; 20 struct list_head host_list;
21 21
22 void *hostdata; 22 void *hostdata;
23 23
24 atomic_t generation; 24 atomic_t generation;
25 25
26 struct sk_buff_head pending_packet_queue; 26 struct sk_buff_head pending_packet_queue;
27 27
28 struct timer_list timeout; 28 struct timer_list timeout;
29 unsigned long timeout_interval; 29 unsigned long timeout_interval;
30 30
31 unsigned char iso_listen_count[64]; 31 unsigned char iso_listen_count[64];
32 32
33 int node_count; /* number of identified nodes on this bus */ 33 int node_count; /* number of identified nodes on this bus */
34 int selfid_count; /* total number of SelfIDs received */ 34 int selfid_count; /* total number of SelfIDs received */
35 int nodes_active; /* number of nodes that are actually active */ 35 int nodes_active; /* number of nodes that are actually active */
36 36
37 nodeid_t node_id; /* node ID of this host */ 37 nodeid_t node_id; /* node ID of this host */
38 nodeid_t irm_id; /* ID of this bus' isochronous resource manager */ 38 nodeid_t irm_id; /* ID of this bus' isochronous resource manager */
39 nodeid_t busmgr_id; /* ID of this bus' bus manager */ 39 nodeid_t busmgr_id; /* ID of this bus' bus manager */
40 40
 41 /* this node's state */ 41 /* this node's state */
42 unsigned in_bus_reset:1; 42 unsigned in_bus_reset:1;
43 unsigned is_shutdown:1; 43 unsigned is_shutdown:1;
44 unsigned resume_packet_sent:1; 44 unsigned resume_packet_sent:1;
45 45
 46 /* this node's duties on the bus */ 46 /* this node's duties on the bus */
47 unsigned is_root:1; 47 unsigned is_root:1;
48 unsigned is_cycmst:1; 48 unsigned is_cycmst:1;
49 unsigned is_irm:1; 49 unsigned is_irm:1;
50 unsigned is_busmgr:1; 50 unsigned is_busmgr:1;
51 51
52 int reset_retries; 52 int reset_retries;
53 quadlet_t *topology_map; 53 quadlet_t *topology_map;
54 u8 *speed_map; 54 u8 *speed_map;
55 struct csr_control csr; 55 struct csr_control csr;
56 56
57 /* Per node tlabel pool allocation */ 57 /* Per node tlabel pool allocation */
58 struct hpsb_tlabel_pool tpool[64]; 58 struct hpsb_tlabel_pool tpool[64];
59 59
60 struct hpsb_host_driver *driver; 60 struct hpsb_host_driver *driver;
61 61
62 struct pci_dev *pdev; 62 struct pci_dev *pdev;
63 63
@@ -77,34 +77,34 @@ struct hpsb_host {
77 77
78 78
79enum devctl_cmd { 79enum devctl_cmd {
80 /* Host is requested to reset its bus and cancel all outstanding async 80 /* Host is requested to reset its bus and cancel all outstanding async
81 * requests. If arg == 1, it shall also attempt to become root on the 81 * requests. If arg == 1, it shall also attempt to become root on the
82 * bus. Return void. */ 82 * bus. Return void. */
83 RESET_BUS, 83 RESET_BUS,
84 84
85 /* Arg is void, return value is the hardware cycle counter value. */ 85 /* Arg is void, return value is the hardware cycle counter value. */
86 GET_CYCLE_COUNTER, 86 GET_CYCLE_COUNTER,
87 87
88 /* Set the hardware cycle counter to the value in arg, return void. 88 /* Set the hardware cycle counter to the value in arg, return void.
89 * FIXME - setting is probably not required. */ 89 * FIXME - setting is probably not required. */
90 SET_CYCLE_COUNTER, 90 SET_CYCLE_COUNTER,
91 91
92 /* Configure hardware for new bus ID in arg, return void. */ 92 /* Configure hardware for new bus ID in arg, return void. */
93 SET_BUS_ID, 93 SET_BUS_ID,
94 94
95 /* If arg true, start sending cycle start packets, stop if arg == 0. 95 /* If arg true, start sending cycle start packets, stop if arg == 0.
96 * Return void. */ 96 * Return void. */
97 ACT_CYCLE_MASTER, 97 ACT_CYCLE_MASTER,
98 98
99 /* Cancel all outstanding async requests without resetting the bus. 99 /* Cancel all outstanding async requests without resetting the bus.
100 * Return void. */ 100 * Return void. */
101 CANCEL_REQUESTS, 101 CANCEL_REQUESTS,
102 102
103 /* Start or stop receiving isochronous channel in arg. Return void. 103 /* Start or stop receiving isochronous channel in arg. Return void.
104 * This acts as an optimization hint, hosts are not required not to 104 * This acts as an optimization hint, hosts are not required not to
105 * listen on unrequested channels. */ 105 * listen on unrequested channels. */
106 ISO_LISTEN_CHANNEL, 106 ISO_LISTEN_CHANNEL,
107 ISO_UNLISTEN_CHANNEL 107 ISO_UNLISTEN_CHANNEL
108}; 108};
109 109
110enum isoctl_cmd { 110enum isoctl_cmd {
@@ -135,13 +135,13 @@ enum isoctl_cmd {
135}; 135};
136 136
137enum reset_types { 137enum reset_types {
138 /* 166 microsecond reset -- only type of reset available on 138 /* 166 microsecond reset -- only type of reset available on
139 non-1394a capable controllers */ 139 non-1394a capable controllers */
140 LONG_RESET, 140 LONG_RESET,
141 141
142 /* Short (arbitrated) reset -- only available on 1394a capable 142 /* Short (arbitrated) reset -- only available on 1394a capable
143 controllers */ 143 controllers */
144 SHORT_RESET, 144 SHORT_RESET,
145 145
 146 /* Variants that set force_root before issuing the bus reset */ 146 /* Variants that set force_root before issuing the bus reset */
147 LONG_RESET_FORCE_ROOT, SHORT_RESET_FORCE_ROOT, 147 LONG_RESET_FORCE_ROOT, SHORT_RESET_FORCE_ROOT,
@@ -159,22 +159,22 @@ struct hpsb_host_driver {
159 * reads to the ConfigROM on its own. */ 159 * reads to the ConfigROM on its own. */
160 void (*set_hw_config_rom) (struct hpsb_host *host, quadlet_t *config_rom); 160 void (*set_hw_config_rom) (struct hpsb_host *host, quadlet_t *config_rom);
161 161
162 /* This function shall implement packet transmission based on 162 /* This function shall implement packet transmission based on
163 * packet->type. It shall CRC both parts of the packet (unless 163 * packet->type. It shall CRC both parts of the packet (unless
164 * packet->type == raw) and do byte-swapping as necessary or instruct 164 * packet->type == raw) and do byte-swapping as necessary or instruct
165 * the hardware to do so. It can return immediately after the packet 165 * the hardware to do so. It can return immediately after the packet
166 * was queued for sending. After sending, hpsb_sent_packet() has to be 166 * was queued for sending. After sending, hpsb_sent_packet() has to be
167 * called. Return 0 on success, negative errno on failure. 167 * called. Return 0 on success, negative errno on failure.
168 * NOTE: The function must be callable in interrupt context. 168 * NOTE: The function must be callable in interrupt context.
169 */ 169 */
170 int (*transmit_packet) (struct hpsb_host *host, 170 int (*transmit_packet) (struct hpsb_host *host,
171 struct hpsb_packet *packet); 171 struct hpsb_packet *packet);
172 172
 173 /* This function requests miscellaneous services from the driver, see 173 /* This function requests miscellaneous services from the driver, see
174 * above for command codes and expected actions. Return -1 for unknown 174 * above for command codes and expected actions. Return -1 for unknown
175 * command, though that should never happen. 175 * command, though that should never happen.
176 */ 176 */
177 int (*devctl) (struct hpsb_host *host, enum devctl_cmd command, int arg); 177 int (*devctl) (struct hpsb_host *host, enum devctl_cmd command, int arg);
178 178
179 /* ISO transmission/reception functions. Return 0 on success, -1 179 /* ISO transmission/reception functions. Return 0 on success, -1
180 * (or -EXXX errno code) on failure. If the low-level driver does not 180 * (or -EXXX errno code) on failure. If the low-level driver does not
@@ -182,15 +182,15 @@ struct hpsb_host_driver {
182 */ 182 */
183 int (*isoctl) (struct hpsb_iso *iso, enum isoctl_cmd command, unsigned long arg); 183 int (*isoctl) (struct hpsb_iso *iso, enum isoctl_cmd command, unsigned long arg);
184 184
185 /* This function is mainly to redirect local CSR reads/locks to the iso 185 /* This function is mainly to redirect local CSR reads/locks to the iso
186 * management registers (bus manager id, bandwidth available, channels 186 * management registers (bus manager id, bandwidth available, channels
187 * available) to the hardware registers in OHCI. reg is 0,1,2,3 for bus 187 * available) to the hardware registers in OHCI. reg is 0,1,2,3 for bus
188 * mgr, bwdth avail, ch avail hi, ch avail lo respectively (the same ids 188 * mgr, bwdth avail, ch avail hi, ch avail lo respectively (the same ids
189 * as OHCI uses). data and compare are the new data and expected data 189 * as OHCI uses). data and compare are the new data and expected data
190 * respectively, return value is the old value. 190 * respectively, return value is the old value.
191 */ 191 */
192 quadlet_t (*hw_csr_reg) (struct hpsb_host *host, int reg, 192 quadlet_t (*hw_csr_reg) (struct hpsb_host *host, int reg,
193 quadlet_t data, quadlet_t compare); 193 quadlet_t data, quadlet_t compare);
194}; 194};
195 195
196 196
diff --git a/drivers/ieee1394/ieee1394-ioctl.h b/drivers/ieee1394/ieee1394-ioctl.h
index f92b566363d5..156703986348 100644
--- a/drivers/ieee1394/ieee1394-ioctl.h
+++ b/drivers/ieee1394/ieee1394-ioctl.h
@@ -7,14 +7,6 @@
7#include <linux/ioctl.h> 7#include <linux/ioctl.h>
8#include <linux/types.h> 8#include <linux/types.h>
9 9
10
11/* AMDTP Gets 6 */
12#define AMDTP_IOC_CHANNEL _IOW('#', 0x00, struct amdtp_ioctl)
13#define AMDTP_IOC_PLUG _IOW('#', 0x01, struct amdtp_ioctl)
14#define AMDTP_IOC_PING _IOW('#', 0x02, struct amdtp_ioctl)
15#define AMDTP_IOC_ZAP _IO ('#', 0x03)
16
17
18/* DV1394 Gets 10 */ 10/* DV1394 Gets 10 */
19 11
20/* Get the driver ready to transmit video. pass a struct dv1394_init* as 12/* Get the driver ready to transmit video. pass a struct dv1394_init* as
diff --git a/drivers/ieee1394/ieee1394.h b/drivers/ieee1394/ieee1394.h
index b634a9bb365c..936d776de00a 100644
--- a/drivers/ieee1394/ieee1394.h
+++ b/drivers/ieee1394/ieee1394.h
@@ -62,6 +62,7 @@
62extern const char *hpsb_speedto_str[]; 62extern const char *hpsb_speedto_str[];
63 63
64 64
65/* 1394a cable PHY packets */
65#define SELFID_PWRCL_NO_POWER 0x0 66#define SELFID_PWRCL_NO_POWER 0x0
66#define SELFID_PWRCL_PROVIDE_15W 0x1 67#define SELFID_PWRCL_PROVIDE_15W 0x1
67#define SELFID_PWRCL_PROVIDE_30W 0x2 68#define SELFID_PWRCL_PROVIDE_30W 0x2
@@ -76,8 +77,24 @@ extern const char *hpsb_speedto_str[];
76#define SELFID_PORT_NCONN 0x1 77#define SELFID_PORT_NCONN 0x1
77#define SELFID_PORT_NONE 0x0 78#define SELFID_PORT_NONE 0x0
78 79
80#define PHYPACKET_LINKON 0x40000000
81#define PHYPACKET_PHYCONFIG_R 0x00800000
82#define PHYPACKET_PHYCONFIG_T 0x00400000
83#define EXTPHYPACKET_TYPE_PING 0x00000000
84#define EXTPHYPACKET_TYPE_REMOTEACCESS_BASE 0x00040000
85#define EXTPHYPACKET_TYPE_REMOTEACCESS_PAGED 0x00140000
86#define EXTPHYPACKET_TYPE_REMOTEREPLY_BASE 0x000C0000
87#define EXTPHYPACKET_TYPE_REMOTEREPLY_PAGED 0x001C0000
88#define EXTPHYPACKET_TYPE_REMOTECOMMAND 0x00200000
89#define EXTPHYPACKET_TYPE_REMOTECONFIRMATION 0x00280000
90#define EXTPHYPACKET_TYPE_RESUME 0x003C0000
79 91
80/* 1394a PHY bitmasks */ 92#define EXTPHYPACKET_TYPEMASK 0xC0FC0000
93
94#define PHYPACKET_PORT_SHIFT 24
95#define PHYPACKET_GAPCOUNT_SHIFT 16
96
97/* 1394a PHY register map bitmasks */
81#define PHY_00_PHYSICAL_ID 0xFC 98#define PHY_00_PHYSICAL_ID 0xFC
82#define PHY_00_R 0x02 /* Root */ 99#define PHY_00_R 0x02 /* Root */
83#define PHY_00_PS 0x01 /* Power Status*/ 100#define PHY_00_PS 0x01 /* Power Status*/
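The PHYPACKET_* additions name the cable PHY packet layout that the ieee1394_core.c hunk below starts using. For example, a PHY configuration quadlet that forces node 2 to root and programs a gap count of 0x3f would be assembled as (values illustrative):

    quadlet_t d = 0;

    d |= PHYPACKET_PHYCONFIG_R | (2 << PHYPACKET_PORT_SHIFT);          /* R bit + root ID */
    d |= PHYPACKET_PHYCONFIG_T | (0x3f << PHYPACKET_GAPCOUNT_SHIFT);   /* T bit + gap count */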
diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index 32a1e016c85e..25ef5a86f5f0 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -179,34 +179,34 @@ void hpsb_free_packet(struct hpsb_packet *packet)
179 179
180int hpsb_reset_bus(struct hpsb_host *host, int type) 180int hpsb_reset_bus(struct hpsb_host *host, int type)
181{ 181{
182 if (!host->in_bus_reset) { 182 if (!host->in_bus_reset) {
183 host->driver->devctl(host, RESET_BUS, type); 183 host->driver->devctl(host, RESET_BUS, type);
184 return 0; 184 return 0;
185 } else { 185 } else {
186 return 1; 186 return 1;
187 } 187 }
188} 188}
189 189
190 190
191int hpsb_bus_reset(struct hpsb_host *host) 191int hpsb_bus_reset(struct hpsb_host *host)
192{ 192{
193 if (host->in_bus_reset) { 193 if (host->in_bus_reset) {
194 HPSB_NOTICE("%s called while bus reset already in progress", 194 HPSB_NOTICE("%s called while bus reset already in progress",
195 __FUNCTION__); 195 __FUNCTION__);
196 return 1; 196 return 1;
197 } 197 }
198 198
199 abort_requests(host); 199 abort_requests(host);
200 host->in_bus_reset = 1; 200 host->in_bus_reset = 1;
201 host->irm_id = -1; 201 host->irm_id = -1;
202 host->is_irm = 0; 202 host->is_irm = 0;
203 host->busmgr_id = -1; 203 host->busmgr_id = -1;
204 host->is_busmgr = 0; 204 host->is_busmgr = 0;
205 host->is_cycmst = 0; 205 host->is_cycmst = 0;
206 host->node_count = 0; 206 host->node_count = 0;
207 host->selfid_count = 0; 207 host->selfid_count = 0;
208 208
209 return 0; 209 return 0;
210} 210}
211 211
212 212
@@ -216,150 +216,156 @@ int hpsb_bus_reset(struct hpsb_host *host)
216 */ 216 */
217static int check_selfids(struct hpsb_host *host) 217static int check_selfids(struct hpsb_host *host)
218{ 218{
219 int nodeid = -1; 219 int nodeid = -1;
220 int rest_of_selfids = host->selfid_count; 220 int rest_of_selfids = host->selfid_count;
221 struct selfid *sid = (struct selfid *)host->topology_map; 221 struct selfid *sid = (struct selfid *)host->topology_map;
222 struct ext_selfid *esid; 222 struct ext_selfid *esid;
223 int esid_seq = 23; 223 int esid_seq = 23;
224 224
225 host->nodes_active = 0; 225 host->nodes_active = 0;
226 226
227 while (rest_of_selfids--) { 227 while (rest_of_selfids--) {
228 if (!sid->extended) { 228 if (!sid->extended) {
229 nodeid++; 229 nodeid++;
230 esid_seq = 0; 230 esid_seq = 0;
231 231
232 if (sid->phy_id != nodeid) { 232 if (sid->phy_id != nodeid) {
233 HPSB_INFO("SelfIDs failed monotony check with " 233 HPSB_INFO("SelfIDs failed monotony check with "
234 "%d", sid->phy_id); 234 "%d", sid->phy_id);
235 return 0; 235 return 0;
236 } 236 }
237 237
238 if (sid->link_active) { 238 if (sid->link_active) {
239 host->nodes_active++; 239 host->nodes_active++;
240 if (sid->contender) 240 if (sid->contender)
241 host->irm_id = LOCAL_BUS | sid->phy_id; 241 host->irm_id = LOCAL_BUS | sid->phy_id;
242 } 242 }
243 } else { 243 } else {
244 esid = (struct ext_selfid *)sid; 244 esid = (struct ext_selfid *)sid;
245 245
246 if ((esid->phy_id != nodeid) 246 if ((esid->phy_id != nodeid)
247 || (esid->seq_nr != esid_seq)) { 247 || (esid->seq_nr != esid_seq)) {
248 HPSB_INFO("SelfIDs failed monotony check with " 248 HPSB_INFO("SelfIDs failed monotony check with "
249 "%d/%d", esid->phy_id, esid->seq_nr); 249 "%d/%d", esid->phy_id, esid->seq_nr);
250 return 0; 250 return 0;
251 } 251 }
252 esid_seq++; 252 esid_seq++;
253 } 253 }
254 sid++; 254 sid++;
255 } 255 }
256 256
257 esid = (struct ext_selfid *)(sid - 1); 257 esid = (struct ext_selfid *)(sid - 1);
258 while (esid->extended) { 258 while (esid->extended) {
259 if ((esid->porta == 0x2) || (esid->portb == 0x2) 259 if ((esid->porta == SELFID_PORT_PARENT) ||
260 || (esid->portc == 0x2) || (esid->portd == 0x2) 260 (esid->portb == SELFID_PORT_PARENT) ||
261 || (esid->porte == 0x2) || (esid->portf == 0x2) 261 (esid->portc == SELFID_PORT_PARENT) ||
262 || (esid->portg == 0x2) || (esid->porth == 0x2)) { 262 (esid->portd == SELFID_PORT_PARENT) ||
263 (esid->porte == SELFID_PORT_PARENT) ||
264 (esid->portf == SELFID_PORT_PARENT) ||
265 (esid->portg == SELFID_PORT_PARENT) ||
266 (esid->porth == SELFID_PORT_PARENT)) {
263 HPSB_INFO("SelfIDs failed root check on " 267 HPSB_INFO("SelfIDs failed root check on "
264 "extended SelfID"); 268 "extended SelfID");
265 return 0; 269 return 0;
266 } 270 }
267 esid--; 271 esid--;
268 } 272 }
269 273
270 sid = (struct selfid *)esid; 274 sid = (struct selfid *)esid;
271 if ((sid->port0 == 0x2) || (sid->port1 == 0x2) || (sid->port2 == 0x2)) { 275 if ((sid->port0 == SELFID_PORT_PARENT) ||
276 (sid->port1 == SELFID_PORT_PARENT) ||
277 (sid->port2 == SELFID_PORT_PARENT)) {
272 HPSB_INFO("SelfIDs failed root check"); 278 HPSB_INFO("SelfIDs failed root check");
273 return 0; 279 return 0;
274 } 280 }
275 281
276 host->node_count = nodeid + 1; 282 host->node_count = nodeid + 1;
277 return 1; 283 return 1;
278} 284}
279 285
280static void build_speed_map(struct hpsb_host *host, int nodecount) 286static void build_speed_map(struct hpsb_host *host, int nodecount)
281{ 287{
282 u8 speedcap[nodecount]; 288 u8 speedcap[nodecount];
283 u8 cldcnt[nodecount]; 289 u8 cldcnt[nodecount];
284 u8 *map = host->speed_map; 290 u8 *map = host->speed_map;
285 struct selfid *sid; 291 struct selfid *sid;
286 struct ext_selfid *esid; 292 struct ext_selfid *esid;
287 int i, j, n; 293 int i, j, n;
288 294
289 for (i = 0; i < (nodecount * 64); i += 64) { 295 for (i = 0; i < (nodecount * 64); i += 64) {
290 for (j = 0; j < nodecount; j++) { 296 for (j = 0; j < nodecount; j++) {
291 map[i+j] = IEEE1394_SPEED_MAX; 297 map[i+j] = IEEE1394_SPEED_MAX;
292 } 298 }
293 } 299 }
294 300
295 for (i = 0; i < nodecount; i++) { 301 for (i = 0; i < nodecount; i++) {
296 cldcnt[i] = 0; 302 cldcnt[i] = 0;
297 } 303 }
298 304
299 /* find direct children count and speed */ 305 /* find direct children count and speed */
300 for (sid = (struct selfid *)&host->topology_map[host->selfid_count-1], 306 for (sid = (struct selfid *)&host->topology_map[host->selfid_count-1],
301 n = nodecount - 1; 307 n = nodecount - 1;
302 (void *)sid >= (void *)host->topology_map; sid--) { 308 (void *)sid >= (void *)host->topology_map; sid--) {
303 if (sid->extended) { 309 if (sid->extended) {
304 esid = (struct ext_selfid *)sid; 310 esid = (struct ext_selfid *)sid;
305 311
306 if (esid->porta == 0x3) cldcnt[n]++; 312 if (esid->porta == SELFID_PORT_CHILD) cldcnt[n]++;
307 if (esid->portb == 0x3) cldcnt[n]++; 313 if (esid->portb == SELFID_PORT_CHILD) cldcnt[n]++;
308 if (esid->portc == 0x3) cldcnt[n]++; 314 if (esid->portc == SELFID_PORT_CHILD) cldcnt[n]++;
309 if (esid->portd == 0x3) cldcnt[n]++; 315 if (esid->portd == SELFID_PORT_CHILD) cldcnt[n]++;
310 if (esid->porte == 0x3) cldcnt[n]++; 316 if (esid->porte == SELFID_PORT_CHILD) cldcnt[n]++;
311 if (esid->portf == 0x3) cldcnt[n]++; 317 if (esid->portf == SELFID_PORT_CHILD) cldcnt[n]++;
312 if (esid->portg == 0x3) cldcnt[n]++; 318 if (esid->portg == SELFID_PORT_CHILD) cldcnt[n]++;
313 if (esid->porth == 0x3) cldcnt[n]++; 319 if (esid->porth == SELFID_PORT_CHILD) cldcnt[n]++;
314 } else { 320 } else {
315 if (sid->port0 == 0x3) cldcnt[n]++; 321 if (sid->port0 == SELFID_PORT_CHILD) cldcnt[n]++;
316 if (sid->port1 == 0x3) cldcnt[n]++; 322 if (sid->port1 == SELFID_PORT_CHILD) cldcnt[n]++;
317 if (sid->port2 == 0x3) cldcnt[n]++; 323 if (sid->port2 == SELFID_PORT_CHILD) cldcnt[n]++;
318 324
319 speedcap[n] = sid->speed; 325 speedcap[n] = sid->speed;
320 n--; 326 n--;
321 } 327 }
322 } 328 }
323 329
324 /* set self mapping */ 330 /* set self mapping */
325 for (i = 0; i < nodecount; i++) { 331 for (i = 0; i < nodecount; i++) {
326 map[64*i + i] = speedcap[i]; 332 map[64*i + i] = speedcap[i];
327 } 333 }
328 334
329 /* fix up direct children count to total children count; 335 /* fix up direct children count to total children count;
330 * also fix up speedcaps for sibling and parent communication */ 336 * also fix up speedcaps for sibling and parent communication */
331 for (i = 1; i < nodecount; i++) { 337 for (i = 1; i < nodecount; i++) {
332 for (j = cldcnt[i], n = i - 1; j > 0; j--) { 338 for (j = cldcnt[i], n = i - 1; j > 0; j--) {
333 cldcnt[i] += cldcnt[n]; 339 cldcnt[i] += cldcnt[n];
334 speedcap[n] = min(speedcap[n], speedcap[i]); 340 speedcap[n] = min(speedcap[n], speedcap[i]);
335 n -= cldcnt[n] + 1; 341 n -= cldcnt[n] + 1;
336 } 342 }
337 } 343 }
338 344
339 for (n = 0; n < nodecount; n++) { 345 for (n = 0; n < nodecount; n++) {
340 for (i = n - cldcnt[n]; i <= n; i++) { 346 for (i = n - cldcnt[n]; i <= n; i++) {
341 for (j = 0; j < (n - cldcnt[n]); j++) { 347 for (j = 0; j < (n - cldcnt[n]); j++) {
342 map[j*64 + i] = map[i*64 + j] = 348 map[j*64 + i] = map[i*64 + j] =
343 min(map[i*64 + j], speedcap[n]); 349 min(map[i*64 + j], speedcap[n]);
344 } 350 }
345 for (j = n + 1; j < nodecount; j++) { 351 for (j = n + 1; j < nodecount; j++) {
346 map[j*64 + i] = map[i*64 + j] = 352 map[j*64 + i] = map[i*64 + j] =
347 min(map[i*64 + j], speedcap[n]); 353 min(map[i*64 + j], speedcap[n]);
348 } 354 }
349 } 355 }
350 } 356 }
351} 357}
352 358
353 359
354void hpsb_selfid_received(struct hpsb_host *host, quadlet_t sid) 360void hpsb_selfid_received(struct hpsb_host *host, quadlet_t sid)
355{ 361{
356 if (host->in_bus_reset) { 362 if (host->in_bus_reset) {
357 HPSB_VERBOSE("Including SelfID 0x%x", sid); 363 HPSB_VERBOSE("Including SelfID 0x%x", sid);
358 host->topology_map[host->selfid_count++] = sid; 364 host->topology_map[host->selfid_count++] = sid;
359 } else { 365 } else {
360 HPSB_NOTICE("Spurious SelfID packet (0x%08x) received from bus %d", 366 HPSB_NOTICE("Spurious SelfID packet (0x%08x) received from bus %d",
361 sid, NODEID_TO_BUS(host->node_id)); 367 sid, NODEID_TO_BUS(host->node_id));
362 } 368 }
363} 369}
364 370
365void hpsb_selfid_complete(struct hpsb_host *host, int phyid, int isroot) 371void hpsb_selfid_complete(struct hpsb_host *host, int phyid, int isroot)
@@ -367,50 +373,50 @@ void hpsb_selfid_complete(struct hpsb_host *host, int phyid, int isroot)
367 if (!host->in_bus_reset) 373 if (!host->in_bus_reset)
368 HPSB_NOTICE("SelfID completion called outside of bus reset!"); 374 HPSB_NOTICE("SelfID completion called outside of bus reset!");
369 375
370 host->node_id = LOCAL_BUS | phyid; 376 host->node_id = LOCAL_BUS | phyid;
371 host->is_root = isroot; 377 host->is_root = isroot;
372 378
373 if (!check_selfids(host)) { 379 if (!check_selfids(host)) {
374 if (host->reset_retries++ < 20) { 380 if (host->reset_retries++ < 20) {
375 /* selfid stage did not complete without error */ 381 /* selfid stage did not complete without error */
376 HPSB_NOTICE("Error in SelfID stage, resetting"); 382 HPSB_NOTICE("Error in SelfID stage, resetting");
377 host->in_bus_reset = 0; 383 host->in_bus_reset = 0;
378 /* this should work from ohci1394 now... */ 384 /* this should work from ohci1394 now... */
379 hpsb_reset_bus(host, LONG_RESET); 385 hpsb_reset_bus(host, LONG_RESET);
380 return; 386 return;
381 } else { 387 } else {
382 HPSB_NOTICE("Stopping out-of-control reset loop"); 388 HPSB_NOTICE("Stopping out-of-control reset loop");
383 HPSB_NOTICE("Warning - topology map and speed map will not be valid"); 389 HPSB_NOTICE("Warning - topology map and speed map will not be valid");
384 host->reset_retries = 0; 390 host->reset_retries = 0;
385 } 391 }
386 } else { 392 } else {
387 host->reset_retries = 0; 393 host->reset_retries = 0;
388 build_speed_map(host, host->node_count); 394 build_speed_map(host, host->node_count);
389 } 395 }
390 396
391 HPSB_VERBOSE("selfid_complete called with successful SelfID stage " 397 HPSB_VERBOSE("selfid_complete called with successful SelfID stage "
392 "... irm_id: 0x%X node_id: 0x%X",host->irm_id,host->node_id); 398 "... irm_id: 0x%X node_id: 0x%X",host->irm_id,host->node_id);
393 399
394 /* irm_id is kept up to date by check_selfids() */ 400 /* irm_id is kept up to date by check_selfids() */
395 if (host->irm_id == host->node_id) { 401 if (host->irm_id == host->node_id) {
396 host->is_irm = 1; 402 host->is_irm = 1;
397 } else { 403 } else {
398 host->is_busmgr = 0; 404 host->is_busmgr = 0;
399 host->is_irm = 0; 405 host->is_irm = 0;
400 } 406 }
401 407
402 if (isroot) { 408 if (isroot) {
403 host->driver->devctl(host, ACT_CYCLE_MASTER, 1); 409 host->driver->devctl(host, ACT_CYCLE_MASTER, 1);
404 host->is_cycmst = 1; 410 host->is_cycmst = 1;
405 } 411 }
406 atomic_inc(&host->generation); 412 atomic_inc(&host->generation);
407 host->in_bus_reset = 0; 413 host->in_bus_reset = 0;
408 highlevel_host_reset(host); 414 highlevel_host_reset(host);
409} 415}
410 416
411 417
412void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet, 418void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet,
413 int ackcode) 419 int ackcode)
414{ 420{
415 unsigned long flags; 421 unsigned long flags;
416 422
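build_speed_map() above fills host->speed_map, a 64x64 matrix flattened row-major, where entry [64*i + j] is the fastest speed nodes i and j can use once every hop between them is accounted for. Reading it back is a single index, exactly as the hpsb_send_packet() hunk further down does:

    /* lowest common speed between the local node and the target */
    packet->speed_code =
            host->speed_map[NODEID_TO_NODE(host->node_id) * 64
                            + NODEID_TO_NODE(packet->node_id)];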
@@ -457,6 +463,7 @@ void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet,
457int hpsb_send_phy_config(struct hpsb_host *host, int rootid, int gapcnt) 463int hpsb_send_phy_config(struct hpsb_host *host, int rootid, int gapcnt)
458{ 464{
459 struct hpsb_packet *packet; 465 struct hpsb_packet *packet;
466 quadlet_t d = 0;
460 int retval = 0; 467 int retval = 0;
461 468
462 if (rootid >= ALL_NODES || rootid < -1 || gapcnt > 0x3f || gapcnt < -1 || 469 if (rootid >= ALL_NODES || rootid < -1 || gapcnt > 0x3f || gapcnt < -1 ||
@@ -466,26 +473,16 @@ int hpsb_send_phy_config(struct hpsb_host *host, int rootid, int gapcnt)
 		return -EINVAL;
 	}

-	packet = hpsb_alloc_packet(0);
-	if (!packet)
-		return -ENOMEM;
-
-	packet->host = host;
-	packet->header_size = 8;
-	packet->data_size = 0;
-	packet->expect_response = 0;
-	packet->no_waiter = 0;
-	packet->type = hpsb_raw;
-	packet->header[0] = 0;
 	if (rootid != -1)
-		packet->header[0] |= rootid << 24 | 1 << 23;
+		d |= PHYPACKET_PHYCONFIG_R | rootid << PHYPACKET_PORT_SHIFT;
 	if (gapcnt != -1)
-		packet->header[0] |= gapcnt << 16 | 1 << 22;
+		d |= PHYPACKET_PHYCONFIG_T | gapcnt << PHYPACKET_GAPCOUNT_SHIFT;

-	packet->header[1] = ~packet->header[0];
+	packet = hpsb_make_phypacket(host, d);
+	if (!packet)
+		return -ENOMEM;

 	packet->generation = get_hpsb_generation(host);
-
 	retval = hpsb_send_packet_and_wait(packet);
 	hpsb_free_packet(packet);

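
A minimal sketch of how the quadlet d above is composed, for readers following
the new hpsb_make_phypacket() path. The PHYPACKET_* values below are
assumptions inferred from the open-coded masks this hunk removes
(rootid << 24 | 1 << 23 and gapcnt << 16 | 1 << 22); the driver's real
definitions live in its headers:

    #include <stdint.h>

    #define PHYPACKET_PHYCONFIG_R    0x00800000  /* assumed: "force root" flag (1 << 23) */
    #define PHYPACKET_PHYCONFIG_T    0x00400000  /* assumed: "set gap count" flag (1 << 22) */
    #define PHYPACKET_PORT_SHIFT     24          /* assumed: root ID position */
    #define PHYPACKET_GAPCOUNT_SHIFT 16          /* assumed: gap count position */

    static uint32_t make_phyconfig_quadlet(int rootid, int gapcnt)
    {
        uint32_t d = 0;

        if (rootid != -1)   /* -1 means "leave the root setting alone" */
            d |= PHYPACKET_PHYCONFIG_R | (uint32_t)rootid << PHYPACKET_PORT_SHIFT;
        if (gapcnt != -1)   /* -1 means "leave the gap count alone" */
            d |= PHYPACKET_PHYCONFIG_T | (uint32_t)gapcnt << PHYPACKET_GAPCOUNT_SHIFT;

        /* fill_phy_packet() (see ieee1394_transactions.c below) then stores
         * d in header[0] and its bitwise inverse ~d in header[1], matching
         * the PHY packet check-quadlet format. */
        return d;
    }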
@@ -510,13 +507,13 @@ int hpsb_send_packet(struct hpsb_packet *packet)
 {
 	struct hpsb_host *host = packet->host;

 	if (host->is_shutdown)
 		return -EINVAL;
 	if (host->in_bus_reset ||
 	    (packet->generation != get_hpsb_generation(host)))
 		return -EAGAIN;

 	packet->state = hpsb_queued;

 	/* This just seems silly to me */
 	WARN_ON(packet->no_waiter && packet->expect_response);
@@ -530,42 +527,42 @@ int hpsb_send_packet(struct hpsb_packet *packet)
 		skb_queue_tail(&host->pending_packet_queue, packet->skb);
 	}

 	if (packet->node_id == host->node_id) {
 		/* it is a local request, so handle it locally */

 		quadlet_t *data;
 		size_t size = packet->data_size + packet->header_size;

 		data = kmalloc(size, GFP_ATOMIC);
 		if (!data) {
 			HPSB_ERR("unable to allocate memory for concatenating header and data");
 			return -ENOMEM;
 		}

 		memcpy(data, packet->header, packet->header_size);

 		if (packet->data_size)
 			memcpy(((u8*)data) + packet->header_size, packet->data, packet->data_size);

 		dump_packet("send packet local", packet->header, packet->header_size, -1);

 		hpsb_packet_sent(host, packet, packet->expect_response ? ACK_PENDING : ACK_COMPLETE);
 		hpsb_packet_received(host, data, size, 0);

 		kfree(data);

 		return 0;
 	}

 	if (packet->type == hpsb_async && packet->node_id != ALL_NODES) {
 		packet->speed_code =
 			host->speed_map[NODEID_TO_NODE(host->node_id) * 64
 					+ NODEID_TO_NODE(packet->node_id)];
 	}

 	dump_packet("send packet", packet->header, packet->header_size, packet->speed_code);

 	return host->driver->transmit_packet(host, packet);
 }

 /* We could just use complete() directly as the packet complete
@@ -593,81 +590,81 @@ int hpsb_send_packet_and_wait(struct hpsb_packet *packet)

 static void send_packet_nocare(struct hpsb_packet *packet)
 {
 	if (hpsb_send_packet(packet) < 0) {
 		hpsb_free_packet(packet);
 	}
 }


 static void handle_packet_response(struct hpsb_host *host, int tcode,
 				   quadlet_t *data, size_t size)
 {
 	struct hpsb_packet *packet = NULL;
 	struct sk_buff *skb;
 	int tcode_match = 0;
 	int tlabel;
 	unsigned long flags;

 	tlabel = (data[0] >> 10) & 0x3f;

 	spin_lock_irqsave(&host->pending_packet_queue.lock, flags);

 	skb_queue_walk(&host->pending_packet_queue, skb) {
 		packet = (struct hpsb_packet *)skb->data;
 		if ((packet->tlabel == tlabel)
 		    && (packet->node_id == (data[1] >> 16))) {
 			break;
 		}

 		packet = NULL;
 	}

 	if (packet == NULL) {
 		HPSB_DEBUG("unsolicited response packet received - no tlabel match");
 		dump_packet("contents", data, 16, -1);
 		spin_unlock_irqrestore(&host->pending_packet_queue.lock, flags);
 		return;
 	}

 	switch (packet->tcode) {
 	case TCODE_WRITEQ:
 	case TCODE_WRITEB:
 		if (tcode != TCODE_WRITE_RESPONSE)
 			break;
 		tcode_match = 1;
 		memcpy(packet->header, data, 12);
 		break;
 	case TCODE_READQ:
 		if (tcode != TCODE_READQ_RESPONSE)
 			break;
 		tcode_match = 1;
 		memcpy(packet->header, data, 16);
 		break;
 	case TCODE_READB:
 		if (tcode != TCODE_READB_RESPONSE)
 			break;
 		tcode_match = 1;
 		BUG_ON(packet->skb->len - sizeof(*packet) < size - 16);
 		memcpy(packet->header, data, 16);
 		memcpy(packet->data, data + 4, size - 16);
 		break;
 	case TCODE_LOCK_REQUEST:
 		if (tcode != TCODE_LOCK_RESPONSE)
 			break;
 		tcode_match = 1;
 		size = min((size - 16), (size_t)8);
 		BUG_ON(packet->skb->len - sizeof(*packet) < size);
 		memcpy(packet->header, data, 16);
 		memcpy(packet->data, data + 4, size);
 		break;
 	}

 	if (!tcode_match) {
 		spin_unlock_irqrestore(&host->pending_packet_queue.lock, flags);
 		HPSB_INFO("unsolicited response packet received - tcode mismatch");
 		dump_packet("contents", data, 16, -1);
 		return;
 	}

 	__skb_unlink(skb, &host->pending_packet_queue);

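
The matching logic above keys on fields packed into the first two header
quadlets of the received response. A small sketch of the decoding, using only
the shift/mask expressions visible in this file (tlabel in bits 10-15 of
data[0], source node ID in the top half of data[1], and, as
hpsb_packet_received() below shows, tcode in bits 4-7 of data[0]):

    #include <stdint.h>

    struct async_hdr_fields {
        uint16_t dest;    /* data[0] >> 16: destination node ID */
        uint8_t  tlabel;  /* bits 10..15:  transaction label    */
        uint8_t  tcode;   /* bits 4..7:    transaction code     */
        uint16_t source;  /* data[1] >> 16: source node ID      */
    };

    static struct async_hdr_fields decode_async_header(const uint32_t *data)
    {
        struct async_hdr_fields f;

        f.dest   = data[0] >> 16;
        f.tlabel = (data[0] >> 10) & 0x3f;
        f.tcode  = (data[0] >> 4) & 0xf;
        f.source = data[1] >> 16;
        return f;
    }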
@@ -686,27 +683,27 @@ static void handle_packet_response(struct hpsb_host *host, int tcode,
 static struct hpsb_packet *create_reply_packet(struct hpsb_host *host,
 					       quadlet_t *data, size_t dsize)
 {
 	struct hpsb_packet *p;

 	p = hpsb_alloc_packet(dsize);
 	if (unlikely(p == NULL)) {
 		/* FIXME - send data_error response */
 		return NULL;
 	}

 	p->type = hpsb_async;
 	p->state = hpsb_unused;
 	p->host = host;
 	p->node_id = data[1] >> 16;
 	p->tlabel = (data[0] >> 10) & 0x3f;
 	p->no_waiter = 1;

 	p->generation = get_hpsb_generation(host);

 	if (dsize % 4)
 		p->data[dsize / 4] = 0;

 	return p;
 }

 #define PREP_ASYNC_HEAD_RCODE(tc) \
@@ -717,7 +714,7 @@ static struct hpsb_packet *create_reply_packet(struct hpsb_host *host,
 	packet->header[2] = 0

 static void fill_async_readquad_resp(struct hpsb_packet *packet, int rcode,
 				     quadlet_t data)
 {
 	PREP_ASYNC_HEAD_RCODE(TCODE_READQ_RESPONSE);
 	packet->header[3] = data;
@@ -726,7 +723,7 @@ static void fill_async_readquad_resp(struct hpsb_packet *packet, int rcode,
 }

 static void fill_async_readblock_resp(struct hpsb_packet *packet, int rcode,
 				      int length)
 {
 	if (rcode != RCODE_COMPLETE)
 		length = 0;
@@ -746,7 +743,7 @@ static void fill_async_write_resp(struct hpsb_packet *packet, int rcode)
 }

 static void fill_async_lock_resp(struct hpsb_packet *packet, int rcode, int extcode,
 				 int length)
 {
 	if (rcode != RCODE_COMPLETE)
 		length = 0;
@@ -758,184 +755,184 @@ static void fill_async_lock_resp(struct hpsb_packet *packet, int rcode, int extc
 }

 #define PREP_REPLY_PACKET(length) \
 	packet = create_reply_packet(host, data, length); \
 	if (packet == NULL) break

 static void handle_incoming_packet(struct hpsb_host *host, int tcode,
 				   quadlet_t *data, size_t size, int write_acked)
 {
 	struct hpsb_packet *packet;
 	int length, rcode, extcode;
 	quadlet_t buffer;
 	nodeid_t source = data[1] >> 16;
 	nodeid_t dest = data[0] >> 16;
 	u16 flags = (u16) data[0];
 	u64 addr;

 	/* big FIXME - no error checking is done for an out of bounds length */

 	switch (tcode) {
 	case TCODE_WRITEQ:
 		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
 		rcode = highlevel_write(host, source, dest, data+3,
 					addr, 4, flags);

 		if (!write_acked
 		    && (NODEID_TO_NODE(data[0] >> 16) != NODE_MASK)
 		    && (rcode >= 0)) {
 			/* not a broadcast write, reply */
 			PREP_REPLY_PACKET(0);
 			fill_async_write_resp(packet, rcode);
 			send_packet_nocare(packet);
 		}
 		break;

 	case TCODE_WRITEB:
 		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
 		rcode = highlevel_write(host, source, dest, data+4,
 					addr, data[3]>>16, flags);

 		if (!write_acked
 		    && (NODEID_TO_NODE(data[0] >> 16) != NODE_MASK)
 		    && (rcode >= 0)) {
 			/* not a broadcast write, reply */
 			PREP_REPLY_PACKET(0);
 			fill_async_write_resp(packet, rcode);
 			send_packet_nocare(packet);
 		}
 		break;

 	case TCODE_READQ:
 		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
 		rcode = highlevel_read(host, source, &buffer, addr, 4, flags);

 		if (rcode >= 0) {
 			PREP_REPLY_PACKET(0);
 			fill_async_readquad_resp(packet, rcode, buffer);
 			send_packet_nocare(packet);
 		}
 		break;

 	case TCODE_READB:
 		length = data[3] >> 16;
 		PREP_REPLY_PACKET(length);

 		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];
 		rcode = highlevel_read(host, source, packet->data, addr,
 				       length, flags);

 		if (rcode >= 0) {
 			fill_async_readblock_resp(packet, rcode, length);
 			send_packet_nocare(packet);
 		} else {
 			hpsb_free_packet(packet);
 		}
 		break;

 	case TCODE_LOCK_REQUEST:
 		length = data[3] >> 16;
 		extcode = data[3] & 0xffff;
 		addr = (((u64)(data[1] & 0xffff)) << 32) | data[2];

 		PREP_REPLY_PACKET(8);

 		if ((extcode == 0) || (extcode >= 7)) {
 			/* let switch default handle error */
 			length = 0;
 		}

 		switch (length) {
 		case 4:
 			rcode = highlevel_lock(host, source, packet->data, addr,
 					       data[4], 0, extcode,flags);
 			fill_async_lock_resp(packet, rcode, extcode, 4);
 			break;
 		case 8:
 			if ((extcode != EXTCODE_FETCH_ADD)
 			    && (extcode != EXTCODE_LITTLE_ADD)) {
 				rcode = highlevel_lock(host, source,
 						       packet->data, addr,
 						       data[5], data[4],
 						       extcode, flags);
 				fill_async_lock_resp(packet, rcode, extcode, 4);
 			} else {
 				rcode = highlevel_lock64(host, source,
 					     (octlet_t *)packet->data, addr,
 					     *(octlet_t *)(data + 4), 0ULL,
 					     extcode, flags);
 				fill_async_lock_resp(packet, rcode, extcode, 8);
 			}
 			break;
 		case 16:
 			rcode = highlevel_lock64(host, source,
 				     (octlet_t *)packet->data, addr,
 				     *(octlet_t *)(data + 6),
 				     *(octlet_t *)(data + 4),
 				     extcode, flags);
 			fill_async_lock_resp(packet, rcode, extcode, 8);
 			break;
 		default:
 			rcode = RCODE_TYPE_ERROR;
 			fill_async_lock_resp(packet, rcode,
 					     extcode, 0);
 		}

 		if (rcode >= 0) {
 			send_packet_nocare(packet);
 		} else {
 			hpsb_free_packet(packet);
 		}
 		break;
 	}

 }
 #undef PREP_REPLY_PACKET


 void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size,
 			  int write_acked)
 {
 	int tcode;

 	if (host->in_bus_reset) {
 		HPSB_INFO("received packet during reset; ignoring");
 		return;
 	}

 	dump_packet("received packet", data, size, -1);

 	tcode = (data[0] >> 4) & 0xf;

 	switch (tcode) {
 	case TCODE_WRITE_RESPONSE:
 	case TCODE_READQ_RESPONSE:
 	case TCODE_READB_RESPONSE:
 	case TCODE_LOCK_RESPONSE:
 		handle_packet_response(host, tcode, data, size);
 		break;

 	case TCODE_WRITEQ:
 	case TCODE_WRITEB:
 	case TCODE_READQ:
 	case TCODE_READB:
 	case TCODE_LOCK_REQUEST:
 		handle_incoming_packet(host, tcode, data, size, write_acked);
 		break;


 	case TCODE_ISO_DATA:
 		highlevel_iso_receive(host, data, size);
 		break;

 	case TCODE_CYCLE_START:
 		/* simply ignore this packet if it is passed on */
 		break;

 	default:
 		HPSB_NOTICE("received packet with bogus transaction code %d",
 			    tcode);
 		break;
 	}
 }


@@ -1030,10 +1027,10 @@ static int hpsbpkt_thread(void *__hi)

 	daemonize("khpsbpkt");

+	current->flags |= PF_NOFREEZE;
+
 	while (1) {
 		if (down_interruptible(&khpsbpkt_sig)) {
-			if (try_to_freeze())
-				continue;
 			printk("khpsbpkt: received unexpected signal?!\n" );
 			break;
 		}
@@ -1129,7 +1126,7 @@ static int __init ieee1394_init(void)
 	   nodemgr implements functionality required of ieee1394a-2000
 	   IRMs */
 	hpsb_disable_irm = 1;

 	return 0;
 }

diff --git a/drivers/ieee1394/ieee1394_core.h b/drivers/ieee1394/ieee1394_core.h
index 0b31429d0a68..b35466023f00 100644
--- a/drivers/ieee1394/ieee1394_core.h
+++ b/drivers/ieee1394/ieee1394_core.h
@@ -10,8 +10,8 @@


 struct hpsb_packet {
 	/* This struct is basically read-only for hosts with the exception of
 	 * the data buffer contents and xnext - see below. */

 	/* This can be used for host driver internal linking.
 	 *
@@ -21,47 +21,47 @@ struct hpsb_packet {
 	 * driver_list when free'ing it. */
 	struct list_head driver_list;

 	nodeid_t node_id;

 	/* Async and Iso types should be clear, raw means send-as-is, do not
 	 * CRC! Byte swapping shall still be done in this case. */
 	enum { hpsb_async, hpsb_iso, hpsb_raw } __attribute__((packed)) type;

 	/* Okay, this is core internal and a no care for hosts.
 	 * queued   = queued for sending
 	 * pending  = sent, waiting for response
 	 * complete = processing completed, successful or not
 	 */
 	enum {
 		hpsb_unused, hpsb_queued, hpsb_pending, hpsb_complete
 	} __attribute__((packed)) state;

 	/* These are core internal. */
 	signed char tlabel;
 	signed char ack_code;
 	unsigned char tcode;

 	unsigned expect_response:1;
 	unsigned no_waiter:1;

 	/* Speed to transmit with: 0 = 100Mbps, 1 = 200Mbps, 2 = 400Mbps */
 	unsigned speed_code:2;

 	/*
 	 * *header and *data are guaranteed to be 32-bit DMAable and may be
 	 * overwritten to allow in-place byte swapping. Neither of these is
 	 * CRCed (the sizes also don't include CRC), but contain space for at
 	 * least one additional quadlet to allow in-place CRCing. The memory is
 	 * also guaranteed to be DMA mappable.
 	 */
 	quadlet_t *header;
 	quadlet_t *data;
 	size_t header_size;
 	size_t data_size;


 	struct hpsb_host *host;
 	unsigned int generation;

 	atomic_t refcnt;

@@ -73,10 +73,10 @@ struct hpsb_packet {
 	/* XXX This is just a hack at the moment */
 	struct sk_buff *skb;

 	/* Store jiffies for implementing bus timeouts. */
 	unsigned long sendtime;

 	quadlet_t embedded_header[5];
 };

 /* Set a task for when a packet completes */
@@ -102,7 +102,7 @@ void hpsb_free_packet(struct hpsb_packet *packet);
  */
 static inline unsigned int get_hpsb_generation(struct hpsb_host *host)
 {
 	return atomic_read(&host->generation);
 }

 /*
@@ -157,7 +157,7 @@ void hpsb_selfid_complete(struct hpsb_host *host, int phyid, int isroot);
  * from within a transmit packet routine.
  */
 void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet,
 		      int ackcode);

 /*
  * Hand over received packet to the core. The contents of data are expected to
@@ -171,7 +171,7 @@ void hpsb_packet_sent(struct hpsb_host *host, struct hpsb_packet *packet,
  * packet type.
  */
 void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size,
 			  int write_acked);


 /*
@@ -197,20 +197,20 @@ void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size,
  * Block 15 (240-255) reserved for drivers under development, etc.
  */

 #define IEEE1394_MAJOR			171

 #define IEEE1394_MINOR_BLOCK_RAW1394	0
 #define IEEE1394_MINOR_BLOCK_VIDEO1394	1
 #define IEEE1394_MINOR_BLOCK_DV1394	2
 #define IEEE1394_MINOR_BLOCK_AMDTP	3
 #define IEEE1394_MINOR_BLOCK_EXPERIMENTAL 15

 #define IEEE1394_CORE_DEV	  MKDEV(IEEE1394_MAJOR, 0)
 #define IEEE1394_RAW1394_DEV	  MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16)
 #define IEEE1394_VIDEO1394_DEV	  MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_VIDEO1394 * 16)
 #define IEEE1394_DV1394_DEV	  MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_DV1394 * 16)
 #define IEEE1394_AMDTP_DEV	  MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_AMDTP * 16)
 #define IEEE1394_EXPERIMENTAL_DEV MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_EXPERIMENTAL * 16)

 /* return the index (within a minor number block) of a file */
 static inline unsigned char ieee1394_file_to_instance(struct file *file)
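
A sketch of how these constants combine into character device numbers: each
minor block owns 16 minors starting at block * 16, so major 171, minor 16 is
the first video1394 device. This is an illustration only; the interpretation
of "index within a block" as minor % 16 is an assumption, since the body of
ieee1394_file_to_instance() lies outside this hunk:

    #include <stdio.h>

    #define IEEE1394_MAJOR 171

    /* hypothetical helper for illustration, not part of the driver */
    static void show_dev(const char *name, int block, int instance)
    {
        int minor = block * 16 + instance;
        printf("%-10s -> char dev %d:%d (index %d in block %d)\n",
               name, IEEE1394_MAJOR, minor, minor % 16, block);
    }

    int main(void)
    {
        show_dev("raw1394",   0, 0);  /* IEEE1394_MINOR_BLOCK_RAW1394   */
        show_dev("video1394", 1, 0);  /* IEEE1394_MINOR_BLOCK_VIDEO1394 */
        show_dev("dv1394",    2, 0);  /* IEEE1394_MINOR_BLOCK_DV1394    */
        return 0;
    }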
diff --git a/drivers/ieee1394/ieee1394_transactions.c b/drivers/ieee1394/ieee1394_transactions.c
index 0aa876360f9b..3fe2f6c4a253 100644
--- a/drivers/ieee1394/ieee1394_transactions.c
+++ b/drivers/ieee1394/ieee1394_transactions.c
@@ -22,7 +22,7 @@
 #include "ieee1394_core.h"
 #include "highlevel.h"
 #include "nodemgr.h"
-
+#include "ieee1394_transactions.h"

 #define PREP_ASYNC_HEAD_ADDRESS(tc) \
 	packet->tcode = tc; \
@@ -31,80 +31,82 @@
 	packet->header[1] = (packet->host->node_id << 16) | (addr >> 32); \
 	packet->header[2] = addr & 0xffffffff

-
 static void fill_async_readquad(struct hpsb_packet *packet, u64 addr)
 {
 	PREP_ASYNC_HEAD_ADDRESS(TCODE_READQ);
 	packet->header_size = 12;
 	packet->data_size = 0;
 	packet->expect_response = 1;
 }

-static void fill_async_readblock(struct hpsb_packet *packet, u64 addr, int length)
+static void fill_async_readblock(struct hpsb_packet *packet, u64 addr,
+				 int length)
 {
 	PREP_ASYNC_HEAD_ADDRESS(TCODE_READB);
 	packet->header[3] = length << 16;
 	packet->header_size = 16;
 	packet->data_size = 0;
 	packet->expect_response = 1;
 }

-static void fill_async_writequad(struct hpsb_packet *packet, u64 addr, quadlet_t data)
+static void fill_async_writequad(struct hpsb_packet *packet, u64 addr,
+				 quadlet_t data)
 {
 	PREP_ASYNC_HEAD_ADDRESS(TCODE_WRITEQ);
 	packet->header[3] = data;
 	packet->header_size = 16;
 	packet->data_size = 0;
 	packet->expect_response = 1;
 }

-static void fill_async_writeblock(struct hpsb_packet *packet, u64 addr, int length)
+static void fill_async_writeblock(struct hpsb_packet *packet, u64 addr,
+				  int length)
 {
 	PREP_ASYNC_HEAD_ADDRESS(TCODE_WRITEB);
 	packet->header[3] = length << 16;
 	packet->header_size = 16;
 	packet->expect_response = 1;
 	packet->data_size = length + (length % 4 ? 4 - (length % 4) : 0);
 }

 static void fill_async_lock(struct hpsb_packet *packet, u64 addr, int extcode,
 			    int length)
 {
 	PREP_ASYNC_HEAD_ADDRESS(TCODE_LOCK_REQUEST);
 	packet->header[3] = (length << 16) | extcode;
 	packet->header_size = 16;
 	packet->data_size = length;
 	packet->expect_response = 1;
 }

 static void fill_iso_packet(struct hpsb_packet *packet, int length, int channel,
 			    int tag, int sync)
 {
 	packet->header[0] = (length << 16) | (tag << 14) | (channel << 8)
 		| (TCODE_ISO_DATA << 4) | sync;

 	packet->header_size = 4;
 	packet->data_size = length;
 	packet->type = hpsb_iso;
 	packet->tcode = TCODE_ISO_DATA;
 }

 static void fill_phy_packet(struct hpsb_packet *packet, quadlet_t data)
 {
 	packet->header[0] = data;
 	packet->header[1] = ~data;
 	packet->header_size = 8;
 	packet->data_size = 0;
 	packet->expect_response = 0;
 	packet->type = hpsb_raw;	/* No CRC added */
 	packet->speed_code = IEEE1394_SPEED_100;	/* Force speed to be 100Mbps */
 }

 static void fill_async_stream_packet(struct hpsb_packet *packet, int length,
 				     int channel, int tag, int sync)
 {
 	packet->header[0] = (length << 16) | (tag << 14) | (channel << 8)
 		| (TCODE_STREAM_DATA << 4) | sync;

 	packet->header_size = 4;
 	packet->data_size = length;
@@ -171,99 +173,96 @@ int hpsb_get_tlabel(struct hpsb_packet *packet)
  */
 void hpsb_free_tlabel(struct hpsb_packet *packet)
 {
 	unsigned long flags;
 	struct hpsb_tlabel_pool *tp;

 	tp = &packet->host->tpool[packet->node_id & NODE_MASK];

 	BUG_ON(packet->tlabel > 63 || packet->tlabel < 0);

 	spin_lock_irqsave(&tp->lock, flags);
 	BUG_ON(!test_and_clear_bit(packet->tlabel, tp->pool));
 	spin_unlock_irqrestore(&tp->lock, flags);

 	up(&tp->count);
 }

-
-
 int hpsb_packet_success(struct hpsb_packet *packet)
 {
 	switch (packet->ack_code) {
 	case ACK_PENDING:
 		switch ((packet->header[1] >> 12) & 0xf) {
 		case RCODE_COMPLETE:
 			return 0;
 		case RCODE_CONFLICT_ERROR:
 			return -EAGAIN;
 		case RCODE_DATA_ERROR:
 			return -EREMOTEIO;
 		case RCODE_TYPE_ERROR:
 			return -EACCES;
 		case RCODE_ADDRESS_ERROR:
 			return -EINVAL;
 		default:
 			HPSB_ERR("received reserved rcode %d from node %d",
 				 (packet->header[1] >> 12) & 0xf,
 				 packet->node_id);
 			return -EAGAIN;
 		}
 		HPSB_PANIC("reached unreachable code 1 in %s", __FUNCTION__);

 	case ACK_BUSY_X:
 	case ACK_BUSY_A:
 	case ACK_BUSY_B:
 		return -EBUSY;

 	case ACK_TYPE_ERROR:
 		return -EACCES;

 	case ACK_COMPLETE:
 		if (packet->tcode == TCODE_WRITEQ
 		    || packet->tcode == TCODE_WRITEB) {
 			return 0;
 		} else {
 			HPSB_ERR("impossible ack_complete from node %d "
 				 "(tcode %d)", packet->node_id, packet->tcode);
 			return -EAGAIN;
 		}

-
 	case ACK_DATA_ERROR:
 		if (packet->tcode == TCODE_WRITEB
 		    || packet->tcode == TCODE_LOCK_REQUEST) {
 			return -EAGAIN;
 		} else {
 			HPSB_ERR("impossible ack_data_error from node %d "
 				 "(tcode %d)", packet->node_id, packet->tcode);
 			return -EAGAIN;
 		}

 	case ACK_ADDRESS_ERROR:
 		return -EINVAL;

 	case ACK_TARDY:
 	case ACK_CONFLICT_ERROR:
 	case ACKX_NONE:
 	case ACKX_SEND_ERROR:
 	case ACKX_ABORTED:
 	case ACKX_TIMEOUT:
 		/* error while sending */
 		return -EAGAIN;

 	default:
 		HPSB_ERR("got invalid ack %d from node %d (tcode %d)",
 			 packet->ack_code, packet->node_id, packet->tcode);
 		return -EAGAIN;
 	}

 	HPSB_PANIC("reached unreachable code 2 in %s", __FUNCTION__);
 }

 struct hpsb_packet *hpsb_make_readpacket(struct hpsb_host *host, nodeid_t node,
 					 u64 addr, size_t length)
 {
 	struct hpsb_packet *packet;

 	if (length == 0)
 		return NULL;
@@ -288,8 +287,9 @@ struct hpsb_packet *hpsb_make_readpacket(struct hpsb_host *host, nodeid_t node,
 	return packet;
 }

-struct hpsb_packet *hpsb_make_writepacket (struct hpsb_host *host, nodeid_t node,
-					   u64 addr, quadlet_t *buffer, size_t length)
+struct hpsb_packet *hpsb_make_writepacket(struct hpsb_host *host, nodeid_t node,
+					  u64 addr, quadlet_t * buffer,
+					  size_t length)
 {
 	struct hpsb_packet *packet;

@@ -300,7 +300,7 @@ struct hpsb_packet *hpsb_make_writepacket (struct hpsb_host *host, nodeid_t node
 	if (!packet)
 		return NULL;

 	if (length % 4) {	/* zero padding bytes */
 		packet->data[length >> 2] = 0;
 	}
 	packet->host = host;
@@ -322,8 +322,9 @@ struct hpsb_packet *hpsb_make_writepacket (struct hpsb_host *host, nodeid_t node
 	return packet;
 }

-struct hpsb_packet *hpsb_make_streampacket(struct hpsb_host *host, u8 *buffer, int length,
-					   int channel, int tag, int sync)
+struct hpsb_packet *hpsb_make_streampacket(struct hpsb_host *host, u8 * buffer,
+					   int length, int channel, int tag,
+					   int sync)
 {
 	struct hpsb_packet *packet;

@@ -334,7 +335,7 @@ struct hpsb_packet *hpsb_make_streampacket(struct hpsb_host *host, u8 *buffer, i
 	if (!packet)
 		return NULL;

 	if (length % 4) {	/* zero padding bytes */
 		packet->data[length >> 2] = 0;
 	}
 	packet->host = host;
@@ -352,14 +353,15 @@ struct hpsb_packet *hpsb_make_streampacket(struct hpsb_host *host, u8 *buffer, i
 }

 struct hpsb_packet *hpsb_make_lockpacket(struct hpsb_host *host, nodeid_t node,
-					 u64 addr, int extcode, quadlet_t *data,
-					 quadlet_t arg)
+					 u64 addr, int extcode,
+					 quadlet_t * data, quadlet_t arg)
 {
 	struct hpsb_packet *p;
 	u32 length;

 	p = hpsb_alloc_packet(8);
-	if (!p) return NULL;
+	if (!p)
+		return NULL;

 	p->host = host;
 	p->node_id = node;
@@ -388,15 +390,16 @@ struct hpsb_packet *hpsb_make_lockpacket(struct hpsb_host *host, nodeid_t node,
 	return p;
 }

-struct hpsb_packet *hpsb_make_lock64packet(struct hpsb_host *host, nodeid_t node,
-					   u64 addr, int extcode, octlet_t *data,
-					   octlet_t arg)
+struct hpsb_packet *hpsb_make_lock64packet(struct hpsb_host *host,
+					   nodeid_t node, u64 addr, int extcode,
+					   octlet_t * data, octlet_t arg)
 {
 	struct hpsb_packet *p;
 	u32 length;

 	p = hpsb_alloc_packet(16);
-	if (!p) return NULL;
+	if (!p)
+		return NULL;

 	p->host = host;
 	p->node_id = node;
@@ -429,18 +432,18 @@ struct hpsb_packet *hpsb_make_lock64packet(struct hpsb_host *host, nodeid_t node
 	return p;
 }

-struct hpsb_packet *hpsb_make_phypacket(struct hpsb_host *host,
-					quadlet_t data)
+struct hpsb_packet *hpsb_make_phypacket(struct hpsb_host *host, quadlet_t data)
 {
 	struct hpsb_packet *p;

 	p = hpsb_alloc_packet(0);
-	if (!p) return NULL;
+	if (!p)
+		return NULL;

 	p->host = host;
 	fill_phy_packet(p, data);

 	return p;
 }

 struct hpsb_packet *hpsb_make_isopacket(struct hpsb_host *host,
@@ -450,7 +453,8 @@ struct hpsb_packet *hpsb_make_isopacket(struct hpsb_host *host,
 	struct hpsb_packet *p;

 	p = hpsb_alloc_packet(length);
-	if (!p) return NULL;
+	if (!p)
+		return NULL;

 	p->host = host;
 	fill_iso_packet(p, length, channel, tag, sync);
@@ -466,47 +470,46 @@ struct hpsb_packet *hpsb_make_isopacket(struct hpsb_host *host,
  */

 int hpsb_read(struct hpsb_host *host, nodeid_t node, unsigned int generation,
-	      u64 addr, quadlet_t *buffer, size_t length)
+	      u64 addr, quadlet_t * buffer, size_t length)
 {
 	struct hpsb_packet *packet;
 	int retval = 0;

 	if (length == 0)
 		return -EINVAL;

 	BUG_ON(in_interrupt());	// We can't be called in an interrupt, yet

 	packet = hpsb_make_readpacket(host, node, addr, length);

 	if (!packet) {
 		return -ENOMEM;
 	}

 	packet->generation = generation;
 	retval = hpsb_send_packet_and_wait(packet);
 	if (retval < 0)
 		goto hpsb_read_fail;

 	retval = hpsb_packet_success(packet);

 	if (retval == 0) {
 		if (length == 4) {
 			*buffer = packet->header[3];
 		} else {
 			memcpy(buffer, packet->data, length);
 		}
 	}

-hpsb_read_fail:
+      hpsb_read_fail:
 	hpsb_free_tlabel(packet);
 	hpsb_free_packet(packet);

 	return retval;
 }

-
 int hpsb_write(struct hpsb_host *host, nodeid_t node, unsigned int generation,
-	       u64 addr, quadlet_t *buffer, size_t length)
+	       u64 addr, quadlet_t * buffer, size_t length)
 {
 	struct hpsb_packet *packet;
 	int retval;
@@ -514,62 +517,61 @@ int hpsb_write(struct hpsb_host *host, nodeid_t node, unsigned int generation,
 	if (length == 0)
 		return -EINVAL;

 	BUG_ON(in_interrupt());	// We can't be called in an interrupt, yet

-	packet = hpsb_make_writepacket (host, node, addr, buffer, length);
+	packet = hpsb_make_writepacket(host, node, addr, buffer, length);

 	if (!packet)
 		return -ENOMEM;

 	packet->generation = generation;
 	retval = hpsb_send_packet_and_wait(packet);
 	if (retval < 0)
 		goto hpsb_write_fail;

 	retval = hpsb_packet_success(packet);

-hpsb_write_fail:
+      hpsb_write_fail:
 	hpsb_free_tlabel(packet);
 	hpsb_free_packet(packet);

 	return retval;
 }

 #if 0

 int hpsb_lock(struct hpsb_host *host, nodeid_t node, unsigned int generation,
-	      u64 addr, int extcode, quadlet_t *data, quadlet_t arg)
+	      u64 addr, int extcode, quadlet_t * data, quadlet_t arg)
 {
 	struct hpsb_packet *packet;
 	int retval = 0;

 	BUG_ON(in_interrupt());	// We can't be called in an interrupt, yet

 	packet = hpsb_make_lockpacket(host, node, addr, extcode, data, arg);
 	if (!packet)
 		return -ENOMEM;

 	packet->generation = generation;
 	retval = hpsb_send_packet_and_wait(packet);
 	if (retval < 0)
 		goto hpsb_lock_fail;

 	retval = hpsb_packet_success(packet);

 	if (retval == 0) {
 		*data = packet->data[0];
 	}

-hpsb_lock_fail:
+      hpsb_lock_fail:
 	hpsb_free_tlabel(packet);
 	hpsb_free_packet(packet);

 	return retval;
 }

-
 int hpsb_send_gasp(struct hpsb_host *host, int channel, unsigned int generation,
-		   quadlet_t *buffer, size_t length, u32 specifier_id,
+		   quadlet_t * buffer, size_t length, u32 specifier_id,
 		   unsigned int version)
 {
 	struct hpsb_packet *packet;
@@ -586,7 +588,8 @@ int hpsb_send_gasp(struct hpsb_host *host, int channel, unsigned int generation,
 		return -ENOMEM;

 	packet->data[0] = cpu_to_be32((host->node_id << 16) | specifier_id_hi);
-	packet->data[1] = cpu_to_be32((specifier_id_lo << 24) | (version & 0x00ffffff));
+	packet->data[1] =
+	    cpu_to_be32((specifier_id_lo << 24) | (version & 0x00ffffff));

 	memcpy(&(packet->data[2]), buffer, length - 8);

@@ -601,4 +604,4 @@ int hpsb_send_gasp(struct hpsb_host *host, int channel, unsigned int generation,
 	return retval;
 }

 #endif /* 0 */
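
For orientation, a sketch of the two-quadlet GASP header that hpsb_send_gasp()
fills in the hunk above. The 16/8-bit split of the 24-bit specifier ID into
specifier_id_hi and specifier_id_lo is an assumption (those variables are
computed just before the visible context), and htonl() stands in for the
kernel's cpu_to_be32():

    #include <stdint.h>
    #include <arpa/inet.h>

    static void fill_gasp_header(uint32_t hdr[2], uint16_t node_id,
                                 uint32_t specifier_id, uint32_t version)
    {
        uint32_t hi = specifier_id >> 8;    /* assumed split: upper 16 bits */
        uint32_t lo = specifier_id & 0xff;  /* assumed split: lower 8 bits  */

        hdr[0] = htonl(((uint32_t)node_id << 16) | hi);
        hdr[1] = htonl((lo << 24) | (version & 0x00ffffff));
        /* the GASP payload proper then follows at quadlet index 2,
         * as the memcpy(&(packet->data[2]), ...) above shows */
    }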
diff --git a/drivers/ieee1394/iso.c b/drivers/ieee1394/iso.c
index 615541b8b90f..f26680ebef7c 100644
--- a/drivers/ieee1394/iso.c
+++ b/drivers/ieee1394/iso.c
@@ -36,20 +36,22 @@ void hpsb_iso_shutdown(struct hpsb_iso *iso)
 	kfree(iso);
 }

-static struct hpsb_iso* hpsb_iso_common_init(struct hpsb_host *host, enum hpsb_iso_type type,
+static struct hpsb_iso *hpsb_iso_common_init(struct hpsb_host *host,
+					     enum hpsb_iso_type type,
 					     unsigned int data_buf_size,
 					     unsigned int buf_packets,
-					     int channel,
-					     int dma_mode,
+					     int channel, int dma_mode,
 					     int irq_interval,
-					     void (*callback)(struct hpsb_iso*))
+					     void (*callback) (struct hpsb_iso
+							       *))
 {
 	struct hpsb_iso *iso;
 	int dma_direction;

 	/* make sure driver supports the ISO API */
 	if (!host->driver->isoctl) {
-		printk(KERN_INFO "ieee1394: host driver '%s' does not support the rawiso API\n",
+		printk(KERN_INFO
+		       "ieee1394: host driver '%s' does not support the rawiso API\n",
 		       host->driver->name);
 		return NULL;
 	}
@@ -59,12 +61,13 @@ static struct hpsb_iso* hpsb_iso_common_init(struct hpsb_host *host, enum hpsb_i
 	if (buf_packets < 2)
 		buf_packets = 2;

-	if ((dma_mode < HPSB_ISO_DMA_DEFAULT) || (dma_mode > HPSB_ISO_DMA_PACKET_PER_BUFFER))
-		dma_mode=HPSB_ISO_DMA_DEFAULT;
+	if ((dma_mode < HPSB_ISO_DMA_DEFAULT)
+	    || (dma_mode > HPSB_ISO_DMA_PACKET_PER_BUFFER))
+		dma_mode = HPSB_ISO_DMA_DEFAULT;

 	if ((irq_interval < 0) || (irq_interval > buf_packets / 4))
 		irq_interval = buf_packets / 4;
-	if (irq_interval == 0)	/* really interrupt for each packet*/
+	if (irq_interval == 0)	/* really interrupt for each packet */
 		irq_interval = 1;

 	if (channel < -1 || channel >= 64)
@@ -76,7 +79,10 @@ static struct hpsb_iso* hpsb_iso_common_init(struct hpsb_host *host, enum hpsb_i

 	/* allocate and write the struct hpsb_iso */

-	iso = kmalloc(sizeof(*iso) + buf_packets * sizeof(struct hpsb_iso_packet_info), GFP_KERNEL);
+	iso =
+	    kmalloc(sizeof(*iso) +
+		    buf_packets * sizeof(struct hpsb_iso_packet_info),
+		    GFP_KERNEL);
 	if (!iso)
 		return NULL;

@@ -111,17 +117,18 @@ static struct hpsb_iso* hpsb_iso_common_init(struct hpsb_host *host, enum hpsb_i
 	iso->prebuffer = 0;

 	/* allocate the packet buffer */
-	if (dma_region_alloc(&iso->data_buf, iso->buf_size, host->pdev, dma_direction))
+	if (dma_region_alloc
+	    (&iso->data_buf, iso->buf_size, host->pdev, dma_direction))
 		goto err;

 	return iso;

-err:
+      err:
 	hpsb_iso_shutdown(iso);
 	return NULL;
 }

-int hpsb_iso_n_ready(struct hpsb_iso* iso)
+int hpsb_iso_n_ready(struct hpsb_iso *iso)
 {
 	unsigned long flags;
 	int val;
@@ -133,18 +140,19 @@ int hpsb_iso_n_ready(struct hpsb_iso* iso)
 	return val;
 }

-
-struct hpsb_iso* hpsb_iso_xmit_init(struct hpsb_host *host,
+struct hpsb_iso *hpsb_iso_xmit_init(struct hpsb_host *host,
 				    unsigned int data_buf_size,
 				    unsigned int buf_packets,
 				    int channel,
 				    int speed,
 				    int irq_interval,
-				    void (*callback)(struct hpsb_iso*))
+				    void (*callback) (struct hpsb_iso *))
 {
 	struct hpsb_iso *iso = hpsb_iso_common_init(host, HPSB_ISO_XMIT,
 						    data_buf_size, buf_packets,
-						    channel, HPSB_ISO_DMA_DEFAULT, irq_interval, callback);
+						    channel,
+						    HPSB_ISO_DMA_DEFAULT,
+						    irq_interval, callback);
 	if (!iso)
 		return NULL;

@@ -157,22 +165,23 @@ struct hpsb_iso* hpsb_iso_xmit_init(struct hpsb_host *host,
 	iso->flags |= HPSB_ISO_DRIVER_INIT;
 	return iso;

-err:
+      err:
 	hpsb_iso_shutdown(iso);
 	return NULL;
 }

-struct hpsb_iso* hpsb_iso_recv_init(struct hpsb_host *host,
+struct hpsb_iso *hpsb_iso_recv_init(struct hpsb_host *host,
 				    unsigned int data_buf_size,
 				    unsigned int buf_packets,
 				    int channel,
 				    int dma_mode,
 				    int irq_interval,
-				    void (*callback)(struct hpsb_iso*))
+				    void (*callback) (struct hpsb_iso *))
 {
 	struct hpsb_iso *iso = hpsb_iso_common_init(host, HPSB_ISO_RECV,
 						    data_buf_size, buf_packets,
-						    channel, dma_mode, irq_interval, callback);
+						    channel, dma_mode,
+						    irq_interval, callback);
 	if (!iso)
 		return NULL;

@@ -183,7 +192,7 @@ struct hpsb_iso* hpsb_iso_recv_init(struct hpsb_host *host,
 	iso->flags |= HPSB_ISO_DRIVER_INIT;
 	return iso;

-err:
+      err:
 	hpsb_iso_shutdown(iso);
 	return NULL;
 }
@@ -197,16 +206,17 @@ int hpsb_iso_recv_listen_channel(struct hpsb_iso *iso, unsigned char channel)

 int hpsb_iso_recv_unlisten_channel(struct hpsb_iso *iso, unsigned char channel)
 {
 	if (iso->type != HPSB_ISO_RECV || iso->channel != -1 || channel >= 64)
 		return -EINVAL;
 	return iso->host->driver->isoctl(iso, RECV_UNLISTEN_CHANNEL, channel);
 }

 int hpsb_iso_recv_set_channel_mask(struct hpsb_iso *iso, u64 mask)
 {
 	if (iso->type != HPSB_ISO_RECV || iso->channel != -1)
 		return -EINVAL;
-	return iso->host->driver->isoctl(iso, RECV_SET_CHANNEL_MASK, (unsigned long) &mask);
+	return iso->host->driver->isoctl(iso, RECV_SET_CHANNEL_MASK,
+					 (unsigned long)&mask);
 }

 int hpsb_iso_recv_flush(struct hpsb_iso *iso)
@@ -283,7 +293,9 @@ int hpsb_iso_recv_start(struct hpsb_iso *iso, int cycle, int tag_mask, int sync)

 	isoctl_args[2] = sync;

-	retval = iso->host->driver->isoctl(iso, RECV_START, (unsigned long) &isoctl_args[0]);
+	retval =
+	    iso->host->driver->isoctl(iso, RECV_START,
+				      (unsigned long)&isoctl_args[0]);
 	if (retval)
 		return retval;

@@ -296,7 +308,8 @@ int hpsb_iso_recv_start(struct hpsb_iso *iso, int cycle, int tag_mask, int sync)
296 308
297static int hpsb_iso_check_offset_len(struct hpsb_iso *iso, 309static int hpsb_iso_check_offset_len(struct hpsb_iso *iso,
298 unsigned int offset, unsigned short len, 310 unsigned int offset, unsigned short len,
299 unsigned int *out_offset, unsigned short *out_len) 311 unsigned int *out_offset,
312 unsigned short *out_len)
300{ 313{
301 if (offset >= iso->buf_size) 314 if (offset >= iso->buf_size)
302 return -EFAULT; 315 return -EFAULT;
@@ -316,8 +329,8 @@ static int hpsb_iso_check_offset_len(struct hpsb_iso *iso,
316 return 0; 329 return 0;
317} 330}
318 331
319 332int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len,
320int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len, u8 tag, u8 sy) 333 u8 tag, u8 sy)
321{ 334{
322 struct hpsb_iso_packet_info *info; 335 struct hpsb_iso_packet_info *info;
323 unsigned long flags; 336 unsigned long flags;
@@ -334,7 +347,8 @@ int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len, u8 tag
334 info = &iso->infos[iso->first_packet]; 347 info = &iso->infos[iso->first_packet];
335 348
336 /* check for bogus offset/length */ 349 /* check for bogus offset/length */
337 if (hpsb_iso_check_offset_len(iso, offset, len, &info->offset, &info->len)) 350 if (hpsb_iso_check_offset_len
351 (iso, offset, len, &info->offset, &info->len))
338 return -EFAULT; 352 return -EFAULT;
339 353
340 info->tag = tag; 354 info->tag = tag;
@@ -342,13 +356,13 @@ int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len, u8 tag
342 356
343 spin_lock_irqsave(&iso->lock, flags); 357 spin_lock_irqsave(&iso->lock, flags);
344 358
345 rv = iso->host->driver->isoctl(iso, XMIT_QUEUE, (unsigned long) info); 359 rv = iso->host->driver->isoctl(iso, XMIT_QUEUE, (unsigned long)info);
346 if (rv) 360 if (rv)
347 goto out; 361 goto out;
348 362
349 /* increment cursors */ 363 /* increment cursors */
350 iso->first_packet = (iso->first_packet+1) % iso->buf_packets; 364 iso->first_packet = (iso->first_packet + 1) % iso->buf_packets;
351 iso->xmit_cycle = (iso->xmit_cycle+1) % 8000; 365 iso->xmit_cycle = (iso->xmit_cycle + 1) % 8000;
352 iso->n_ready_packets--; 366 iso->n_ready_packets--;
353 367
354 if (iso->prebuffer != 0) { 368 if (iso->prebuffer != 0) {
@@ -359,7 +373,7 @@ int hpsb_iso_xmit_queue_packet(struct hpsb_iso *iso, u32 offset, u16 len, u8 tag
359 } 373 }
360 } 374 }
361 375
362out: 376 out:
363 spin_unlock_irqrestore(&iso->lock, flags); 377 spin_unlock_irqrestore(&iso->lock, flags);
364 return rv; 378 return rv;
365} 379}
@@ -369,7 +383,9 @@ int hpsb_iso_xmit_sync(struct hpsb_iso *iso)
369 if (iso->type != HPSB_ISO_XMIT) 383 if (iso->type != HPSB_ISO_XMIT)
370 return -EINVAL; 384 return -EINVAL;
371 385
372 return wait_event_interruptible(iso->waitq, hpsb_iso_n_ready(iso) == iso->buf_packets); 386 return wait_event_interruptible(iso->waitq,
387 hpsb_iso_n_ready(iso) ==
388 iso->buf_packets);
373} 389}
374 390
375void hpsb_iso_packet_sent(struct hpsb_iso *iso, int cycle, int error) 391void hpsb_iso_packet_sent(struct hpsb_iso *iso, int cycle, int error)
@@ -396,7 +412,8 @@ void hpsb_iso_packet_sent(struct hpsb_iso *iso, int cycle, int error)
396} 412}
397 413
398void hpsb_iso_packet_received(struct hpsb_iso *iso, u32 offset, u16 len, 414void hpsb_iso_packet_received(struct hpsb_iso *iso, u32 offset, u16 len,
399 u16 total_len, u16 cycle, u8 channel, u8 tag, u8 sy) 415 u16 total_len, u16 cycle, u8 channel, u8 tag,
416 u8 sy)
400{ 417{
401 unsigned long flags; 418 unsigned long flags;
402 spin_lock_irqsave(&iso->lock, flags); 419 spin_lock_irqsave(&iso->lock, flags);
@@ -416,7 +433,7 @@ void hpsb_iso_packet_received(struct hpsb_iso *iso, u32 offset, u16 len,
416 info->tag = tag; 433 info->tag = tag;
417 info->sy = sy; 434 info->sy = sy;
418 435
419 iso->pkt_dma = (iso->pkt_dma+1) % iso->buf_packets; 436 iso->pkt_dma = (iso->pkt_dma + 1) % iso->buf_packets;
420 iso->n_ready_packets++; 437 iso->n_ready_packets++;
421 } 438 }
422 439
@@ -435,20 +452,21 @@ int hpsb_iso_recv_release_packets(struct hpsb_iso *iso, unsigned int n_packets)
435 spin_lock_irqsave(&iso->lock, flags); 452 spin_lock_irqsave(&iso->lock, flags);
436 for (i = 0; i < n_packets; i++) { 453 for (i = 0; i < n_packets; i++) {
437 rv = iso->host->driver->isoctl(iso, RECV_RELEASE, 454 rv = iso->host->driver->isoctl(iso, RECV_RELEASE,
438 (unsigned long) &iso->infos[iso->first_packet]); 455 (unsigned long)&iso->infos[iso->
456 first_packet]);
439 if (rv) 457 if (rv)
440 break; 458 break;
441 459
442 iso->first_packet = (iso->first_packet+1) % iso->buf_packets; 460 iso->first_packet = (iso->first_packet + 1) % iso->buf_packets;
443 iso->n_ready_packets--; 461 iso->n_ready_packets--;
444 462
445 /* release memory from packets discarded when queue was full */ 463 /* release memory from packets discarded when queue was full */
446 if (iso->n_ready_packets == 0) { /* Release only after all prior packets handled */ 464 if (iso->n_ready_packets == 0) { /* Release only after all prior packets handled */
447 if (iso->bytes_discarded != 0) { 465 if (iso->bytes_discarded != 0) {
448 struct hpsb_iso_packet_info inf; 466 struct hpsb_iso_packet_info inf;
449 inf.total_len = iso->bytes_discarded; 467 inf.total_len = iso->bytes_discarded;
450 iso->host->driver->isoctl(iso, RECV_RELEASE, 468 iso->host->driver->isoctl(iso, RECV_RELEASE,
451 (unsigned long) &inf); 469 (unsigned long)&inf);
452 iso->bytes_discarded = 0; 470 iso->bytes_discarded = 0;
453 } 471 }
454 } 472 }
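
The xmit-queue and recv-release paths above both advance ring cursors with modular arithmetic while holding iso->lock; the patch only reflows that code. A minimal standalone sketch of the pattern, with invented names (struct ring and its fields are illustrative, not the driver's):

#include <linux/spinlock.h>
#include <linux/errno.h>

struct ring {
	spinlock_t lock;
	unsigned int first;	/* next slot to hand out */
	unsigned int n_ready;	/* slots still available */
	unsigned int n_slots;	/* ring capacity */
};

/* Claim one slot; returns 0 on success, -ENOSPC when the ring is full. */
static int ring_claim(struct ring *r)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&r->lock, flags);
	if (r->n_ready == 0) {
		ret = -ENOSPC;
		goto out;
	}
	/* Wrap the cursor the same way iso->first_packet wraps above. */
	r->first = (r->first + 1) % r->n_slots;
	r->n_ready--;
out:
	spin_unlock_irqrestore(&r->lock, flags);
	return ret;
}
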
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index f2453668acf5..082c7fd239f5 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -743,21 +743,20 @@ static struct node_entry *nodemgr_create_node(octlet_t guid, struct csr1212_csr
743 unsigned int generation) 743 unsigned int generation)
744{ 744{
745 struct hpsb_host *host = hi->host; 745 struct hpsb_host *host = hi->host;
746 struct node_entry *ne; 746 struct node_entry *ne;
747
748 ne = kmalloc(sizeof(struct node_entry), GFP_KERNEL);
749 if (!ne) return NULL;
750 747
751 memset(ne, 0, sizeof(struct node_entry)); 748 ne = kzalloc(sizeof(*ne), GFP_KERNEL);
749 if (!ne)
750 return NULL;
752 751
753 ne->tpool = &host->tpool[nodeid & NODE_MASK]; 752 ne->tpool = &host->tpool[nodeid & NODE_MASK];
754 753
755 ne->host = host; 754 ne->host = host;
756 ne->nodeid = nodeid; 755 ne->nodeid = nodeid;
757 ne->generation = generation; 756 ne->generation = generation;
758 ne->needs_probe = 1; 757 ne->needs_probe = 1;
759 758
760 ne->guid = guid; 759 ne->guid = guid;
761 ne->guid_vendor_id = (guid >> 40) & 0xffffff; 760 ne->guid_vendor_id = (guid >> 40) & 0xffffff;
762 ne->guid_vendor_oui = nodemgr_find_oui_name(ne->guid_vendor_id); 761 ne->guid_vendor_oui = nodemgr_find_oui_name(ne->guid_vendor_id);
763 ne->csr = csr; 762 ne->csr = csr;
@@ -787,7 +786,7 @@ static struct node_entry *nodemgr_create_node(octlet_t guid, struct csr1212_csr
787 (host->node_id == nodeid) ? "Host" : "Node", 786 (host->node_id == nodeid) ? "Host" : "Node",
788 NODE_BUS_ARGS(host, nodeid), (unsigned long long)guid); 787 NODE_BUS_ARGS(host, nodeid), (unsigned long long)guid);
789 788
790 return ne; 789 return ne;
791} 790}
792 791
793 792
@@ -872,12 +871,10 @@ static struct unit_directory *nodemgr_process_unit_directory
872 struct csr1212_keyval *kv; 871 struct csr1212_keyval *kv;
873 u8 last_key_id = 0; 872 u8 last_key_id = 0;
874 873
875 ud = kmalloc(sizeof(struct unit_directory), GFP_KERNEL); 874 ud = kzalloc(sizeof(*ud), GFP_KERNEL);
876 if (!ud) 875 if (!ud)
877 goto unit_directory_error; 876 goto unit_directory_error;
878 877
879 memset (ud, 0, sizeof(struct unit_directory));
880
881 ud->ne = ne; 878 ud->ne = ne;
882 ud->ignore_driver = ignore_drivers; 879 ud->ignore_driver = ignore_drivers;
883 ud->address = ud_kv->offset + CSR1212_CONFIG_ROM_SPACE_BASE; 880 ud->address = ud_kv->offset + CSR1212_CONFIG_ROM_SPACE_BASE;
@@ -937,10 +934,10 @@ static struct unit_directory *nodemgr_process_unit_directory
937 /* Logical Unit Number */ 934 /* Logical Unit Number */
938 if (kv->key.type == CSR1212_KV_TYPE_IMMEDIATE) { 935 if (kv->key.type == CSR1212_KV_TYPE_IMMEDIATE) {
939 if (ud->flags & UNIT_DIRECTORY_HAS_LUN) { 936 if (ud->flags & UNIT_DIRECTORY_HAS_LUN) {
940 ud_child = kmalloc(sizeof(struct unit_directory), GFP_KERNEL); 937 ud_child = kmalloc(sizeof(*ud_child), GFP_KERNEL);
941 if (!ud_child) 938 if (!ud_child)
942 goto unit_directory_error; 939 goto unit_directory_error;
943 memcpy(ud_child, ud, sizeof(struct unit_directory)); 940 memcpy(ud_child, ud, sizeof(*ud_child));
944 nodemgr_register_device(ne, ud_child, &ne->device); 941 nodemgr_register_device(ne, ud_child, &ne->device);
945 ud_child = NULL; 942 ud_child = NULL;
946 943
@@ -1200,7 +1197,7 @@ static void nodemgr_node_scan_one(struct host_info *hi,
1200 struct csr1212_csr *csr; 1197 struct csr1212_csr *csr;
1201 struct nodemgr_csr_info *ci; 1198 struct nodemgr_csr_info *ci;
1202 1199
1203 ci = kmalloc(sizeof(struct nodemgr_csr_info), GFP_KERNEL); 1200 ci = kmalloc(sizeof(*ci), GFP_KERNEL);
1204 if (!ci) 1201 if (!ci)
1205 return; 1202 return;
1206 1203
@@ -1410,14 +1407,28 @@ static void nodemgr_node_probe(struct host_info *hi, int generation)
1410 struct hpsb_host *host = hi->host; 1407 struct hpsb_host *host = hi->host;
1411 struct class *class = &nodemgr_ne_class; 1408 struct class *class = &nodemgr_ne_class;
1412 struct class_device *cdev; 1409 struct class_device *cdev;
1410 struct node_entry *ne;
1413 1411
1414 /* Do some processing of the nodes we've probed. This pulls them 1412 /* Do some processing of the nodes we've probed. This pulls them
1415 * into the sysfs layer if needed, and can result in processing of 1413 * into the sysfs layer if needed, and can result in processing of
1416 * unit-directories, or just updating the node and its 1414 * unit-directories, or just updating the node and its
1417 * unit-directories. */ 1415 * unit-directories.
1416 *
1417 * Run updates before probes. Usually, updates are time-critical
1418 * while probes are time-consuming. (Well, those probes need some
1419 * improvement...) */
1420
1418 down_read(&class->subsys.rwsem); 1421 down_read(&class->subsys.rwsem);
1419 list_for_each_entry(cdev, &class->children, node) 1422 list_for_each_entry(cdev, &class->children, node) {
1420 nodemgr_probe_ne(hi, container_of(cdev, struct node_entry, class_dev), generation); 1423 ne = container_of(cdev, struct node_entry, class_dev);
1424 if (!ne->needs_probe)
1425 nodemgr_probe_ne(hi, ne, generation);
1426 }
1427 list_for_each_entry(cdev, &class->children, node) {
1428 ne = container_of(cdev, struct node_entry, class_dev);
1429 if (ne->needs_probe)
1430 nodemgr_probe_ne(hi, ne, generation);
1431 }
1421 up_read(&class->subsys.rwsem); 1432 up_read(&class->subsys.rwsem);
1422 1433
1423 1434
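
The hunk above changes behavior, not just formatting: the single walk over class->children becomes two passes, so updates of known nodes (cheap, time-critical) run before probes of new nodes (slow). A reduced sketch of that two-pass shape, assuming a stand-in handler in place of nodemgr_probe_ne():

#include <linux/list.h>

struct item {
	struct list_head node;
	int needs_probe;
};

/* Stand-in for nodemgr_probe_ne(); the real per-item work. */
static void handle_item(struct item *i)
{
}

static void process_children(struct list_head *children)
{
	struct item *i;

	/* Pass 1: updates of already-known items come first. */
	list_for_each_entry(i, children, node)
		if (!i->needs_probe)
			handle_item(i);

	/* Pass 2: time-consuming probes of new items run last. */
	list_for_each_entry(i, children, node)
		if (i->needs_probe)
			handle_item(i);
}
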
@@ -1448,7 +1459,8 @@ static int nodemgr_send_resume_packet(struct hpsb_host *host)
1448 int ret = 1; 1459 int ret = 1;
1449 1460
1450 packet = hpsb_make_phypacket(host, 1461 packet = hpsb_make_phypacket(host,
1451 0x003c0000 | NODEID_TO_NODE(host->node_id) << 24); 1462 EXTPHYPACKET_TYPE_RESUME |
1463 NODEID_TO_NODE(host->node_id) << PHYPACKET_PORT_SHIFT);
1452 if (packet) { 1464 if (packet) {
1453 packet->no_waiter = 1; 1465 packet->no_waiter = 1;
1454 packet->generation = get_hpsb_generation(host); 1466 packet->generation = get_hpsb_generation(host);
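
The resume-packet hunk above replaces the literal 0x003c0000 and the bare shift by 24 with EXTPHYPACKET_TYPE_RESUME and PHYPACKET_PORT_SHIFT; the value is unchanged, only the intent becomes readable. The same move in miniature (the macro values mirror the old literals, but the names below are illustrative stand-ins, not the real ieee1394 headers):

/* Illustrative stand-ins; the driver takes these from its headers. */
#define RESUME_TYPE	0x003c0000
#define PORT_SHIFT	24

static unsigned int make_resume_quadlet(unsigned int phy_id)
{
	/* Identical bits to 0x003c0000 | phy_id << 24, but the reader
	 * no longer has to decode magic numbers at the call site. */
	return RESUME_TYPE | phy_id << PORT_SHIFT;
}
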
diff --git a/drivers/ieee1394/nodemgr.h b/drivers/ieee1394/nodemgr.h
index 3a2f0c02fd08..0b26616e16c3 100644
--- a/drivers/ieee1394/nodemgr.h
+++ b/drivers/ieee1394/nodemgr.h
@@ -151,24 +151,6 @@ static inline int hpsb_node_entry_valid(struct node_entry *ne)
151} 151}
152 152
153/* 153/*
154 * Returns a node entry (which has its reference count incremented) or NULL if
155 * the GUID in question is not known. Getting a valid entry does not mean that
156 * the node with this GUID is currently accessible (might be powered down).
157 */
158struct node_entry *hpsb_guid_get_entry(u64 guid);
159
160/* Same as above, but use the nodeid to get a node entry. This is not
161 * fool-proof by itself, since the nodeid can change. */
162struct node_entry *hpsb_nodeid_get_entry(struct hpsb_host *host, nodeid_t nodeid);
163
164/*
165 * If the entry refers to a local host, this function will return the pointer
166 * to the hpsb_host structure. It will return NULL otherwise. Once you have
167 * established it is a local host, you can use that knowledge from then on (the
168 * GUID won't wander to an external node). */
169struct hpsb_host *hpsb_get_host_by_ne(struct node_entry *ne);
170
171/*
172 * This will fill in the given, pre-initialised hpsb_packet with the current 154 * This will fill in the given, pre-initialised hpsb_packet with the current
173 * information from the node entry (host, node ID, generation number). It will 155 * information from the node entry (host, node ID, generation number). It will
174 * return false if the node owning the GUID is not accessible (and not modify the 156 * return false if the node owning the GUID is not accessible (and not modify the
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 4cf9b8f3e336..b6b96fa04d62 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -161,9 +161,6 @@ printk(level "%s: " fmt "\n" , OHCI1394_DRIVER_NAME , ## args)
161#define PRINT(level, fmt, args...) \ 161#define PRINT(level, fmt, args...) \
162printk(level "%s: fw-host%d: " fmt "\n" , OHCI1394_DRIVER_NAME, ohci->host->id , ## args) 162printk(level "%s: fw-host%d: " fmt "\n" , OHCI1394_DRIVER_NAME, ohci->host->id , ## args)
163 163
164static char version[] __devinitdata =
165 "$Rev: 1313 $ Ben Collins <bcollins@debian.org>";
166
167/* Module Parameters */ 164/* Module Parameters */
168static int phys_dma = 1; 165static int phys_dma = 1;
169module_param(phys_dma, int, 0644); 166module_param(phys_dma, int, 0644);
@@ -587,12 +584,13 @@ static void ohci_initialize(struct ti_ohci *ohci)
587 sprintf (irq_buf, "%s", __irq_itoa(ohci->dev->irq)); 584 sprintf (irq_buf, "%s", __irq_itoa(ohci->dev->irq));
588#endif 585#endif
589 PRINT(KERN_INFO, "OHCI-1394 %d.%d (PCI): IRQ=[%s] " 586 PRINT(KERN_INFO, "OHCI-1394 %d.%d (PCI): IRQ=[%s] "
590 "MMIO=[%lx-%lx] Max Packet=[%d]", 587 "MMIO=[%lx-%lx] Max Packet=[%d] IR/IT contexts=[%d/%d]",
591 ((((buf) >> 16) & 0xf) + (((buf) >> 20) & 0xf) * 10), 588 ((((buf) >> 16) & 0xf) + (((buf) >> 20) & 0xf) * 10),
592 ((((buf) >> 4) & 0xf) + ((buf) & 0xf) * 10), irq_buf, 589 ((((buf) >> 4) & 0xf) + ((buf) & 0xf) * 10), irq_buf,
593 pci_resource_start(ohci->dev, 0), 590 pci_resource_start(ohci->dev, 0),
594 pci_resource_start(ohci->dev, 0) + OHCI1394_REGISTER_SIZE - 1, 591 pci_resource_start(ohci->dev, 0) + OHCI1394_REGISTER_SIZE - 1,
595 ohci->max_packet_size); 592 ohci->max_packet_size,
593 ohci->nb_iso_rcv_ctx, ohci->nb_iso_xmit_ctx);
596 594
597 /* Check all of our ports to make sure that if anything is 595 /* Check all of our ports to make sure that if anything is
598 * connected, we enable that port. */ 596 * connected, we enable that port. */
@@ -2960,28 +2958,23 @@ alloc_dma_rcv_ctx(struct ti_ohci *ohci, struct dma_rcv_ctx *d,
2960 d->ctrlClear = 0; 2958 d->ctrlClear = 0;
2961 d->cmdPtr = 0; 2959 d->cmdPtr = 0;
2962 2960
2963 d->buf_cpu = kmalloc(d->num_desc * sizeof(quadlet_t*), GFP_ATOMIC); 2961 d->buf_cpu = kzalloc(d->num_desc * sizeof(*d->buf_cpu), GFP_ATOMIC);
2964 d->buf_bus = kmalloc(d->num_desc * sizeof(dma_addr_t), GFP_ATOMIC); 2962 d->buf_bus = kzalloc(d->num_desc * sizeof(*d->buf_bus), GFP_ATOMIC);
2965 2963
2966 if (d->buf_cpu == NULL || d->buf_bus == NULL) { 2964 if (d->buf_cpu == NULL || d->buf_bus == NULL) {
2967 PRINT(KERN_ERR, "Failed to allocate dma buffer"); 2965 PRINT(KERN_ERR, "Failed to allocate dma buffer");
2968 free_dma_rcv_ctx(d); 2966 free_dma_rcv_ctx(d);
2969 return -ENOMEM; 2967 return -ENOMEM;
2970 } 2968 }
2971 memset(d->buf_cpu, 0, d->num_desc * sizeof(quadlet_t*));
2972 memset(d->buf_bus, 0, d->num_desc * sizeof(dma_addr_t));
2973 2969
2974 d->prg_cpu = kmalloc(d->num_desc * sizeof(struct dma_cmd*), 2970 d->prg_cpu = kzalloc(d->num_desc * sizeof(*d->prg_cpu), GFP_ATOMIC);
2975 GFP_ATOMIC); 2971 d->prg_bus = kzalloc(d->num_desc * sizeof(*d->prg_bus), GFP_ATOMIC);
2976 d->prg_bus = kmalloc(d->num_desc * sizeof(dma_addr_t), GFP_ATOMIC);
2977 2972
2978 if (d->prg_cpu == NULL || d->prg_bus == NULL) { 2973 if (d->prg_cpu == NULL || d->prg_bus == NULL) {
2979 PRINT(KERN_ERR, "Failed to allocate dma prg"); 2974 PRINT(KERN_ERR, "Failed to allocate dma prg");
2980 free_dma_rcv_ctx(d); 2975 free_dma_rcv_ctx(d);
2981 return -ENOMEM; 2976 return -ENOMEM;
2982 } 2977 }
2983 memset(d->prg_cpu, 0, d->num_desc * sizeof(struct dma_cmd*));
2984 memset(d->prg_bus, 0, d->num_desc * sizeof(dma_addr_t));
2985 2978
2986 d->spb = kmalloc(d->split_buf_size, GFP_ATOMIC); 2979 d->spb = kmalloc(d->split_buf_size, GFP_ATOMIC);
2987 2980
@@ -3093,17 +3086,14 @@ alloc_dma_trm_ctx(struct ti_ohci *ohci, struct dma_trm_ctx *d,
3093 d->ctrlClear = 0; 3086 d->ctrlClear = 0;
3094 d->cmdPtr = 0; 3087 d->cmdPtr = 0;
3095 3088
3096 d->prg_cpu = kmalloc(d->num_desc * sizeof(struct at_dma_prg*), 3089 d->prg_cpu = kzalloc(d->num_desc * sizeof(*d->prg_cpu), GFP_KERNEL);
3097 GFP_KERNEL); 3090 d->prg_bus = kzalloc(d->num_desc * sizeof(*d->prg_bus), GFP_KERNEL);
3098 d->prg_bus = kmalloc(d->num_desc * sizeof(dma_addr_t), GFP_KERNEL);
3099 3091
3100 if (d->prg_cpu == NULL || d->prg_bus == NULL) { 3092 if (d->prg_cpu == NULL || d->prg_bus == NULL) {
3101 PRINT(KERN_ERR, "Failed to allocate at dma prg"); 3093 PRINT(KERN_ERR, "Failed to allocate at dma prg");
3102 free_dma_trm_ctx(d); 3094 free_dma_trm_ctx(d);
3103 return -ENOMEM; 3095 return -ENOMEM;
3104 } 3096 }
3105 memset(d->prg_cpu, 0, d->num_desc * sizeof(struct at_dma_prg*));
3106 memset(d->prg_bus, 0, d->num_desc * sizeof(dma_addr_t));
3107 3097
3108 len = sprintf(pool_name, "ohci1394_trm_prg"); 3098 len = sprintf(pool_name, "ohci1394_trm_prg");
3109 sprintf(pool_name+len, "%d", num_allocs); 3099 sprintf(pool_name+len, "%d", num_allocs);
@@ -3201,8 +3191,6 @@ static struct hpsb_host_driver ohci1394_driver = {
3201 .hw_csr_reg = ohci_hw_csr_reg, 3191 .hw_csr_reg = ohci_hw_csr_reg,
3202}; 3192};
3203 3193
3204
3205
3206/*********************************** 3194/***********************************
3207 * PCI Driver Interface functions * 3195 * PCI Driver Interface functions *
3208 ***********************************/ 3196 ***********************************/
@@ -3217,15 +3205,10 @@ do { \
3217static int __devinit ohci1394_pci_probe(struct pci_dev *dev, 3205static int __devinit ohci1394_pci_probe(struct pci_dev *dev,
3218 const struct pci_device_id *ent) 3206 const struct pci_device_id *ent)
3219{ 3207{
3220 static int version_printed = 0;
3221
3222 struct hpsb_host *host; 3208 struct hpsb_host *host;
3223 struct ti_ohci *ohci; /* shortcut to currently handled device */ 3209 struct ti_ohci *ohci; /* shortcut to currently handled device */
3224 unsigned long ohci_base; 3210 unsigned long ohci_base;
3225 3211
3226 if (version_printed++ == 0)
3227 PRINT_G(KERN_INFO, "%s", version);
3228
3229 if (pci_enable_device(dev)) 3212 if (pci_enable_device(dev))
3230 FAIL(-ENXIO, "Failed to enable OHCI hardware"); 3213 FAIL(-ENXIO, "Failed to enable OHCI hardware");
3231 pci_set_master(dev); 3214 pci_set_master(dev);
@@ -3369,13 +3352,8 @@ static int __devinit ohci1394_pci_probe(struct pci_dev *dev,
3369 /* Determine the number of available IR and IT contexts. */ 3352 /* Determine the number of available IR and IT contexts. */
3370 ohci->nb_iso_rcv_ctx = 3353 ohci->nb_iso_rcv_ctx =
3371 get_nb_iso_ctx(ohci, OHCI1394_IsoRecvIntMaskSet); 3354 get_nb_iso_ctx(ohci, OHCI1394_IsoRecvIntMaskSet);
3372 DBGMSG("%d iso receive contexts available",
3373 ohci->nb_iso_rcv_ctx);
3374
3375 ohci->nb_iso_xmit_ctx = 3355 ohci->nb_iso_xmit_ctx =
3376 get_nb_iso_ctx(ohci, OHCI1394_IsoXmitIntMaskSet); 3356 get_nb_iso_ctx(ohci, OHCI1394_IsoXmitIntMaskSet);
3377 DBGMSG("%d iso transmit contexts available",
3378 ohci->nb_iso_xmit_ctx);
3379 3357
3380 /* Set the usage bits for non-existent contexts so they can't 3358 /* Set the usage bits for non-existent contexts so they can't
3381 * be allocated */ 3359 * be allocated */
@@ -3606,8 +3584,6 @@ static struct pci_driver ohci1394_pci_driver = {
3606 .suspend = ohci1394_pci_suspend, 3584 .suspend = ohci1394_pci_suspend,
3607}; 3585};
3608 3586
3609
3610
3611/*********************************** 3587/***********************************
3612 * OHCI1394 Video Interface * 3588 * OHCI1394 Video Interface *
3613 ***********************************/ 3589 ***********************************/
@@ -3714,7 +3690,6 @@ EXPORT_SYMBOL(ohci1394_init_iso_tasklet);
3714EXPORT_SYMBOL(ohci1394_register_iso_tasklet); 3690EXPORT_SYMBOL(ohci1394_register_iso_tasklet);
3715EXPORT_SYMBOL(ohci1394_unregister_iso_tasklet); 3691EXPORT_SYMBOL(ohci1394_unregister_iso_tasklet);
3716 3692
3717
3718/*********************************** 3693/***********************************
3719 * General module initialization * 3694 * General module initialization *
3720 ***********************************/ 3695 ***********************************/
diff --git a/drivers/ieee1394/ohci1394.h b/drivers/ieee1394/ohci1394.h
index cc66c1cae250..7df0962144e3 100644
--- a/drivers/ieee1394/ohci1394.h
+++ b/drivers/ieee1394/ohci1394.h
@@ -219,8 +219,8 @@ struct ti_ohci {
219 219
220 int self_id_errors; 220 int self_id_errors;
221 221
222 /* Tasklets for iso receive and transmit, used by video1394, 222 /* Tasklets for iso receive and transmit, used by video1394
223 * amdtp and dv1394 */ 223 * and dv1394 */
224 224
225 struct list_head iso_tasklet_list; 225 struct list_head iso_tasklet_list;
226 spinlock_t iso_tasklet_list_lock; 226 spinlock_t iso_tasklet_list_lock;
diff --git a/drivers/ieee1394/pcilynx.c b/drivers/ieee1394/pcilynx.c
index 6b1ab875333b..e2edc41e1b6f 100644
--- a/drivers/ieee1394/pcilynx.c
+++ b/drivers/ieee1394/pcilynx.c
@@ -1435,7 +1435,7 @@ static int __devinit add_card(struct pci_dev *dev,
1435 struct i2c_algo_bit_data i2c_adapter_data; 1435 struct i2c_algo_bit_data i2c_adapter_data;
1436 1436
1437 error = -ENOMEM; 1437 error = -ENOMEM;
1438 i2c_ad = kmalloc(sizeof(struct i2c_adapter), SLAB_KERNEL); 1438 i2c_ad = kmalloc(sizeof(*i2c_ad), SLAB_KERNEL);
1439 if (!i2c_ad) FAIL("failed to allocate I2C adapter memory"); 1439 if (!i2c_ad) FAIL("failed to allocate I2C adapter memory");
1440 1440
1441 memcpy(i2c_ad, &bit_ops, sizeof(struct i2c_adapter)); 1441 memcpy(i2c_ad, &bit_ops, sizeof(struct i2c_adapter));
diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 24411e666b21..b05235639918 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -102,12 +102,9 @@ static struct pending_request *__alloc_pending_request(gfp_t flags)
102{ 102{
103 struct pending_request *req; 103 struct pending_request *req;
104 104
105 req = (struct pending_request *)kmalloc(sizeof(struct pending_request), 105 req = kzalloc(sizeof(*req), flags);
106 flags); 106 if (req)
107 if (req != NULL) {
108 memset(req, 0, sizeof(struct pending_request));
109 INIT_LIST_HEAD(&req->list); 107 INIT_LIST_HEAD(&req->list);
110 }
111 108
112 return req; 109 return req;
113} 110}
@@ -192,9 +189,9 @@ static void add_host(struct hpsb_host *host)
192 struct host_info *hi; 189 struct host_info *hi;
193 unsigned long flags; 190 unsigned long flags;
194 191
195 hi = (struct host_info *)kmalloc(sizeof(struct host_info), GFP_KERNEL); 192 hi = kmalloc(sizeof(*hi), GFP_KERNEL);
196 193
197 if (hi != NULL) { 194 if (hi) {
198 INIT_LIST_HEAD(&hi->list); 195 INIT_LIST_HEAD(&hi->list);
199 hi->host = host; 196 hi->host = host;
200 INIT_LIST_HEAD(&hi->file_info_list); 197 INIT_LIST_HEAD(&hi->file_info_list);
@@ -315,8 +312,8 @@ static void iso_receive(struct hpsb_host *host, int channel, quadlet_t * data,
315 break; 312 break;
316 313
317 if (!ibs) { 314 if (!ibs) {
318 ibs = kmalloc(sizeof(struct iso_block_store) 315 ibs = kmalloc(sizeof(*ibs) + length,
319 + length, SLAB_ATOMIC); 316 SLAB_ATOMIC);
320 if (!ibs) { 317 if (!ibs) {
321 kfree(req); 318 kfree(req);
322 break; 319 break;
@@ -376,8 +373,8 @@ static void fcp_request(struct hpsb_host *host, int nodeid, int direction,
376 break; 373 break;
377 374
378 if (!ibs) { 375 if (!ibs) {
379 ibs = kmalloc(sizeof(struct iso_block_store) 376 ibs = kmalloc(sizeof(*ibs) + length,
380 + length, SLAB_ATOMIC); 377 SLAB_ATOMIC);
381 if (!ibs) { 378 if (!ibs) {
382 kfree(req); 379 kfree(req);
383 break; 380 break;
@@ -502,10 +499,9 @@ static int state_initialized(struct file_info *fi, struct pending_request *req)
502 switch (req->req.type) { 499 switch (req->req.type) {
503 case RAW1394_REQ_LIST_CARDS: 500 case RAW1394_REQ_LIST_CARDS:
504 spin_lock_irqsave(&host_info_lock, flags); 501 spin_lock_irqsave(&host_info_lock, flags);
505 khl = kmalloc(sizeof(struct raw1394_khost_list) * host_count, 502 khl = kmalloc(sizeof(*khl) * host_count, SLAB_ATOMIC);
506 SLAB_ATOMIC);
507 503
508 if (khl != NULL) { 504 if (khl) {
509 req->req.misc = host_count; 505 req->req.misc = host_count;
510 req->data = (quadlet_t *) khl; 506 req->data = (quadlet_t *) khl;
511 507
@@ -517,7 +513,7 @@ static int state_initialized(struct file_info *fi, struct pending_request *req)
517 } 513 }
518 spin_unlock_irqrestore(&host_info_lock, flags); 514 spin_unlock_irqrestore(&host_info_lock, flags);
519 515
520 if (khl != NULL) { 516 if (khl) {
521 req->req.error = RAW1394_ERROR_NONE; 517 req->req.error = RAW1394_ERROR_NONE;
522 req->req.length = min(req->req.length, 518 req->req.length = min(req->req.length,
523 (u32) (sizeof 519 (u32) (sizeof
@@ -1647,13 +1643,13 @@ static int arm_register(struct file_info *fi, struct pending_request *req)
1647 return (-EINVAL); 1643 return (-EINVAL);
1648 } 1644 }
1649 /* addr-list-entry for fileinfo */ 1645 /* addr-list-entry for fileinfo */
1650 addr = (struct arm_addr *)kmalloc(sizeof(struct arm_addr), SLAB_KERNEL); 1646 addr = kmalloc(sizeof(*addr), SLAB_KERNEL);
1651 if (!addr) { 1647 if (!addr) {
1652 req->req.length = 0; 1648 req->req.length = 0;
1653 return (-ENOMEM); 1649 return (-ENOMEM);
1654 } 1650 }
1655 /* allocation of addr_space_buffer */ 1651 /* allocation of addr_space_buffer */
1656 addr->addr_space_buffer = (u8 *) vmalloc(req->req.length); 1652 addr->addr_space_buffer = vmalloc(req->req.length);
1657 if (!(addr->addr_space_buffer)) { 1653 if (!(addr->addr_space_buffer)) {
1658 kfree(addr); 1654 kfree(addr);
1659 req->req.length = 0; 1655 req->req.length = 0;
@@ -2122,8 +2118,7 @@ static int modify_config_rom(struct file_info *fi, struct pending_request *req)
2122 return -ENOMEM; 2118 return -ENOMEM;
2123 } 2119 }
2124 2120
2125 cache->filled_head = 2121 cache->filled_head = kmalloc(sizeof(*cache->filled_head), GFP_KERNEL);
2126 kmalloc(sizeof(struct csr1212_cache_region), GFP_KERNEL);
2127 if (!cache->filled_head) { 2122 if (!cache->filled_head) {
2128 csr1212_release_keyval(fi->csr1212_dirs[dr]); 2123 csr1212_release_keyval(fi->csr1212_dirs[dr]);
2129 fi->csr1212_dirs[dr] = NULL; 2124 fi->csr1212_dirs[dr] = NULL;
@@ -2136,7 +2131,6 @@ static int modify_config_rom(struct file_info *fi, struct pending_request *req)
2136 req->req.length)) { 2131 req->req.length)) {
2137 csr1212_release_keyval(fi->csr1212_dirs[dr]); 2132 csr1212_release_keyval(fi->csr1212_dirs[dr]);
2138 fi->csr1212_dirs[dr] = NULL; 2133 fi->csr1212_dirs[dr] = NULL;
2139 CSR1212_FREE(cache);
2140 ret = -EFAULT; 2134 ret = -EFAULT;
2141 } else { 2135 } else {
2142 cache->len = req->req.length; 2136 cache->len = req->req.length;
@@ -2172,7 +2166,7 @@ static int modify_config_rom(struct file_info *fi, struct pending_request *req)
2172 } 2166 }
2173 } 2167 }
2174 kfree(cache->filled_head); 2168 kfree(cache->filled_head);
2175 kfree(cache); 2169 CSR1212_FREE(cache);
2176 2170
2177 if (ret >= 0) { 2171 if (ret >= 0) {
2178 /* we have to free the request, because we queue no response, 2172 /* we have to free the request, because we queue no response,
@@ -2488,8 +2482,8 @@ static int raw1394_iso_recv_packets(struct file_info *fi, void __user * uaddr)
2488 2482
2489 /* ensure user-supplied buffer is accessible and big enough */ 2483 /* ensure user-supplied buffer is accessible and big enough */
2490 if (!access_ok(VERIFY_WRITE, upackets.infos, 2484 if (!access_ok(VERIFY_WRITE, upackets.infos,
2491 upackets.n_packets * 2485 upackets.n_packets *
2492 sizeof(struct raw1394_iso_packet_info))) 2486 sizeof(struct raw1394_iso_packet_info)))
2493 return -EFAULT; 2487 return -EFAULT;
2494 2488
2495 /* copy the packet_infos out */ 2489 /* copy the packet_infos out */
@@ -2522,8 +2516,8 @@ static int raw1394_iso_send_packets(struct file_info *fi, void __user * uaddr)
2522 2516
2523 /* ensure user-supplied buffer is accessible and big enough */ 2517 /* ensure user-supplied buffer is accessible and big enough */
2524 if (!access_ok(VERIFY_READ, upackets.infos, 2518 if (!access_ok(VERIFY_READ, upackets.infos,
2525 upackets.n_packets * 2519 upackets.n_packets *
2526 sizeof(struct raw1394_iso_packet_info))) 2520 sizeof(struct raw1394_iso_packet_info)))
2527 return -EFAULT; 2521 return -EFAULT;
2528 2522
2529 /* copy the infos structs in and queue the packets */ 2523 /* copy the infos structs in and queue the packets */
@@ -2684,11 +2678,10 @@ static int raw1394_open(struct inode *inode, struct file *file)
2684{ 2678{
2685 struct file_info *fi; 2679 struct file_info *fi;
2686 2680
2687 fi = kmalloc(sizeof(struct file_info), SLAB_KERNEL); 2681 fi = kzalloc(sizeof(*fi), SLAB_KERNEL);
2688 if (fi == NULL) 2682 if (!fi)
2689 return -ENOMEM; 2683 return -ENOMEM;
2690 2684
2691 memset(fi, 0, sizeof(struct file_info));
2692 fi->notification = (u8) RAW1394_NOTIFY_ON; /* busreset notification */ 2685 fi->notification = (u8) RAW1394_NOTIFY_ON; /* busreset notification */
2693 2686
2694 INIT_LIST_HEAD(&fi->list); 2687 INIT_LIST_HEAD(&fi->list);
@@ -2748,8 +2741,7 @@ static int raw1394_release(struct inode *inode, struct file *file)
2748 list) { 2741 list) {
2749 entry = fi_hlp->addr_list.next; 2742 entry = fi_hlp->addr_list.next;
2750 while (entry != &(fi_hlp->addr_list)) { 2743 while (entry != &(fi_hlp->addr_list)) {
2751 arm_addr = list_entry(entry, 2744 arm_addr = list_entry(entry, struct
2752 struct
2753 arm_addr, 2745 arm_addr,
2754 addr_list); 2746 addr_list);
2755 if (arm_addr->start == 2747 if (arm_addr->start ==
@@ -2912,16 +2904,17 @@ static int __init init_raw1394(void)
2912 2904
2913 hpsb_register_highlevel(&raw1394_highlevel); 2905 hpsb_register_highlevel(&raw1394_highlevel);
2914 2906
2915 if (IS_ERR(class_device_create(hpsb_protocol_class, NULL, MKDEV( 2907 if (IS_ERR
2916 IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16), 2908 (class_device_create
2917 NULL, RAW1394_DEVICE_NAME))) { 2909 (hpsb_protocol_class, NULL,
2910 MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16), NULL,
2911 RAW1394_DEVICE_NAME))) {
2918 ret = -EFAULT; 2912 ret = -EFAULT;
2919 goto out_unreg; 2913 goto out_unreg;
2920 } 2914 }
2921 2915
2922 devfs_mk_cdev(MKDEV( 2916 devfs_mk_cdev(MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16),
2923 IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16), 2917 S_IFCHR | S_IRUSR | S_IWUSR, RAW1394_DEVICE_NAME);
2924 S_IFCHR | S_IRUSR | S_IWUSR, RAW1394_DEVICE_NAME);
2925 2918
2926 cdev_init(&raw1394_cdev, &raw1394_fops); 2919 cdev_init(&raw1394_cdev, &raw1394_fops);
2927 raw1394_cdev.owner = THIS_MODULE; 2920 raw1394_cdev.owner = THIS_MODULE;
@@ -2943,20 +2936,22 @@ static int __init init_raw1394(void)
2943 2936
2944 goto out; 2937 goto out;
2945 2938
2946out_dev: 2939 out_dev:
2947 devfs_remove(RAW1394_DEVICE_NAME); 2940 devfs_remove(RAW1394_DEVICE_NAME);
2948 class_device_destroy(hpsb_protocol_class, 2941 class_device_destroy(hpsb_protocol_class,
2949 MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16)); 2942 MKDEV(IEEE1394_MAJOR,
2950out_unreg: 2943 IEEE1394_MINOR_BLOCK_RAW1394 * 16));
2944 out_unreg:
2951 hpsb_unregister_highlevel(&raw1394_highlevel); 2945 hpsb_unregister_highlevel(&raw1394_highlevel);
2952out: 2946 out:
2953 return ret; 2947 return ret;
2954} 2948}
2955 2949
2956static void __exit cleanup_raw1394(void) 2950static void __exit cleanup_raw1394(void)
2957{ 2951{
2958 class_device_destroy(hpsb_protocol_class, 2952 class_device_destroy(hpsb_protocol_class,
2959 MKDEV(IEEE1394_MAJOR, IEEE1394_MINOR_BLOCK_RAW1394 * 16)); 2953 MKDEV(IEEE1394_MAJOR,
2954 IEEE1394_MINOR_BLOCK_RAW1394 * 16));
2960 cdev_del(&raw1394_cdev); 2955 cdev_del(&raw1394_cdev);
2961 devfs_remove(RAW1394_DEVICE_NAME); 2956 devfs_remove(RAW1394_DEVICE_NAME);
2962 hpsb_unregister_highlevel(&raw1394_highlevel); 2957 hpsb_unregister_highlevel(&raw1394_highlevel);
diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index f7e18ccc5c0a..18d7eda38851 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -80,9 +80,6 @@
80#include "ieee1394_transactions.h" 80#include "ieee1394_transactions.h"
81#include "sbp2.h" 81#include "sbp2.h"
82 82
83static char version[] __devinitdata =
84 "$Rev: 1306 $ Ben Collins <bcollins@debian.org>";
85
86/* 83/*
87 * Module load parameter definitions 84 * Module load parameter definitions
88 */ 85 */
@@ -151,18 +148,15 @@ static int force_inquiry_hack;
151module_param(force_inquiry_hack, int, 0444); 148module_param(force_inquiry_hack, int, 0444);
152MODULE_PARM_DESC(force_inquiry_hack, "Force SCSI inquiry hack (default = 0)"); 149MODULE_PARM_DESC(force_inquiry_hack, "Force SCSI inquiry hack (default = 0)");
153 150
154
155/* 151/*
156 * Export information about protocols/devices supported by this driver. 152 * Export information about protocols/devices supported by this driver.
157 */ 153 */
158static struct ieee1394_device_id sbp2_id_table[] = { 154static struct ieee1394_device_id sbp2_id_table[] = {
159 { 155 {
160 .match_flags =IEEE1394_MATCH_SPECIFIER_ID | 156 .match_flags = IEEE1394_MATCH_SPECIFIER_ID | IEEE1394_MATCH_VERSION,
161 IEEE1394_MATCH_VERSION, 157 .specifier_id = SBP2_UNIT_SPEC_ID_ENTRY & 0xffffff,
162 .specifier_id = SBP2_UNIT_SPEC_ID_ENTRY & 0xffffff, 158 .version = SBP2_SW_VERSION_ENTRY & 0xffffff},
163 .version = SBP2_SW_VERSION_ENTRY & 0xffffff 159 {}
164 },
165 { }
166}; 160};
167 161
168MODULE_DEVICE_TABLE(ieee1394, sbp2_id_table); 162MODULE_DEVICE_TABLE(ieee1394, sbp2_id_table);
@@ -221,7 +215,6 @@ static u32 global_outstanding_dmas = 0;
221 215
222#define SBP2_ERR(fmt, args...) HPSB_ERR("sbp2: "fmt, ## args) 216#define SBP2_ERR(fmt, args...) HPSB_ERR("sbp2: "fmt, ## args)
223 217
224
225/* 218/*
226 * Globals 219 * Globals
227 */ 220 */
@@ -254,8 +247,8 @@ static struct hpsb_address_ops sbp2_ops = {
254 247
255#ifdef CONFIG_IEEE1394_SBP2_PHYS_DMA 248#ifdef CONFIG_IEEE1394_SBP2_PHYS_DMA
256static struct hpsb_address_ops sbp2_physdma_ops = { 249static struct hpsb_address_ops sbp2_physdma_ops = {
257 .read = sbp2_handle_physdma_read, 250 .read = sbp2_handle_physdma_read,
258 .write = sbp2_handle_physdma_write, 251 .write = sbp2_handle_physdma_write,
259}; 252};
260#endif 253#endif
261 254
@@ -287,7 +280,6 @@ static u32 sbp2_broken_inquiry_list[] = {
287 * General utility functions 280 * General utility functions
288 **************************************/ 281 **************************************/
289 282
290
291#ifndef __BIG_ENDIAN 283#ifndef __BIG_ENDIAN
292/* 284/*
293 * Converts a buffer from be32 to cpu byte ordering. Length is in bytes. 285 * Converts a buffer from be32 to cpu byte ordering. Length is in bytes.
@@ -324,7 +316,8 @@ static __inline__ void sbp2util_cpu_to_be32_buffer(void *buffer, int length)
324/* 316/*
325 * Debug packet dump routine. Length is in bytes. 317 * Debug packet dump routine. Length is in bytes.
326 */ 318 */
327static void sbp2util_packet_dump(void *buffer, int length, char *dump_name, u32 dump_phys_addr) 319static void sbp2util_packet_dump(void *buffer, int length, char *dump_name,
320 u32 dump_phys_addr)
328{ 321{
329 int i; 322 int i;
330 unsigned char *dump = buffer; 323 unsigned char *dump = buffer;
@@ -345,7 +338,7 @@ static void sbp2util_packet_dump(void *buffer, int length, char *dump_name, u32
345 printk(" "); 338 printk(" ");
346 if ((i & 0xf) == 0) 339 if ((i & 0xf) == 0)
347 printk("\n "); 340 printk("\n ");
348 printk("%02x ", (int) dump[i]); 341 printk("%02x ", (int)dump[i]);
349 } 342 }
350 printk("\n"); 343 printk("\n");
351 344
@@ -364,9 +357,9 @@ static int sbp2util_down_timeout(atomic_t *done, int timeout)
364 357
365 for (i = timeout; (i > 0 && atomic_read(done) == 0); i-= HZ/10) { 358 for (i = timeout; (i > 0 && atomic_read(done) == 0); i-= HZ/10) {
366 if (msleep_interruptible(100)) /* 100ms */ 359 if (msleep_interruptible(100)) /* 100ms */
367 return(1); 360 return 1;
368 } 361 }
369 return ((i > 0) ? 0:1); 362 return (i > 0) ? 0 : 1;
370} 363}
371 364
372/* Frees an allocated packet */ 365/* Frees an allocated packet */
@@ -380,21 +373,22 @@ static void sbp2_free_packet(struct hpsb_packet *packet)
380 * subaction and returns immediately. Can be used from interrupts. 373 * subaction and returns immediately. Can be used from interrupts.
381 */ 374 */
382static int sbp2util_node_write_no_wait(struct node_entry *ne, u64 addr, 375static int sbp2util_node_write_no_wait(struct node_entry *ne, u64 addr,
383 quadlet_t *buffer, size_t length) 376 quadlet_t *buffer, size_t length)
384{ 377{
385 struct hpsb_packet *packet; 378 struct hpsb_packet *packet;
386 379
387 packet = hpsb_make_writepacket(ne->host, ne->nodeid, 380 packet = hpsb_make_writepacket(ne->host, ne->nodeid,
388 addr, buffer, length); 381 addr, buffer, length);
389 if (!packet) 382 if (!packet)
390 return -ENOMEM; 383 return -ENOMEM;
391 384
392 hpsb_set_packet_complete_task(packet, (void (*)(void*))sbp2_free_packet, 385 hpsb_set_packet_complete_task(packet,
386 (void (*)(void *))sbp2_free_packet,
393 packet); 387 packet);
394 388
395 hpsb_node_fill_packet(ne, packet); 389 hpsb_node_fill_packet(ne, packet);
396 390
397 if (hpsb_send_packet(packet) < 0) { 391 if (hpsb_send_packet(packet) < 0) {
398 sbp2_free_packet(packet); 392 sbp2_free_packet(packet);
399 return -EIO; 393 return -EIO;
400 } 394 }
@@ -417,22 +411,22 @@ static int sbp2util_create_command_orb_pool(struct scsi_id_instance_data *scsi_i
417 411
418 spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); 412 spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags);
419 for (i = 0; i < orbs; i++) { 413 for (i = 0; i < orbs; i++) {
420 command = (struct sbp2_command_info *) 414 command = kzalloc(sizeof(*command), GFP_ATOMIC);
421 kmalloc(sizeof(struct sbp2_command_info), GFP_ATOMIC);
422 if (!command) { 415 if (!command) {
423 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); 416 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock,
424 return(-ENOMEM); 417 flags);
418 return -ENOMEM;
425 } 419 }
426 memset(command, '\0', sizeof(struct sbp2_command_info));
427 command->command_orb_dma = 420 command->command_orb_dma =
428 pci_map_single (hi->host->pdev, &command->command_orb, 421 pci_map_single(hi->host->pdev, &command->command_orb,
429 sizeof(struct sbp2_command_orb), 422 sizeof(struct sbp2_command_orb),
430 PCI_DMA_BIDIRECTIONAL); 423 PCI_DMA_BIDIRECTIONAL);
431 SBP2_DMA_ALLOC("single command orb DMA"); 424 SBP2_DMA_ALLOC("single command orb DMA");
432 command->sge_dma = 425 command->sge_dma =
433 pci_map_single (hi->host->pdev, &command->scatter_gather_element, 426 pci_map_single(hi->host->pdev,
434 sizeof(command->scatter_gather_element), 427 &command->scatter_gather_element,
435 PCI_DMA_BIDIRECTIONAL); 428 sizeof(command->scatter_gather_element),
429 PCI_DMA_BIDIRECTIONAL);
436 SBP2_DMA_ALLOC("scatter_gather_element"); 430 SBP2_DMA_ALLOC("scatter_gather_element");
437 INIT_LIST_HEAD(&command->list); 431 INIT_LIST_HEAD(&command->list);
438 list_add_tail(&command->list, &scsi_id->sbp2_command_orb_completed); 432 list_add_tail(&command->list, &scsi_id->sbp2_command_orb_completed);
@@ -488,7 +482,7 @@ static struct sbp2_command_info *sbp2util_find_command_for_orb(
488 list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) { 482 list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) {
489 if (command->command_orb_dma == orb) { 483 if (command->command_orb_dma == orb) {
490 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); 484 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
491 return (command); 485 return command;
492 } 486 }
493 } 487 }
494 } 488 }
@@ -496,7 +490,7 @@ static struct sbp2_command_info *sbp2util_find_command_for_orb(
496 490
497 SBP2_ORB_DEBUG("could not match command orb %x", (unsigned int)orb); 491 SBP2_ORB_DEBUG("could not match command orb %x", (unsigned int)orb);
498 492
499 return(NULL); 493 return NULL;
500} 494}
501 495
502/* 496/*
@@ -513,12 +507,12 @@ static struct sbp2_command_info *sbp2util_find_command_for_SCpnt(struct scsi_id_
513 list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) { 507 list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) {
514 if (command->Current_SCpnt == SCpnt) { 508 if (command->Current_SCpnt == SCpnt) {
515 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); 509 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
516 return (command); 510 return command;
517 } 511 }
518 } 512 }
519 } 513 }
520 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); 514 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
521 return(NULL); 515 return NULL;
522} 516}
523 517
524/* 518/*
@@ -545,7 +539,7 @@ static struct sbp2_command_info *sbp2util_allocate_command_orb(
545 SBP2_ERR("sbp2util_allocate_command_orb - No orbs available!"); 539 SBP2_ERR("sbp2util_allocate_command_orb - No orbs available!");
546 } 540 }
547 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); 541 spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
548 return (command); 542 return command;
549} 543}
550 544
551/* Free our DMAs */ 545/* Free our DMAs */
@@ -587,7 +581,8 @@ static void sbp2util_free_command_dma(struct sbp2_command_info *command)
587/* 581/*
588 * This function moves a command to the completed orb list. 582 * This function moves a command to the completed orb list.
589 */ 583 */
590static void sbp2util_mark_command_completed(struct scsi_id_instance_data *scsi_id, struct sbp2_command_info *command) 584static void sbp2util_mark_command_completed(struct scsi_id_instance_data *scsi_id,
585 struct sbp2_command_info *command)
591{ 586{
592 unsigned long flags; 587 unsigned long flags;
593 588
@@ -606,8 +601,6 @@ static inline int sbp2util_node_is_available(struct scsi_id_instance_data *scsi_
606 return scsi_id && scsi_id->ne && !scsi_id->ne->in_limbo; 601 return scsi_id && scsi_id->ne && !scsi_id->ne->in_limbo;
607} 602}
608 603
609
610
611/********************************************* 604/*********************************************
612 * IEEE-1394 core driver stack related section 605 * IEEE-1394 core driver stack related section
613 *********************************************/ 606 *********************************************/
@@ -627,14 +620,14 @@ static int sbp2_probe(struct device *dev)
627 if (ud->flags & UNIT_DIRECTORY_HAS_LUN_DIRECTORY) 620 if (ud->flags & UNIT_DIRECTORY_HAS_LUN_DIRECTORY)
628 return -ENODEV; 621 return -ENODEV;
629 622
630 scsi_id = sbp2_alloc_device(ud); 623 scsi_id = sbp2_alloc_device(ud);
631 624
632 if (!scsi_id) 625 if (!scsi_id)
633 return -ENOMEM; 626 return -ENOMEM;
634 627
635 sbp2_parse_unit_directory(scsi_id, ud); 628 sbp2_parse_unit_directory(scsi_id, ud);
636 629
637 return sbp2_start_device(scsi_id); 630 return sbp2_start_device(scsi_id);
638} 631}
639 632
640static int sbp2_remove(struct device *dev) 633static int sbp2_remove(struct device *dev)
@@ -719,12 +712,11 @@ static struct scsi_id_instance_data *sbp2_alloc_device(struct unit_directory *ud
719 712
720 SBP2_DEBUG("sbp2_alloc_device"); 713 SBP2_DEBUG("sbp2_alloc_device");
721 714
722 scsi_id = kmalloc(sizeof(*scsi_id), GFP_KERNEL); 715 scsi_id = kzalloc(sizeof(*scsi_id), GFP_KERNEL);
723 if (!scsi_id) { 716 if (!scsi_id) {
724 SBP2_ERR("failed to create scsi_id"); 717 SBP2_ERR("failed to create scsi_id");
725 goto failed_alloc; 718 goto failed_alloc;
726 } 719 }
727 memset(scsi_id, 0, sizeof(*scsi_id));
728 720
729 scsi_id->ne = ud->ne; 721 scsi_id->ne = ud->ne;
730 scsi_id->ud = ud; 722 scsi_id->ud = ud;
@@ -735,7 +727,7 @@ static struct scsi_id_instance_data *sbp2_alloc_device(struct unit_directory *ud
735 INIT_LIST_HEAD(&scsi_id->sbp2_command_orb_completed); 727 INIT_LIST_HEAD(&scsi_id->sbp2_command_orb_completed);
736 INIT_LIST_HEAD(&scsi_id->scsi_list); 728 INIT_LIST_HEAD(&scsi_id->scsi_list);
737 spin_lock_init(&scsi_id->sbp2_command_orb_lock); 729 spin_lock_init(&scsi_id->sbp2_command_orb_lock);
738 scsi_id->sbp2_device_type_and_lun = SBP2_DEVICE_TYPE_LUN_UNINITIALIZED; 730 scsi_id->sbp2_lun = 0;
739 731
740 ud->device.driver_data = scsi_id; 732 ud->device.driver_data = scsi_id;
741 733
@@ -769,7 +761,7 @@ static struct scsi_id_instance_data *sbp2_alloc_device(struct unit_directory *ud
769 761
770 /* Register our host with the SCSI stack. */ 762 /* Register our host with the SCSI stack. */
771 scsi_host = scsi_host_alloc(&scsi_driver_template, 763 scsi_host = scsi_host_alloc(&scsi_driver_template,
772 sizeof (unsigned long)); 764 sizeof(unsigned long));
773 if (!scsi_host) { 765 if (!scsi_host) {
774 SBP2_ERR("failed to register scsi host"); 766 SBP2_ERR("failed to register scsi host");
775 goto failed_alloc; 767 goto failed_alloc;
@@ -790,7 +782,6 @@ failed_alloc:
790 return NULL; 782 return NULL;
791} 783}
792 784
793
794static void sbp2_host_reset(struct hpsb_host *host) 785static void sbp2_host_reset(struct hpsb_host *host)
795{ 786{
796 struct sbp2scsi_host_info *hi; 787 struct sbp2scsi_host_info *hi;
@@ -804,7 +795,6 @@ static void sbp2_host_reset(struct hpsb_host *host)
804 } 795 }
805} 796}
806 797
807
808/* 798/*
809 * This function is where we first pull the node unique ids, and then 799 * This function is where we first pull the node unique ids, and then
810 * allocate memory and register a SBP-2 device. 800 * allocate memory and register a SBP-2 device.
@@ -818,7 +808,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
818 808
819 /* Login FIFO DMA */ 809 /* Login FIFO DMA */
820 scsi_id->login_response = 810 scsi_id->login_response =
821 pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_login_response), 811 pci_alloc_consistent(hi->host->pdev,
812 sizeof(struct sbp2_login_response),
822 &scsi_id->login_response_dma); 813 &scsi_id->login_response_dma);
823 if (!scsi_id->login_response) 814 if (!scsi_id->login_response)
824 goto alloc_fail; 815 goto alloc_fail;
@@ -826,7 +817,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
826 817
827 /* Query logins ORB DMA */ 818 /* Query logins ORB DMA */
828 scsi_id->query_logins_orb = 819 scsi_id->query_logins_orb =
829 pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_query_logins_orb), 820 pci_alloc_consistent(hi->host->pdev,
821 sizeof(struct sbp2_query_logins_orb),
830 &scsi_id->query_logins_orb_dma); 822 &scsi_id->query_logins_orb_dma);
831 if (!scsi_id->query_logins_orb) 823 if (!scsi_id->query_logins_orb)
832 goto alloc_fail; 824 goto alloc_fail;
@@ -834,7 +826,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
834 826
835 /* Query logins response DMA */ 827 /* Query logins response DMA */
836 scsi_id->query_logins_response = 828 scsi_id->query_logins_response =
837 pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_query_logins_response), 829 pci_alloc_consistent(hi->host->pdev,
830 sizeof(struct sbp2_query_logins_response),
838 &scsi_id->query_logins_response_dma); 831 &scsi_id->query_logins_response_dma);
839 if (!scsi_id->query_logins_response) 832 if (!scsi_id->query_logins_response)
840 goto alloc_fail; 833 goto alloc_fail;
@@ -842,7 +835,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
842 835
843 /* Reconnect ORB DMA */ 836 /* Reconnect ORB DMA */
844 scsi_id->reconnect_orb = 837 scsi_id->reconnect_orb =
845 pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_reconnect_orb), 838 pci_alloc_consistent(hi->host->pdev,
839 sizeof(struct sbp2_reconnect_orb),
846 &scsi_id->reconnect_orb_dma); 840 &scsi_id->reconnect_orb_dma);
847 if (!scsi_id->reconnect_orb) 841 if (!scsi_id->reconnect_orb)
848 goto alloc_fail; 842 goto alloc_fail;
@@ -850,7 +844,8 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
850 844
851 /* Logout ORB DMA */ 845 /* Logout ORB DMA */
852 scsi_id->logout_orb = 846 scsi_id->logout_orb =
853 pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_logout_orb), 847 pci_alloc_consistent(hi->host->pdev,
848 sizeof(struct sbp2_logout_orb),
854 &scsi_id->logout_orb_dma); 849 &scsi_id->logout_orb_dma);
855 if (!scsi_id->logout_orb) 850 if (!scsi_id->logout_orb)
856 goto alloc_fail; 851 goto alloc_fail;
@@ -858,58 +853,11 @@ static int sbp2_start_device(struct scsi_id_instance_data *scsi_id)
858 853
859 /* Login ORB DMA */ 854 /* Login ORB DMA */
860 scsi_id->login_orb = 855 scsi_id->login_orb =
861 pci_alloc_consistent(hi->host->pdev, sizeof(struct sbp2_login_orb), 856 pci_alloc_consistent(hi->host->pdev,
857 sizeof(struct sbp2_login_orb),
862 &scsi_id->login_orb_dma); 858 &scsi_id->login_orb_dma);
863 if (!scsi_id->login_orb) { 859 if (!scsi_id->login_orb)
864alloc_fail: 860 goto alloc_fail;
865 if (scsi_id->query_logins_response) {
866 pci_free_consistent(hi->host->pdev,
867 sizeof(struct sbp2_query_logins_response),
868 scsi_id->query_logins_response,
869 scsi_id->query_logins_response_dma);
870 SBP2_DMA_FREE("query logins response DMA");
871 }
872
873 if (scsi_id->query_logins_orb) {
874 pci_free_consistent(hi->host->pdev,
875 sizeof(struct sbp2_query_logins_orb),
876 scsi_id->query_logins_orb,
877 scsi_id->query_logins_orb_dma);
878 SBP2_DMA_FREE("query logins ORB DMA");
879 }
880
881 if (scsi_id->logout_orb) {
882 pci_free_consistent(hi->host->pdev,
883 sizeof(struct sbp2_logout_orb),
884 scsi_id->logout_orb,
885 scsi_id->logout_orb_dma);
886 SBP2_DMA_FREE("logout ORB DMA");
887 }
888
889 if (scsi_id->reconnect_orb) {
890 pci_free_consistent(hi->host->pdev,
891 sizeof(struct sbp2_reconnect_orb),
892 scsi_id->reconnect_orb,
893 scsi_id->reconnect_orb_dma);
894 SBP2_DMA_FREE("reconnect ORB DMA");
895 }
896
897 if (scsi_id->login_response) {
898 pci_free_consistent(hi->host->pdev,
899 sizeof(struct sbp2_login_response),
900 scsi_id->login_response,
901 scsi_id->login_response_dma);
902 SBP2_DMA_FREE("login FIFO DMA");
903 }
904
905 list_del(&scsi_id->scsi_list);
906
907 kfree(scsi_id);
908
909 SBP2_ERR ("Could not allocate memory for scsi_id");
910
911 return -ENOMEM;
912 }
913 SBP2_DMA_ALLOC("consistent DMA region for login ORB"); 861 SBP2_DMA_ALLOC("consistent DMA region for login ORB");
914 862
915 SBP2_DEBUG("New SBP-2 device inserted, SCSI ID = %x", scsi_id->ud->id); 863 SBP2_DEBUG("New SBP-2 device inserted, SCSI ID = %x", scsi_id->ud->id);
@@ -935,7 +883,7 @@ alloc_fail:
935 sbp2_remove_device(scsi_id); 883 sbp2_remove_device(scsi_id);
936 return -EINTR; 884 return -EINTR;
937 } 885 }
938 886
939 /* 887 /*
940 * Login to the sbp-2 device 888 * Login to the sbp-2 device
941 */ 889 */
@@ -964,10 +912,17 @@ alloc_fail:
964 error = scsi_add_device(scsi_id->scsi_host, 0, scsi_id->ud->id, 0); 912 error = scsi_add_device(scsi_id->scsi_host, 0, scsi_id->ud->id, 0);
965 if (error) { 913 if (error) {
966 SBP2_ERR("scsi_add_device failed"); 914 SBP2_ERR("scsi_add_device failed");
915 sbp2_logout_device(scsi_id);
916 sbp2_remove_device(scsi_id);
967 return error; 917 return error;
968 } 918 }
969 919
970 return 0; 920 return 0;
921
922alloc_fail:
923 SBP2_ERR("Could not allocate memory for scsi_id");
924 sbp2_remove_device(scsi_id);
925 return -ENOMEM;
971} 926}
972 927
973/* 928/*
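
The sbp2_start_device() hunks above delete roughly fifty lines of open-coded pci_free_consistent() unwinding and route every allocation failure through one alloc_fail label that calls sbp2_remove_device(), which already copes with a partially initialized scsi_id. The control-flow shape, reduced to a sketch (the resources and names are placeholders):

#include <linux/slab.h>

struct dev_state {
	void *res_a;
	void *res_b;
};

/* Teardown that tolerates partial initialization, as
 * sbp2_remove_device() must: freeing NULL is a no-op. */
static void dev_destroy(struct dev_state *s)
{
	kfree(s->res_b);
	kfree(s->res_a);
	kfree(s);
}

static struct dev_state *dev_create(void)
{
	struct dev_state *s = kzalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		return NULL;
	s->res_a = kmalloc(64, GFP_KERNEL);
	if (!s->res_a)
		goto alloc_fail;
	s->res_b = kmalloc(64, GFP_KERNEL);
	if (!s->res_b)
		goto alloc_fail;
	return s;

alloc_fail:
	/* One exit path handles every failure point. */
	dev_destroy(s);
	return NULL;
}
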
@@ -1054,51 +1009,44 @@ static void sbp2_remove_device(struct scsi_id_instance_data *scsi_id)
1054 * This function deals with physical dma write requests (for adapters that do not support 1009 * This function deals with physical dma write requests (for adapters that do not support
1055 * physical dma in hardware). Mostly just here for debugging... 1010 * physical dma in hardware). Mostly just here for debugging...
1056 */ 1011 */
1057static int sbp2_handle_physdma_write(struct hpsb_host *host, int nodeid, int destid, quadlet_t *data, 1012static int sbp2_handle_physdma_write(struct hpsb_host *host, int nodeid,
1058 u64 addr, size_t length, u16 flags) 1013 int destid, quadlet_t *data, u64 addr,
1014 size_t length, u16 flags)
1059{ 1015{
1060 1016
1061 /* 1017 /*
1062 * Manually put the data in the right place. 1018 * Manually put the data in the right place.
1063 */ 1019 */
1064 memcpy(bus_to_virt((u32)addr), data, length); 1020 memcpy(bus_to_virt((u32) addr), data, length);
1065 sbp2util_packet_dump(data, length, "sbp2 phys dma write by device", (u32)addr); 1021 sbp2util_packet_dump(data, length, "sbp2 phys dma write by device",
1066 return(RCODE_COMPLETE); 1022 (u32) addr);
1023 return RCODE_COMPLETE;
1067} 1024}
1068 1025
1069/* 1026/*
1070 * This function deals with physical dma read requests (for adapters that do not support 1027 * This function deals with physical dma read requests (for adapters that do not support
1071 * physical dma in hardware). Mostly just here for debugging... 1028 * physical dma in hardware). Mostly just here for debugging...
1072 */ 1029 */
1073static int sbp2_handle_physdma_read(struct hpsb_host *host, int nodeid, quadlet_t *data, 1030static int sbp2_handle_physdma_read(struct hpsb_host *host, int nodeid,
1074 u64 addr, size_t length, u16 flags) 1031 quadlet_t *data, u64 addr, size_t length,
1032 u16 flags)
1075{ 1033{
1076 1034
1077 /* 1035 /*
1078 * Grab data from memory and send a read response. 1036 * Grab data from memory and send a read response.
1079 */ 1037 */
1080 memcpy(data, bus_to_virt((u32)addr), length); 1038 memcpy(data, bus_to_virt((u32) addr), length);
1081 sbp2util_packet_dump(data, length, "sbp2 phys dma read by device", (u32)addr); 1039 sbp2util_packet_dump(data, length, "sbp2 phys dma read by device",
1082 return(RCODE_COMPLETE); 1040 (u32) addr);
1041 return RCODE_COMPLETE;
1083} 1042}
1084#endif 1043#endif
1085 1044
1086
1087/************************************** 1045/**************************************
1088 * SBP-2 protocol related section 1046 * SBP-2 protocol related section
1089 **************************************/ 1047 **************************************/
1090 1048
1091/* 1049/*
1092 * This function determines if we should convert scsi commands for a particular sbp2 device type
1093 */
1094static __inline__ int sbp2_command_conversion_device_type(u8 device_type)
1095{
1096 return (((device_type == TYPE_DISK) ||
1097 (device_type == TYPE_RBC) ||
1098 (device_type == TYPE_ROM)) ? 1:0);
1099}
1100
1101/*
1102 * This function queries the device for the maximum concurrent logins it 1050 * This function queries the device for the maximum concurrent logins it
1103 * supports. 1051 * supports.
1104 */ 1052 */
@@ -1120,11 +1068,7 @@ static int sbp2_query_logins(struct scsi_id_instance_data *scsi_id)
1120 1068
1121 scsi_id->query_logins_orb->lun_misc = ORB_SET_FUNCTION(SBP2_QUERY_LOGINS_REQUEST); 1069 scsi_id->query_logins_orb->lun_misc = ORB_SET_FUNCTION(SBP2_QUERY_LOGINS_REQUEST);
1122 scsi_id->query_logins_orb->lun_misc |= ORB_SET_NOTIFY(1); 1070 scsi_id->query_logins_orb->lun_misc |= ORB_SET_NOTIFY(1);
1123 if (scsi_id->sbp2_device_type_and_lun != SBP2_DEVICE_TYPE_LUN_UNINITIALIZED) { 1071 scsi_id->query_logins_orb->lun_misc |= ORB_SET_LUN(scsi_id->sbp2_lun);
1124 scsi_id->query_logins_orb->lun_misc |= ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun);
1125 SBP2_DEBUG("sbp2_query_logins: set lun to %d",
1126 ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun));
1127 }
1128 SBP2_DEBUG("sbp2_query_logins: lun_misc initialized"); 1072 SBP2_DEBUG("sbp2_query_logins: lun_misc initialized");
1129 1073
1130 scsi_id->query_logins_orb->reserved_resp_length = 1074 scsi_id->query_logins_orb->reserved_resp_length =
@@ -1161,12 +1105,12 @@ static int sbp2_query_logins(struct scsi_id_instance_data *scsi_id)
 
 	if (sbp2util_down_timeout(&scsi_id->sbp2_login_complete, 2*HZ)) {
 		SBP2_INFO("Error querying logins to SBP-2 device - timed out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	if (scsi_id->status_block.ORB_offset_lo != scsi_id->query_logins_orb_dma) {
 		SBP2_INFO("Error querying logins to SBP-2 device - timed out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	if (STATUS_GET_RESP(scsi_id->status_block.ORB_offset_hi_misc) ||
@@ -1174,7 +1118,7 @@ static int sbp2_query_logins(struct scsi_id_instance_data *scsi_id)
 	    STATUS_GET_SBP_STATUS(scsi_id->status_block.ORB_offset_hi_misc)) {
 
 		SBP2_INFO("Error querying logins to SBP-2 device - timed out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	sbp2util_cpu_to_be32_buffer(scsi_id->query_logins_response, sizeof(struct sbp2_query_logins_response));
@@ -1191,7 +1135,7 @@ static int sbp2_query_logins(struct scsi_id_instance_data *scsi_id)
 	SBP2_DEBUG("Number of active logins: %d", active_logins);
 
 	if (active_logins >= max_logins) {
-		return(-EIO);
+		return -EIO;
 	}
 
 	return 0;
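For context, max_logins and active_logins above are typically unpacked from the first quadlet of the byte-swapped query-logins response; a sketch with accessor names assumed from sbp2.h:

	max_logins = RESPONSE_GET_MAX_LOGINS(
	    scsi_id->query_logins_response->length_max_logins);
	SBP2_DEBUG("Maximum concurrent logins supported: %d", max_logins);

	active_logins = RESPONSE_GET_ACTIVE_LOGINS(
	    scsi_id->query_logins_response->length_max_logins);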
@@ -1210,13 +1154,13 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 
 	if (!scsi_id->login_orb) {
 		SBP2_DEBUG("sbp2_login_device: login_orb not alloc'd!");
-		return(-EIO);
+		return -EIO;
 	}
 
 	if (!exclusive_login) {
 		if (sbp2_query_logins(scsi_id)) {
 			SBP2_INFO("Device does not support any more concurrent logins");
-			return(-EIO);
+			return -EIO;
 		}
 	}
 
@@ -1233,12 +1177,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 	scsi_id->login_orb->lun_misc |= ORB_SET_RECONNECT(0);	/* One second reconnect time */
 	scsi_id->login_orb->lun_misc |= ORB_SET_EXCLUSIVE(exclusive_login);	/* Exclusive access to device */
 	scsi_id->login_orb->lun_misc |= ORB_SET_NOTIFY(1);	/* Notify us of login complete */
-	/* Set the lun if we were able to pull it from the device's unit directory */
-	if (scsi_id->sbp2_device_type_and_lun != SBP2_DEVICE_TYPE_LUN_UNINITIALIZED) {
-		scsi_id->login_orb->lun_misc |= ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun);
-		SBP2_DEBUG("sbp2_query_logins: set lun to %d",
-			   ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun));
-	}
+	scsi_id->login_orb->lun_misc |= ORB_SET_LUN(scsi_id->sbp2_lun);
 	SBP2_DEBUG("sbp2_login_device: lun_misc initialized");
 
 	scsi_id->login_orb->passwd_resp_lengths =
@@ -1288,7 +1227,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 	 */
 	if (sbp2util_down_timeout(&scsi_id->sbp2_login_complete, 20*HZ)) {
 		SBP2_ERR("Error logging into SBP-2 device - login timed-out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1296,7 +1235,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 	 */
 	if (scsi_id->status_block.ORB_offset_lo != scsi_id->login_orb_dma) {
 		SBP2_ERR("Error logging into SBP-2 device - login timed-out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1307,7 +1246,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 	    STATUS_GET_SBP_STATUS(scsi_id->status_block.ORB_offset_hi_misc)) {
 
 		SBP2_ERR("Error logging into SBP-2 device - login failed");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1331,7 +1270,7 @@ static int sbp2_login_device(struct scsi_id_instance_data *scsi_id)
 
 	SBP2_INFO("Logged into SBP-2 device");
 
-	return(0);
+	return 0;
 
 }
 
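Spelled out, the login ORB's lun_misc word is assembled from disjoint bit-fields; a sketch of the end result, with ORB_SET_* semantics assumed from sbp2.h:

	u32 lun_misc = ORB_SET_FUNCTION(SBP2_LOGIN_REQUEST) |
		       ORB_SET_RECONNECT(0) |	/* ~1 second reconnect hold */
		       ORB_SET_EXCLUSIVE(exclusive_login) |
		       ORB_SET_NOTIFY(1) |	/* status write on completion */
		       ORB_SET_LUN(scsi_id->sbp2_lun);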
@@ -1385,8 +1324,7 @@ static int sbp2_logout_device(struct scsi_id_instance_data *scsi_id)
 	atomic_set(&scsi_id->sbp2_login_complete, 0);
 
 	error = hpsb_node_write(scsi_id->ne,
-				scsi_id->sbp2_management_agent_addr,
-				data, 8);
+				scsi_id->sbp2_management_agent_addr, data, 8);
 	if (error)
 		return error;
 
@@ -1396,7 +1334,7 @@ static int sbp2_logout_device(struct scsi_id_instance_data *scsi_id)
 
 	SBP2_INFO("Logged out of SBP-2 device");
 
-	return(0);
+	return 0;
 
 }
 
@@ -1456,8 +1394,7 @@ static int sbp2_reconnect_device(struct scsi_id_instance_data *scsi_id)
 	atomic_set(&scsi_id->sbp2_login_complete, 0);
 
 	error = hpsb_node_write(scsi_id->ne,
-				scsi_id->sbp2_management_agent_addr,
-				data, 8);
+				scsi_id->sbp2_management_agent_addr, data, 8);
 	if (error)
 		return error;
 
@@ -1466,7 +1403,7 @@ static int sbp2_reconnect_device(struct scsi_id_instance_data *scsi_id)
 	 */
 	if (sbp2util_down_timeout(&scsi_id->sbp2_login_complete, HZ)) {
 		SBP2_ERR("Error reconnecting to SBP-2 device - reconnect timed-out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1474,7 +1411,7 @@ static int sbp2_reconnect_device(struct scsi_id_instance_data *scsi_id)
 	 */
 	if (scsi_id->status_block.ORB_offset_lo != scsi_id->reconnect_orb_dma) {
 		SBP2_ERR("Error reconnecting to SBP-2 device - reconnect timed-out");
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -1485,12 +1422,12 @@ static int sbp2_reconnect_device(struct scsi_id_instance_data *scsi_id)
 	    STATUS_GET_SBP_STATUS(scsi_id->status_block.ORB_offset_hi_misc)) {
 
 		SBP2_ERR("Error reconnecting to SBP-2 device - reconnect failed");
-		return(-EIO);
+		return -EIO;
 	}
 
 	HPSB_DEBUG("Reconnected to SBP-2 device");
 
-	return(0);
+	return 0;
 
 }
 
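All three management operations above (login, logout, reconnect) share one skeleton: point the target's management agent at an ORB, then wait for the matching status write. A reduced sketch of that handshake, using the driver's own names; the real code additionally checks the status block's resp/sbp_status fields:

static int example_mgt_request(struct scsi_id_instance_data *scsi_id,
			       quadlet_t *data, dma_addr_t orb_dma)
{
	atomic_set(&scsi_id->sbp2_login_complete, 0);

	/* hand the ORB address to the target's management agent */
	if (hpsb_node_write(scsi_id->ne,
			    scsi_id->sbp2_management_agent_addr, data, 8))
		return -EIO;

	/* wait for the status write, with a timeout */
	if (sbp2util_down_timeout(&scsi_id->sbp2_login_complete, HZ))
		return -EIO;

	/* the status block must reference the ORB we queued */
	if (scsi_id->status_block.ORB_offset_lo != orb_dma)
		return -EIO;

	return 0;
}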
@@ -1513,10 +1450,9 @@ static int sbp2_set_busy_timeout(struct scsi_id_instance_data *scsi_id)
 		SBP2_ERR("sbp2_set_busy_timeout error");
 	}
 
-	return(0);
+	return 0;
 }
 
-
 /*
  * This function is called to parse sbp2 device's config rom unit
  * directory. Used to determine things like sbp2 management agent offset,
@@ -1529,7 +1465,7 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 	struct csr1212_dentry *dentry;
 	u64 management_agent_addr;
 	u32 command_set_spec_id, command_set, unit_characteristics,
-		firmware_revision, workarounds;
+	    firmware_revision, workarounds;
 	int i;
 
 	SBP2_DEBUG("sbp2_parse_unit_directory");
@@ -1547,13 +1483,14 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 		if (kv->key.type == CSR1212_KV_TYPE_CSR_OFFSET) {
 			/* Save off the management agent address */
 			management_agent_addr =
 			    CSR1212_REGISTER_SPACE_BASE +
 			    (kv->value.csr_offset << 2);
 
 			SBP2_DEBUG("sbp2_management_agent_addr = %x",
-				   (unsigned int) management_agent_addr);
+				   (unsigned int)management_agent_addr);
 		} else if (kv->key.type == CSR1212_KV_TYPE_IMMEDIATE) {
-			scsi_id->sbp2_device_type_and_lun = kv->value.immediate;
+			scsi_id->sbp2_lun =
+			    ORB_SET_LUN(kv->value.immediate);
 		}
 		break;
 
@@ -1561,14 +1498,14 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 		/* Command spec organization */
 		command_set_spec_id = kv->value.immediate;
 		SBP2_DEBUG("sbp2_command_set_spec_id = %x",
-			   (unsigned int) command_set_spec_id);
+			   (unsigned int)command_set_spec_id);
 		break;
 
 	case SBP2_COMMAND_SET_KEY:
 		/* Command set used by sbp2 device */
 		command_set = kv->value.immediate;
 		SBP2_DEBUG("sbp2_command_set = %x",
-			   (unsigned int) command_set);
+			   (unsigned int)command_set);
 		break;
 
 	case SBP2_UNIT_CHARACTERISTICS_KEY:
@@ -1578,7 +1515,7 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 		 */
 		unit_characteristics = kv->value.immediate;
 		SBP2_DEBUG("sbp2_unit_characteristics = %x",
-			   (unsigned int) unit_characteristics);
+			   (unsigned int)unit_characteristics);
 		break;
 
 	case SBP2_FIRMWARE_REVISION_KEY:
@@ -1586,9 +1523,10 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 		firmware_revision = kv->value.immediate;
 		if (force_inquiry_hack)
 			SBP2_INFO("sbp2_firmware_revision = %x",
-				  (unsigned int) firmware_revision);
-		else SBP2_DEBUG("sbp2_firmware_revision = %x",
-				(unsigned int) firmware_revision);
+				  (unsigned int)firmware_revision);
+		else
+			SBP2_DEBUG("sbp2_firmware_revision = %x",
+				   (unsigned int)firmware_revision);
 		break;
 
 	default:
@@ -1646,7 +1584,7 @@ static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
 	scsi_id->sbp2_firmware_revision = firmware_revision;
 	scsi_id->workarounds = workarounds;
 	if (ud->flags & UNIT_DIRECTORY_HAS_LUN)
-		scsi_id->sbp2_device_type_and_lun = ud->lun;
+		scsi_id->sbp2_lun = ORB_SET_LUN(ud->lun);
 	}
 }
 
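The CSR "logical unit number" immediate packs a device type and a LUN into one quadlet; storing it through ORB_SET_LUN() keeps only the LUN bits, which is all the login and query ORBs need. A worked illustration, with the field layout taken from the SBP2_DEVICE_TYPE/SBP2_DEVICE_LUN macros this patch removes from sbp2.h, and assuming ORB_SET_LUN() masks the low 16 bits:

	u32 immediate = 0x00052001;		/* type 0x05, LUN 0x2001 (example) */
	u32 type = (immediate >> 16) & 0x1f;	/* 0x05 - no longer stored */
	u32 lun  = immediate & 0xffff;		/* 0x2001 == ORB_SET_LUN(immediate) */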
@@ -1666,8 +1604,9 @@ static int sbp2_max_speed_and_size(struct scsi_id_instance_data *scsi_id)
 	SBP2_DEBUG("sbp2_max_speed_and_size");
 
 	/* Initial setting comes from the host's speed map */
-	scsi_id->speed_code = hi->host->speed_map[NODEID_TO_NODE(hi->host->node_id) * 64
-						  + NODEID_TO_NODE(scsi_id->ne->nodeid)];
+	scsi_id->speed_code =
+	    hi->host->speed_map[NODEID_TO_NODE(hi->host->node_id) * 64 +
+				NODEID_TO_NODE(scsi_id->ne->nodeid)];
 
 	/* Bump down our speed if the user requested it */
 	if (scsi_id->speed_code > max_speed) {
@@ -1678,15 +1617,16 @@ static int sbp2_max_speed_and_size(struct scsi_id_instance_data *scsi_id)
 
 	/* Payload size is the lesser of what our speed supports and what
 	 * our host supports. */
-	scsi_id->max_payload_size = min(sbp2_speedto_max_payload[scsi_id->speed_code],
-					(u8)(hi->host->csr.max_rec - 1));
+	scsi_id->max_payload_size =
+	    min(sbp2_speedto_max_payload[scsi_id->speed_code],
+		(u8) (hi->host->csr.max_rec - 1));
 
 	HPSB_DEBUG("Node " NODE_BUS_FMT ": Max speed [%s] - Max payload [%u]",
 		   NODE_BUS_ARGS(hi->host, scsi_id->ne->nodeid),
 		   hpsb_speedto_str[scsi_id->speed_code],
-		   1 << ((u32)scsi_id->max_payload_size + 2));
+		   1 << ((u32) scsi_id->max_payload_size + 2));
 
-	return(0);
+	return 0;
 }
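A worked example of the payload clamp, with table values assumed for illustration: at S400 the speed table might yield payload code 0x9, i.e. 1 << (0x9 + 2) = 2048 bytes, while a host with csr.max_rec = 10 can accept 1 << (10 + 1) = 2048 bytes, so max_rec - 1 expresses the same limit in payload-code units:

	u8 speed_payload = 0x9;		/* assumed S400 table entry */
	u8 host_payload = 10 - 1;	/* csr.max_rec = 10 */
	u8 code = min(speed_payload, host_payload);	/* 0x9 */
	/* resulting maximum packet: 1 << (code + 2) == 2048 bytes */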
 
 /*
@@ -1721,30 +1661,187 @@ static int sbp2_agent_reset(struct scsi_id_instance_data *scsi_id, int wait)
 	 */
 	scsi_id->last_orb = NULL;
 
-	return(0);
+	return 0;
 }
1666
1667static void sbp2_prep_command_orb_sg(struct sbp2_command_orb *orb,
1668 struct sbp2scsi_host_info *hi,
1669 struct sbp2_command_info *command,
1670 unsigned int scsi_use_sg,
1671 struct scatterlist *sgpnt,
1672 u32 orb_direction,
1673 enum dma_data_direction dma_dir)
1674{
1675 command->dma_dir = dma_dir;
1676 orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
1677 orb->misc |= ORB_SET_DIRECTION(orb_direction);
1678
1679 /* Special case if only one element (and less than 64KB in size) */
1680 if ((scsi_use_sg == 1) &&
1681 (sgpnt[0].length <= SBP2_MAX_SG_ELEMENT_LENGTH)) {
1682
1683 SBP2_DEBUG("Only one s/g element");
1684 command->dma_size = sgpnt[0].length;
1685 command->dma_type = CMD_DMA_PAGE;
1686 command->cmd_dma = pci_map_page(hi->host->pdev,
1687 sgpnt[0].page,
1688 sgpnt[0].offset,
1689 command->dma_size,
1690 command->dma_dir);
1691 SBP2_DMA_ALLOC("single page scatter element");
1692
1693 orb->data_descriptor_lo = command->cmd_dma;
1694 orb->misc |= ORB_SET_DATA_SIZE(command->dma_size);
1695
1696 } else {
1697 struct sbp2_unrestricted_page_table *sg_element =
1698 &command->scatter_gather_element[0];
1699 u32 sg_count, sg_len;
1700 dma_addr_t sg_addr;
1701 int i, count = pci_map_sg(hi->host->pdev, sgpnt, scsi_use_sg,
1702 dma_dir);
1703
1704 SBP2_DMA_ALLOC("scatter list");
1705
1706 command->dma_size = scsi_use_sg;
1707 command->sge_buffer = sgpnt;
1708
1709 /* use page tables (s/g) */
1710 orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
1711 orb->data_descriptor_lo = command->sge_dma;
1712
1713 /*
1714 * Loop through and fill out our sbp-2 page tables
1715 * (and split up anything too large)
1716 */
1717 for (i = 0, sg_count = 0 ; i < count; i++, sgpnt++) {
1718 sg_len = sg_dma_len(sgpnt);
1719 sg_addr = sg_dma_address(sgpnt);
1720 while (sg_len) {
1721 sg_element[sg_count].segment_base_lo = sg_addr;
1722 if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
1723 sg_element[sg_count].length_segment_base_hi =
1724 PAGE_TABLE_SET_SEGMENT_LENGTH(SBP2_MAX_SG_ELEMENT_LENGTH);
1725 sg_addr += SBP2_MAX_SG_ELEMENT_LENGTH;
1726 sg_len -= SBP2_MAX_SG_ELEMENT_LENGTH;
1727 } else {
1728 sg_element[sg_count].length_segment_base_hi =
1729 PAGE_TABLE_SET_SEGMENT_LENGTH(sg_len);
1730 sg_len = 0;
1731 }
1732 sg_count++;
1733 }
1734 }
1735
1736 /* Number of page table (s/g) elements */
1737 orb->misc |= ORB_SET_DATA_SIZE(sg_count);
1738
1739 sbp2util_packet_dump(sg_element,
1740 (sizeof(struct sbp2_unrestricted_page_table)) * sg_count,
1741 "sbp2 s/g list", command->sge_dma);
1742
1743 /* Byte swap page tables if necessary */
1744 sbp2util_cpu_to_be32_buffer(sg_element,
1745 (sizeof(struct sbp2_unrestricted_page_table)) *
1746 sg_count);
1747 }
1748}
1749
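The splitting rule inside the loop above, isolated into a self-contained helper for clarity (the names here are illustrative, not part of the driver): every page-table entry may describe at most SBP2_MAX_SG_ELEMENT_LENGTH (0xf000) bytes, so larger DMA segments are emitted as several consecutive entries.

static unsigned int example_split_segment(u32 sg_addr, u32 sg_len,
					  u32 *base, u32 *len,
					  unsigned int max_entries)
{
	unsigned int n = 0;

	while (sg_len && n < max_entries) {
		u32 chunk = sg_len > 0xf000 ? 0xf000 : sg_len;

		base[n] = sg_addr;	/* entry covers [sg_addr, sg_addr+chunk) */
		len[n] = chunk;
		sg_addr += chunk;
		sg_len -= chunk;
		n++;
	}
	return n;	/* number of page-table entries produced */
}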
1750static void sbp2_prep_command_orb_no_sg(struct sbp2_command_orb *orb,
1751 struct sbp2scsi_host_info *hi,
1752 struct sbp2_command_info *command,
1753 struct scatterlist *sgpnt,
1754 u32 orb_direction,
1755 unsigned int scsi_request_bufflen,
1756 void *scsi_request_buffer,
1757 enum dma_data_direction dma_dir)
1758{
1759 command->dma_dir = dma_dir;
1760 command->dma_size = scsi_request_bufflen;
1761 command->dma_type = CMD_DMA_SINGLE;
1762 command->cmd_dma = pci_map_single(hi->host->pdev, scsi_request_buffer,
1763 command->dma_size, command->dma_dir);
1764 orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
1765 orb->misc |= ORB_SET_DIRECTION(orb_direction);
1766
1767 SBP2_DMA_ALLOC("single bulk");
1768
1769 /*
1770 * Handle case where we get a command w/o s/g enabled (but
1771 * check for transfers larger than 64K)
1772 */
1773 if (scsi_request_bufflen <= SBP2_MAX_SG_ELEMENT_LENGTH) {
1774
1775 orb->data_descriptor_lo = command->cmd_dma;
1776 orb->misc |= ORB_SET_DATA_SIZE(scsi_request_bufflen);
1777
1778 } else {
1779 struct sbp2_unrestricted_page_table *sg_element =
1780 &command->scatter_gather_element[0];
1781 u32 sg_count, sg_len;
1782 dma_addr_t sg_addr;
1783
1784 /*
1785 * Need to turn this into page tables, since the
1786 * buffer is too large.
1787 */
1788 orb->data_descriptor_lo = command->sge_dma;
1789
1790 /* Use page tables (s/g) */
1791 orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
1792
1793 /*
1794 * fill out our sbp-2 page tables (and split up
1795 * the large buffer)
1796 */
1797 sg_count = 0;
1798 sg_len = scsi_request_bufflen;
1799 sg_addr = command->cmd_dma;
1800 while (sg_len) {
1801 sg_element[sg_count].segment_base_lo = sg_addr;
1802 if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
1803 sg_element[sg_count].length_segment_base_hi =
1804 PAGE_TABLE_SET_SEGMENT_LENGTH(SBP2_MAX_SG_ELEMENT_LENGTH);
1805 sg_addr += SBP2_MAX_SG_ELEMENT_LENGTH;
1806 sg_len -= SBP2_MAX_SG_ELEMENT_LENGTH;
1807 } else {
1808 sg_element[sg_count].length_segment_base_hi =
1809 PAGE_TABLE_SET_SEGMENT_LENGTH(sg_len);
1810 sg_len = 0;
1811 }
1812 sg_count++;
1813 }
1814
1815 /* Number of page table (s/g) elements */
1816 orb->misc |= ORB_SET_DATA_SIZE(sg_count);
1817
1818 sbp2util_packet_dump(sg_element,
1819 (sizeof(struct sbp2_unrestricted_page_table)) * sg_count,
1820 "sbp2 s/g list", command->sge_dma);
1821
1822 /* Byte swap page tables if necessary */
1823 sbp2util_cpu_to_be32_buffer(sg_element,
1824 (sizeof(struct sbp2_unrestricted_page_table)) *
1825 sg_count);
1826 }
}
 
 /*
  * This function is called to create the actual command orb and s/g list
  * out of the scsi command itself.
  */
-static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
-				   struct sbp2_command_info *command,
-				   unchar *scsi_cmd,
-				   unsigned int scsi_use_sg,
-				   unsigned int scsi_request_bufflen,
-				   void *scsi_request_buffer,
-				   enum dma_data_direction dma_dir)
-
+static void sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
+				    struct sbp2_command_info *command,
+				    unchar *scsi_cmd,
+				    unsigned int scsi_use_sg,
+				    unsigned int scsi_request_bufflen,
+				    void *scsi_request_buffer,
+				    enum dma_data_direction dma_dir)
 {
 	struct sbp2scsi_host_info *hi = scsi_id->hi;
-	struct scatterlist *sgpnt = (struct scatterlist *) scsi_request_buffer;
+	struct scatterlist *sgpnt = (struct scatterlist *)scsi_request_buffer;
 	struct sbp2_command_orb *command_orb = &command->command_orb;
-	struct sbp2_unrestricted_page_table *scatter_gather_element =
-		&command->scatter_gather_element[0];
-	u32 sg_count, sg_len, orb_direction;
-	dma_addr_t sg_addr;
-	int i;
+	u32 orb_direction;
 
 	/*
 	 * Set-up our command ORB..
@@ -1758,222 +1855,42 @@ static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
 	command_orb->next_ORB_lo = 0x0;
 	command_orb->misc = ORB_SET_MAX_PAYLOAD(scsi_id->max_payload_size);
 	command_orb->misc |= ORB_SET_SPEED(scsi_id->speed_code);
 	command_orb->misc |= ORB_SET_NOTIFY(1);	/* Notify us when complete */
 
-	/*
-	 * Get the direction of the transfer. If the direction is unknown, then use our
-	 * goofy table as a back-up.
-	 */
-	switch (dma_dir) {
-	case DMA_NONE:
-		orb_direction = ORB_DIRECTION_NO_DATA_TRANSFER;
-		break;
-	case DMA_TO_DEVICE:
-		orb_direction = ORB_DIRECTION_WRITE_TO_MEDIA;
-		break;
-	case DMA_FROM_DEVICE:
-		orb_direction = ORB_DIRECTION_READ_FROM_MEDIA;
-		break;
-	case DMA_BIDIRECTIONAL:
-	default:
-		SBP2_ERR("SCSI data transfer direction not specified. "
-			 "Update the SBP2 direction table in sbp2.h if "
-			 "necessary for your application");
-		__scsi_print_command(scsi_cmd);
-		orb_direction = sbp2scsi_direction_table[*scsi_cmd];
-		break;
-	}
+	if (dma_dir == DMA_NONE)
+		orb_direction = ORB_DIRECTION_NO_DATA_TRANSFER;
+	else if (dma_dir == DMA_TO_DEVICE && scsi_request_bufflen)
+		orb_direction = ORB_DIRECTION_WRITE_TO_MEDIA;
+	else if (dma_dir == DMA_FROM_DEVICE && scsi_request_bufflen)
+		orb_direction = ORB_DIRECTION_READ_FROM_MEDIA;
+	else {
+		SBP2_WARN("Falling back to DMA_NONE");
+		orb_direction = ORB_DIRECTION_NO_DATA_TRANSFER;
+	}
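The new mapping, written as a stand-alone helper to make the rule explicit (illustrative; the patch open-codes it). Note that a zero-length buffer degrades to "no data transfer" even when a direction was supplied:

static u32 example_orb_direction(enum dma_data_direction dir, size_t len)
{
	if (dir == DMA_TO_DEVICE && len)
		return ORB_DIRECTION_WRITE_TO_MEDIA;
	if (dir == DMA_FROM_DEVICE && len)
		return ORB_DIRECTION_READ_FROM_MEDIA;
	return ORB_DIRECTION_NO_DATA_TRANSFER;	/* DMA_NONE and the fallback */
}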
 
-	/*
-	 * Set-up our pagetable stuff... unfortunately, this has become
-	 * messier than I'd like. Need to clean this up a bit. ;-)
-	 */
+	/* Set-up our pagetable stuff */
 	if (orb_direction == ORB_DIRECTION_NO_DATA_TRANSFER) {
-
 		SBP2_DEBUG("No data transfer");
-
-		/*
-		 * Handle no data transfer
-		 */
 		command_orb->data_descriptor_hi = 0x0;
 		command_orb->data_descriptor_lo = 0x0;
 		command_orb->misc |= ORB_SET_DIRECTION(1);
-
 	} else if (scsi_use_sg) {
-
 		SBP2_DEBUG("Use scatter/gather");
+		sbp2_prep_command_orb_sg(command_orb, hi, command, scsi_use_sg,
+					 sgpnt, orb_direction, dma_dir);
		/*
1807 * Special case if only one element (and less than 64KB in size)
1808 */
1809 if ((scsi_use_sg == 1) && (sgpnt[0].length <= SBP2_MAX_SG_ELEMENT_LENGTH)) {
1810
1811 SBP2_DEBUG("Only one s/g element");
1812 command->dma_dir = dma_dir;
1813 command->dma_size = sgpnt[0].length;
1814 command->dma_type = CMD_DMA_PAGE;
1815 command->cmd_dma = pci_map_page(hi->host->pdev,
1816 sgpnt[0].page,
1817 sgpnt[0].offset,
1818 command->dma_size,
1819 command->dma_dir);
1820 SBP2_DMA_ALLOC("single page scatter element");
1821
1822 command_orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
1823 command_orb->data_descriptor_lo = command->cmd_dma;
1824 command_orb->misc |= ORB_SET_DATA_SIZE(command->dma_size);
1825 command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
1826
1827 } else {
1828 int count = pci_map_sg(hi->host->pdev, sgpnt, scsi_use_sg, dma_dir);
1829 SBP2_DMA_ALLOC("scatter list");
1830
1831 command->dma_size = scsi_use_sg;
1832 command->dma_dir = dma_dir;
1833 command->sge_buffer = sgpnt;
1834
1835 /* use page tables (s/g) */
1836 command_orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
1837 command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
1838 command_orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
1839 command_orb->data_descriptor_lo = command->sge_dma;
1840
1841 /*
1842 * Loop through and fill out our sbp-2 page tables
1843 * (and split up anything too large)
1844 */
1845 for (i = 0, sg_count = 0 ; i < count; i++, sgpnt++) {
1846 sg_len = sg_dma_len(sgpnt);
1847 sg_addr = sg_dma_address(sgpnt);
1848 while (sg_len) {
1849 scatter_gather_element[sg_count].segment_base_lo = sg_addr;
1850 if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
1851 scatter_gather_element[sg_count].length_segment_base_hi =
1852 PAGE_TABLE_SET_SEGMENT_LENGTH(SBP2_MAX_SG_ELEMENT_LENGTH);
1853 sg_addr += SBP2_MAX_SG_ELEMENT_LENGTH;
1854 sg_len -= SBP2_MAX_SG_ELEMENT_LENGTH;
1855 } else {
1856 scatter_gather_element[sg_count].length_segment_base_hi =
1857 PAGE_TABLE_SET_SEGMENT_LENGTH(sg_len);
1858 sg_len = 0;
1859 }
1860 sg_count++;
1861 }
1862 }
1863
1864 /* Number of page table (s/g) elements */
1865 command_orb->misc |= ORB_SET_DATA_SIZE(sg_count);
1866
1867 sbp2util_packet_dump(scatter_gather_element,
1868 (sizeof(struct sbp2_unrestricted_page_table)) * sg_count,
1869 "sbp2 s/g list", command->sge_dma);
1870
1871 /*
1872 * Byte swap page tables if necessary
1873 */
1874 sbp2util_cpu_to_be32_buffer(scatter_gather_element,
1875 (sizeof(struct sbp2_unrestricted_page_table)) *
1876 sg_count);
1877
1878 }
1879
 	} else {
-
 		SBP2_DEBUG("No scatter/gather");
-
-		command->dma_dir = dma_dir;
-		command->dma_size = scsi_request_bufflen;
+		sbp2_prep_command_orb_no_sg(command_orb, hi, command, sgpnt,
+					    orb_direction, scsi_request_bufflen,
+					    scsi_request_buffer, dma_dir);
1886 command->dma_type = CMD_DMA_SINGLE;
1887 command->cmd_dma = pci_map_single (hi->host->pdev, scsi_request_buffer,
1888 command->dma_size,
1889 command->dma_dir);
1890 SBP2_DMA_ALLOC("single bulk");
1891
1892 /*
1893 * Handle case where we get a command w/o s/g enabled (but
1894 * check for transfers larger than 64K)
1895 */
1896 if (scsi_request_bufflen <= SBP2_MAX_SG_ELEMENT_LENGTH) {
1897
1898 command_orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
1899 command_orb->data_descriptor_lo = command->cmd_dma;
1900 command_orb->misc |= ORB_SET_DATA_SIZE(scsi_request_bufflen);
1901 command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
1902
1903 /*
1904 * Sanity, in case our direction table is not
1905 * up-to-date
1906 */
1907 if (!scsi_request_bufflen) {
1908 command_orb->data_descriptor_hi = 0x0;
1909 command_orb->data_descriptor_lo = 0x0;
1910 command_orb->misc |= ORB_SET_DIRECTION(1);
1911 }
1912
1913 } else {
1914 /*
1915 * Need to turn this into page tables, since the
1916 * buffer is too large.
1917 */
1918 command_orb->data_descriptor_hi = ORB_SET_NODE_ID(hi->host->node_id);
1919 command_orb->data_descriptor_lo = command->sge_dma;
1920
1921 /* Use page tables (s/g) */
1922 command_orb->misc |= ORB_SET_PAGE_TABLE_PRESENT(0x1);
1923 command_orb->misc |= ORB_SET_DIRECTION(orb_direction);
1924
1925 /*
1926 * fill out our sbp-2 page tables (and split up
1927 * the large buffer)
1928 */
1929 sg_count = 0;
1930 sg_len = scsi_request_bufflen;
1931 sg_addr = command->cmd_dma;
1932 while (sg_len) {
1933 scatter_gather_element[sg_count].segment_base_lo = sg_addr;
1934 if (sg_len > SBP2_MAX_SG_ELEMENT_LENGTH) {
1935 scatter_gather_element[sg_count].length_segment_base_hi =
1936 PAGE_TABLE_SET_SEGMENT_LENGTH(SBP2_MAX_SG_ELEMENT_LENGTH);
1937 sg_addr += SBP2_MAX_SG_ELEMENT_LENGTH;
1938 sg_len -= SBP2_MAX_SG_ELEMENT_LENGTH;
1939 } else {
1940 scatter_gather_element[sg_count].length_segment_base_hi =
1941 PAGE_TABLE_SET_SEGMENT_LENGTH(sg_len);
1942 sg_len = 0;
1943 }
1944 sg_count++;
1945 }
1946
1947 /* Number of page table (s/g) elements */
1948 command_orb->misc |= ORB_SET_DATA_SIZE(sg_count);
1949
1950 sbp2util_packet_dump(scatter_gather_element,
1951 (sizeof(struct sbp2_unrestricted_page_table)) * sg_count,
1952 "sbp2 s/g list", command->sge_dma);
1953
1954 /*
1955 * Byte swap page tables if necessary
1956 */
1957 sbp2util_cpu_to_be32_buffer(scatter_gather_element,
1958 (sizeof(struct sbp2_unrestricted_page_table)) *
1959 sg_count);
1960
1961 }
1962
 	}
 
-	/*
-	 * Byte swap command ORB if necessary
-	 */
+	/* Byte swap command ORB if necessary */
 	sbp2util_cpu_to_be32_buffer(command_orb, sizeof(struct sbp2_command_orb));
 
-	/*
-	 * Put our scsi command in the command ORB
-	 */
+	/* Put our scsi command in the command ORB */
 	memset(command_orb->cdb, 0, 12);
 	memcpy(command_orb->cdb, scsi_cmd, COMMAND_SIZE(*scsi_cmd));
-
-	return(0);
 }
 
 /*
@@ -1989,7 +1906,7 @@ static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,
 
 	outstanding_orb_incr;
 	SBP2_ORB_DEBUG("sending command orb %p, total orbs = %x",
 		       command_orb, global_outstanding_command_orbs);
 
 	pci_dma_sync_single_for_device(hi->host->pdev, command->command_orb_dma,
 				       sizeof(struct sbp2_command_orb),
@@ -2034,10 +1951,11 @@ static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,
 	 * both by the sbp2 device and us.
 	 */
 	scsi_id->last_orb->next_ORB_lo =
 	    cpu_to_be32(command->command_orb_dma);
 	/* Tells hardware that this pointer is valid */
 	scsi_id->last_orb->next_ORB_hi = 0x0;
-	pci_dma_sync_single_for_device(hi->host->pdev, scsi_id->last_orb_dma,
+	pci_dma_sync_single_for_device(hi->host->pdev,
+				       scsi_id->last_orb_dma,
 				       sizeof(struct sbp2_command_orb),
 				       PCI_DMA_BIDIRECTIONAL);
 
@@ -2051,14 +1969,14 @@ static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,
 
 	if (sbp2util_node_write_no_wait(ne, addr, &data, 4) < 0) {
 		SBP2_ERR("sbp2util_node_write_no_wait failed");
-		return(-EIO);
+		return -EIO;
 	}
 
 	scsi_id->last_orb = command_orb;
 	scsi_id->last_orb_dma = command->command_orb_dma;
 
 	}
-	return(0);
+	return 0;
 }
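The append path above in isolation: an ORB is linked into a running chain by patching the previous ORB's next_ORB pointer and then marking it valid; only an idle fetch agent needs the ORB-pointer doorbell. A reduced sketch using the driver's own field names (the ordering is the point; full synchronization is as in the code above):

static void example_append_orb(struct sbp2_command_orb *prev,
			       dma_addr_t next_dma)
{
	/* publish the address first ... */
	prev->next_ORB_lo = cpu_to_be32(next_dma);
	/* ... then clear the high word, which marks the pointer valid */
	prev->next_ORB_hi = 0x0;
}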
2063 1981
 /*
@@ -2085,7 +2003,7 @@ static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
 	 */
 	command = sbp2util_allocate_command_orb(scsi_id, SCpnt, done);
 	if (!command) {
-		return(-EIO);
+		return -EIO;
 	}
 
 	/*
@@ -2106,11 +2024,6 @@ static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
 	sbp2_create_command_orb(scsi_id, command, cmd, SCpnt->use_sg,
 				request_bufflen, SCpnt->request_buffer,
 				SCpnt->sc_data_direction);
-	/*
-	 * Update our cdb if necessary (to handle sbp2 RBC command set
-	 * differences). This is where the command set hacks go! =)
-	 */
-	sbp2_check_sbp2_command(scsi_id, command->command_orb.cdb);
 
 	sbp2util_packet_dump(&command->command_orb, sizeof(struct sbp2_command_orb),
 			     "sbp2 command orb", command->command_orb_dma);
@@ -2125,112 +2038,7 @@ static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
 	 */
 	sbp2_link_orb_command(scsi_id, command);
 
-	return(0);
+	return 0;
2129}
2130
2131
2132/*
2133 * This function deals with command set differences between Linux scsi
2134 * command set and sbp2 RBC command set.
2135 */
2136static void sbp2_check_sbp2_command(struct scsi_id_instance_data *scsi_id, unchar *cmd)
2137{
2138 unchar new_cmd[16];
2139 u8 device_type = SBP2_DEVICE_TYPE (scsi_id->sbp2_device_type_and_lun);
2140
2141 SBP2_DEBUG("sbp2_check_sbp2_command");
2142
2143 switch (*cmd) {
2144
2145 case READ_6:
2146
2147 if (sbp2_command_conversion_device_type(device_type)) {
2148
2149 SBP2_DEBUG("Convert READ_6 to READ_10");
2150
2151 /*
2152 * Need to turn read_6 into read_10
2153 */
2154 new_cmd[0] = 0x28;
2155 new_cmd[1] = (cmd[1] & 0xe0);
2156 new_cmd[2] = 0x0;
2157 new_cmd[3] = (cmd[1] & 0x1f);
2158 new_cmd[4] = cmd[2];
2159 new_cmd[5] = cmd[3];
2160 new_cmd[6] = 0x0;
2161 new_cmd[7] = 0x0;
2162 new_cmd[8] = cmd[4];
2163 new_cmd[9] = cmd[5];
2164
2165 memcpy(cmd, new_cmd, 10);
2166
2167 }
2168
2169 break;
2170
2171 case WRITE_6:
2172
2173 if (sbp2_command_conversion_device_type(device_type)) {
2174
2175 SBP2_DEBUG("Convert WRITE_6 to WRITE_10");
2176
2177 /*
2178 * Need to turn write_6 into write_10
2179 */
2180 new_cmd[0] = 0x2a;
2181 new_cmd[1] = (cmd[1] & 0xe0);
2182 new_cmd[2] = 0x0;
2183 new_cmd[3] = (cmd[1] & 0x1f);
2184 new_cmd[4] = cmd[2];
2185 new_cmd[5] = cmd[3];
2186 new_cmd[6] = 0x0;
2187 new_cmd[7] = 0x0;
2188 new_cmd[8] = cmd[4];
2189 new_cmd[9] = cmd[5];
2190
2191 memcpy(cmd, new_cmd, 10);
2192
2193 }
2194
2195 break;
2196
2197 case MODE_SENSE:
2198
2199 if (sbp2_command_conversion_device_type(device_type)) {
2200
2201 SBP2_DEBUG("Convert MODE_SENSE_6 to MODE_SENSE_10");
2202
2203 /*
2204 * Need to turn mode_sense_6 into mode_sense_10
2205 */
2206 new_cmd[0] = 0x5a;
2207 new_cmd[1] = cmd[1];
2208 new_cmd[2] = cmd[2];
2209 new_cmd[3] = 0x0;
2210 new_cmd[4] = 0x0;
2211 new_cmd[5] = 0x0;
2212 new_cmd[6] = 0x0;
2213 new_cmd[7] = 0x0;
2214 new_cmd[8] = cmd[4];
2215 new_cmd[9] = cmd[5];
2216
2217 memcpy(cmd, new_cmd, 10);
2218
2219 }
2220
2221 break;
2222
2223 case MODE_SELECT:
2224
2225 /*
2226 * TODO. Probably need to change mode select to 10 byte version
2227 */
2228
2229 default:
2230 break;
2231 }
2232
2233 return;
 }
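What the deleted READ_6 conversion actually computed, as a worked example with illustrative CDB bytes: READ_6 carries a 21-bit LBA split across bytes 1-3 and an 8-bit length in byte 4; READ_10 widens these to 32 and 16 bits. With sdev->use_10_for_rw set (see slave_configure below), the SCSI midlayer now issues READ(10)/WRITE(10) itself, which is why this conversion could go.

	/* READ_6: opcode 0x08, LBA 0x012345, 0x10 blocks */
	unchar cmd6[6]   = { 0x08, 0x01, 0x23, 0x45, 0x10, 0x00 };
	/* the equivalent READ_10 as built by the removed code above */
	unchar cmd10[10] = { 0x28, 0x00,		/* opcode, flags */
			     0x00, 0x01, 0x23, 0x45,	/* 32-bit LBA */
			     0x00, 0x00, 0x10,		/* group, 16-bit length */
			     0x00 };			/* control */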
 
 /*
@@ -2260,80 +2068,40 @@ static unsigned int sbp2_status_to_sense_data(unchar *sbp2_status, unchar *sense
 	sense_data[14] = sbp2_status[20];
 	sense_data[15] = sbp2_status[21];
 
-	return(sbp2_status[8] & 0x3f);	/* return scsi status */
+	return sbp2_status[8] & 0x3f;	/* return scsi status */
 }
 
 /*
  * This function is called after a command is completed, in order to do any necessary SBP-2
  * response data translations for the SCSI stack
  */
 static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
 				     struct scsi_cmnd *SCpnt)
 {
 	u8 *scsi_buf = SCpnt->request_buffer;
-	u8 device_type = SBP2_DEVICE_TYPE (scsi_id->sbp2_device_type_and_lun);
 
 	SBP2_DEBUG("sbp2_check_sbp2_response");
 
 	switch (SCpnt->cmnd[0]) {
 
 	case INQUIRY:
-
-		/*
-		 * If scsi_id->sbp2_device_type_and_lun is uninitialized, then fill
-		 * this information in from the inquiry response data. Lun is set to zero.
-		 */
-		if (scsi_id->sbp2_device_type_and_lun == SBP2_DEVICE_TYPE_LUN_UNINITIALIZED) {
-			SBP2_DEBUG("Creating sbp2_device_type_and_lun from scsi inquiry data");
-			scsi_id->sbp2_device_type_and_lun = (scsi_buf[0] & 0x1f) << 16;
-		}
-
 		/*
 		 * Make sure data length is ok. Minimum length is 36 bytes
 		 */
 		if (scsi_buf[4] == 0) {
 			scsi_buf[4] = 36 - 5;
 		}
 
-		/*
-		 * Check for Simple Direct Access Device and change it to TYPE_DISK
-		 */
-		if ((scsi_buf[0] & 0x1f) == TYPE_RBC) {
-			SBP2_DEBUG("Changing TYPE_RBC to TYPE_DISK");
-			scsi_buf[0] &= 0xe0;
-		}
-
 		/*
 		 * Fix ansi revision and response data format
 		 */
 		scsi_buf[2] |= 2;
 		scsi_buf[3] = (scsi_buf[3] & 0xf0) | 2;
 
 		break;
 
-	case MODE_SENSE:
-
-		if (sbp2_command_conversion_device_type(device_type)) {
-
-			SBP2_DEBUG("Modify mode sense response (10 byte version)");
-
-			scsi_buf[0] = scsi_buf[1];	/* Mode data length */
-			scsi_buf[1] = scsi_buf[2];	/* Medium type */
-			scsi_buf[2] = scsi_buf[3];	/* Device specific parameter */
-			scsi_buf[3] = scsi_buf[7];	/* Block descriptor length */
-			memcpy(scsi_buf + 4, scsi_buf + 8, scsi_buf[0]);
-		}
-
-		break;
-
-	case MODE_SELECT:
-
-		/*
-		 * TODO. Probably need to change mode select to 10 byte version
-		 */
-
 	default:
 		break;
 	}
 	return;
 }
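A quick worked illustration of the retained INQUIRY fixup: a standard INQUIRY response is at least 36 bytes, and byte 4 holds the "additional length" field, i.e. total length minus 5, so a device reporting 0 is patched before the response reaches the midlayer:

	u8 *inq = SCpnt->request_buffer;	/* at least 36 bytes */

	if (inq[4] == 0)
		inq[4] = 36 - 5;	/* additional length = 31 */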
@@ -2358,14 +2126,14 @@ static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int dest
 
 	if (!host) {
 		SBP2_ERR("host is NULL - this is bad!");
-		return(RCODE_ADDRESS_ERROR);
+		return RCODE_ADDRESS_ERROR;
 	}
 
 	hi = hpsb_get_hostinfo(&sbp2_highlevel, host);
 
 	if (!hi) {
 		SBP2_ERR("host info is NULL - this is bad!");
-		return(RCODE_ADDRESS_ERROR);
+		return RCODE_ADDRESS_ERROR;
 	}
 
 	/*
@@ -2382,7 +2150,7 @@ static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int dest
 
 	if (!scsi_id) {
 		SBP2_ERR("scsi_id is NULL - device is gone?");
-		return(RCODE_ADDRESS_ERROR);
+		return RCODE_ADDRESS_ERROR;
 	}
 
 	/*
@@ -2480,10 +2248,9 @@ static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int dest
 		SBP2_ORB_DEBUG("command orb completed");
 	}
 
-	return(RCODE_COMPLETE);
+	return RCODE_COMPLETE;
 }
 
-
 /**************************************
  * SCSI interface related section
  **************************************/
@@ -2541,6 +2308,16 @@ static int sbp2scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	}
 
 	/*
+	 * Bidirectional commands are not yet implemented,
+	 * and unknown transfer direction not handled.
+	 */
+	if (SCpnt->sc_data_direction == DMA_BIDIRECTIONAL) {
+		SBP2_ERR("Cannot handle DMA_BIDIRECTIONAL - rejecting command");
+		result = DID_ERROR << 16;
+		goto done;
+	}
+
+	/*
 	 * Try and send our SCSI command
 	 */
 	if (sbp2_send_command(scsi_id, SCpnt, done)) {
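The DID_* value lands in the host byte of the SCSI result word; the standard layout, for reference (the macro is illustrative, not from the driver):

/* driver byte 31:24 | host byte 23:16 | message byte 15:8 | status byte 7:0 */
#define EXAMPLE_RESULT(drv, host, msg, status) \
	(((drv) << 24) | ((host) << 16) | ((msg) << 8) | (status))

/* so EXAMPLE_RESULT(0, DID_ERROR, 0, 0) == DID_ERROR << 16, as used above */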
@@ -2616,55 +2393,56 @@ static void sbp2scsi_complete_command(struct scsi_id_instance_data *scsi_id,
 	 * complete the command, just let it get retried at the end of the
 	 * bus reset.
 	 */
-	if (!hpsb_node_entry_valid(scsi_id->ne) && (scsi_status != SBP2_SCSI_STATUS_GOOD)) {
+	if (!hpsb_node_entry_valid(scsi_id->ne)
+	    && (scsi_status != SBP2_SCSI_STATUS_GOOD)) {
 		SBP2_ERR("Bus reset in progress - retry command later");
 		return;
 	}
 
 	/*
 	 * Switch on scsi status
 	 */
 	switch (scsi_status) {
 	case SBP2_SCSI_STATUS_GOOD:
 		SCpnt->result = DID_OK;
 		break;
 
 	case SBP2_SCSI_STATUS_BUSY:
 		SBP2_ERR("SBP2_SCSI_STATUS_BUSY");
 		SCpnt->result = DID_BUS_BUSY << 16;
 		break;
 
 	case SBP2_SCSI_STATUS_CHECK_CONDITION:
 		SBP2_DEBUG("SBP2_SCSI_STATUS_CHECK_CONDITION");
 		SCpnt->result = CHECK_CONDITION << 1;
 
 		/*
 		 * Debug stuff
 		 */
 #if CONFIG_IEEE1394_SBP2_DEBUG >= 1
 		scsi_print_command(SCpnt);
 		scsi_print_sense("bh", SCpnt);
 #endif
 
 		break;
 
 	case SBP2_SCSI_STATUS_SELECTION_TIMEOUT:
 		SBP2_ERR("SBP2_SCSI_STATUS_SELECTION_TIMEOUT");
 		SCpnt->result = DID_NO_CONNECT << 16;
 		scsi_print_command(SCpnt);
 		break;
 
 	case SBP2_SCSI_STATUS_CONDITION_MET:
 	case SBP2_SCSI_STATUS_RESERVATION_CONFLICT:
 	case SBP2_SCSI_STATUS_COMMAND_TERMINATED:
 		SBP2_ERR("Bad SCSI status = %x", scsi_status);
 		SCpnt->result = DID_ERROR << 16;
 		scsi_print_command(SCpnt);
 		break;
 
 	default:
 		SBP2_ERR("Unsupported SCSI status = %x", scsi_status);
 		SCpnt->result = DID_ERROR << 16;
 	}
 
2670 /* 2448 /*
@@ -2678,7 +2456,8 @@ static void sbp2scsi_complete_command(struct scsi_id_instance_data *scsi_id,
2678 * If a bus reset is in progress and there was an error, complete 2456 * If a bus reset is in progress and there was an error, complete
2679 * the command as busy so that it will get retried. 2457 * the command as busy so that it will get retried.
2680 */ 2458 */
2681 if (!hpsb_node_entry_valid(scsi_id->ne) && (scsi_status != SBP2_SCSI_STATUS_GOOD)) { 2459 if (!hpsb_node_entry_valid(scsi_id->ne)
2460 && (scsi_status != SBP2_SCSI_STATUS_GOOD)) {
2682 SBP2_ERR("Completing command with busy (bus reset)"); 2461 SBP2_ERR("Completing command with busy (bus reset)");
2683 SCpnt->result = DID_BUS_BUSY << 16; 2462 SCpnt->result = DID_BUS_BUSY << 16;
2684 } 2463 }
@@ -2699,31 +2478,29 @@ static void sbp2scsi_complete_command(struct scsi_id_instance_data *scsi_id,
 	/*
 	 * Tell scsi stack that we're done with this command
 	 */
-	done (SCpnt);
+	done(SCpnt);
 }
 
-
 static int sbp2scsi_slave_alloc(struct scsi_device *sdev)
 {
 	((struct scsi_id_instance_data *)sdev->host->hostdata[0])->sdev = sdev;
 	return 0;
 }
 
-
 static int sbp2scsi_slave_configure(struct scsi_device *sdev)
 {
 	blk_queue_dma_alignment(sdev->request_queue, (512 - 1));
+	sdev->use_10_for_rw = 1;
+	sdev->use_10_for_ms = 1;
 	return 0;
 }
 
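Two notes on the configure hook: the use_10_* flags make the midlayer emit 10-byte READ/WRITE and MODE SENSE directly, superseding the driver's removed 6-to-10 conversions, and blk_queue_dma_alignment(q, 512 - 1) declares a 511-byte mask, i.e. buffers handed to this queue must sit on 512-byte boundaries. The mask semantics, illustrated:

static inline int example_is_dma_aligned(unsigned long addr)
{
	return (addr & (512 - 1)) == 0;	/* true only on 512-byte boundaries */
}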
-
 static void sbp2scsi_slave_destroy(struct scsi_device *sdev)
 {
 	((struct scsi_id_instance_data *)sdev->host->hostdata[0])->sdev = NULL;
 	return;
 }
 
-
 /*
  * Called by scsi stack when something has really gone wrong. Usually
  * called when a command has timed-out for some reason.
@@ -2769,7 +2546,7 @@ static int sbp2scsi_abort(struct scsi_cmnd *SCpnt)
 		sbp2scsi_complete_all_commands(scsi_id, DID_BUS_BUSY);
 	}
 
-	return(SUCCESS);
+	return SUCCESS;
 }
 
 /*
@@ -2779,28 +2556,20 @@ static int sbp2scsi_reset(struct scsi_cmnd *SCpnt)
 {
 	struct scsi_id_instance_data *scsi_id =
 	    (struct scsi_id_instance_data *)SCpnt->device->host->hostdata[0];
-	unsigned long flags;
 
 	SBP2_ERR("reset requested");
 
-	spin_lock_irqsave(SCpnt->device->host->host_lock, flags);
-
 	if (sbp2util_node_is_available(scsi_id)) {
 		SBP2_ERR("Generating sbp2 fetch agent reset");
 		sbp2_agent_reset(scsi_id, 0);
 	}
 
-	spin_unlock_irqrestore(SCpnt->device->host->host_lock, flags);
-
 	return SUCCESS;
 }
 
-static const char *sbp2scsi_info (struct Scsi_Host *host)
-{
-	return "SCSI emulation for IEEE-1394 SBP-2 Devices";
-}
-
-static ssize_t sbp2_sysfs_ieee1394_id_show(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t sbp2_sysfs_ieee1394_id_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
 {
 	struct scsi_device *sdev;
 	struct scsi_id_instance_data *scsi_id;
@@ -2812,10 +2581,7 @@ static ssize_t sbp2_sysfs_ieee1394_id_show(struct device *dev, struct device_att
 	if (!(scsi_id = (struct scsi_id_instance_data *)sdev->host->hostdata[0]))
 		return 0;
 
-	if (scsi_id->sbp2_device_type_and_lun == SBP2_DEVICE_TYPE_LUN_UNINITIALIZED)
-		lun = 0;
-	else
-		lun = ORB_SET_LUN(scsi_id->sbp2_device_type_and_lun);
+	lun = ORB_SET_LUN(scsi_id->sbp2_lun);
 
 	return sprintf(buf, "%016Lx:%d:%d\n", (unsigned long long)scsi_id->ne->guid,
 		       scsi_id->ud->id, lun);
@@ -2837,12 +2603,9 @@ static struct scsi_host_template scsi_driver_template = {
 	.module = THIS_MODULE,
 	.name = "SBP-2 IEEE-1394",
 	.proc_name = SBP2_DEVICE_NAME,
-	.info = sbp2scsi_info,
 	.queuecommand = sbp2scsi_queuecommand,
 	.eh_abort_handler = sbp2scsi_abort,
 	.eh_device_reset_handler = sbp2scsi_reset,
-	.eh_bus_reset_handler = sbp2scsi_reset,
-	.eh_host_reset_handler = sbp2scsi_reset,
 	.slave_alloc = sbp2scsi_slave_alloc,
 	.slave_configure = sbp2scsi_slave_configure,
 	.slave_destroy = sbp2scsi_slave_destroy,
@@ -2861,8 +2624,6 @@ static int sbp2_module_init(void)
 
 	SBP2_DEBUG("sbp2_module_init");
 
-	printk(KERN_INFO "sbp2: %s\n", version);
-
 	/* Module load debug option to force one command at a time (serializing I/O) */
 	if (serialize_io) {
 		SBP2_INFO("Driver forced to serialize I/O (serialize_io=1)");
@@ -2874,7 +2635,6 @@ static int sbp2_module_init(void)
 	/* Set max sectors (module load option). Default is 255 sectors. */
 	scsi_driver_template.max_sectors = max_sectors;
 
-
 	/* Register our high level driver with 1394 stack */
 	hpsb_register_highlevel(&sbp2_highlevel);
 
diff --git a/drivers/ieee1394/sbp2.h b/drivers/ieee1394/sbp2.h
index cd425be74841..900ea1d25e71 100644
--- a/drivers/ieee1394/sbp2.h
+++ b/drivers/ieee1394/sbp2.h
@@ -119,8 +119,8 @@ struct sbp2_query_logins_response {
 struct sbp2_reconnect_orb {
 	u32 reserved1;
 	u32 reserved2;
 	u32 reserved3;
 	u32 reserved4;
 	u32 login_ID_misc;
 	u32 reserved5;
 	u32 status_FIFO_hi;
@@ -130,8 +130,8 @@ struct sbp2_reconnect_orb {
 struct sbp2_logout_orb {
 	u32 reserved1;
 	u32 reserved2;
 	u32 reserved3;
 	u32 reserved4;
 	u32 login_ID_misc;
 	u32 reserved5;
 	u32 status_FIFO_hi;
@@ -188,7 +188,7 @@ struct sbp2_unrestricted_page_table {
 struct sbp2_status_block {
 	u32 ORB_offset_hi_misc;
 	u32 ORB_offset_lo;
 	u8 command_set_dependent[24];
 };
 
194/* 194/*
@@ -211,7 +211,7 @@ struct sbp2_status_block {
211 * specified for write posting, where the ohci controller will 211 * specified for write posting, where the ohci controller will
212 * automatically send an ack_complete when the status is written by the 212 * automatically send an ack_complete when the status is written by the
213 * sbp2 device... saving a split transaction. =) 213 * sbp2 device... saving a split transaction. =)
214 */ 214 */
215#define SBP2_STATUS_FIFO_ADDRESS 0xfffe00000000ULL 215#define SBP2_STATUS_FIFO_ADDRESS 0xfffe00000000ULL
216#define SBP2_STATUS_FIFO_ADDRESS_HI 0xfffe 216#define SBP2_STATUS_FIFO_ADDRESS_HI 0xfffe
217#define SBP2_STATUS_FIFO_ADDRESS_LO 0x0 217#define SBP2_STATUS_FIFO_ADDRESS_LO 0x0
@@ -229,9 +229,6 @@ struct sbp2_status_block {
 #define SBP2_DEVICE_TYPE_AND_LUN_KEY		0x14
 #define SBP2_FIRMWARE_REVISION_KEY		0x3c
 
-#define SBP2_DEVICE_TYPE(q)			(((q) >> 16) & 0x1f)
-#define SBP2_DEVICE_LUN(q)			((q) & 0xffff)
-
 #define SBP2_AGENT_STATE_OFFSET			0x00ULL
 #define SBP2_AGENT_RESET_OFFSET			0x04ULL
@@ -256,8 +253,6 @@ struct sbp2_status_block {
  */
 #define SBP2_128KB_BROKEN_FIRMWARE		0xa0b800
 
-#define SBP2_DEVICE_TYPE_LUN_UNINITIALIZED	0xffffffff
-
 /*
  * SCSI specific stuff
  */
@@ -265,45 +260,7 @@ struct sbp2_status_block {
 #define SBP2_MAX_SG_ELEMENT_LENGTH		0xf000
 #define SBP2_MAX_UDS_PER_NODE			16	/* Maximum scsi devices per node */
 #define SBP2_MAX_SECTORS			255	/* Max sectors supported */
-
+#define SBP2_MAX_CMDS				8	/* This should be safe */
269/*
270 * SCSI direction table...
271 * (now used as a back-up in case the direction passed down from above is "unknown")
272 *
273 * DIN = IN data direction
274 * DOU = OUT data direction
275 * DNO = No data transfer
276 * DUN = Unknown data direction
277 *
278 * Opcode 0xec (Teac specific "opc execute") possibly should be DNO,
279 * but we'll change it when somebody reports a problem with this.
280 */
281#define DIN ORB_DIRECTION_READ_FROM_MEDIA
282#define DOU ORB_DIRECTION_WRITE_TO_MEDIA
283#define DNO ORB_DIRECTION_NO_DATA_TRANSFER
284#define DUN DIN
285
286static unchar sbp2scsi_direction_table[0x100] = {
287 DNO,DNO,DIN,DIN,DOU,DIN,DIN,DOU,DIN,DUN,DOU,DOU,DUN,DUN,DUN,DIN,
288 DNO,DIN,DIN,DOU,DIN,DOU,DNO,DNO,DOU,DNO,DIN,DNO,DIN,DOU,DNO,DUN,
289 DIN,DUN,DIN,DIN,DOU,DIN,DUN,DUN,DIN,DIN,DOU,DNO,DUN,DIN,DOU,DOU,
290 DOU,DOU,DOU,DNO,DIN,DNO,DNO,DIN,DOU,DOU,DOU,DOU,DIN,DOU,DIN,DOU,
291 DOU,DOU,DIN,DIN,DIN,DNO,DIN,DNO,DNO,DNO,DUN,DNO,DOU,DIN,DNO,DUN,
292 DUN,DIN,DIN,DNO,DNO,DOU,DUN,DUN,DNO,DIN,DIN,DNO,DIN,DOU,DUN,DUN,
293 DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
294 DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
295 DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
296 DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
297 DUN,DNO,DOU,DOU,DIN,DNO,DNO,DNO,DIN,DNO,DOU,DUN,DNO,DIN,DOU,DOU,
298 DOU,DOU,DOU,DNO,DUN,DIN,DOU,DIN,DIN,DIN,DNO,DNO,DNO,DIN,DIN,DUN,
299 DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
300 DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,
301 DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DOU,DUN,DUN,DUN,DUN,DUN,
302 DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN,DUN
303};
304
305/* This should be safe */
306#define SBP2_MAX_CMDS 8
307 264
308/* This is the two dma types we use for cmd_dma below */ 265/* This is the two dma types we use for cmd_dma below */
309enum cmd_dma_types { 266enum cmd_dma_types {
@@ -338,10 +295,8 @@ struct sbp2_command_info {
338#define SBP2_BREAKAGE_128K_MAX_TRANSFER 0x1 295#define SBP2_BREAKAGE_128K_MAX_TRANSFER 0x1
339#define SBP2_BREAKAGE_INQUIRY_HACK 0x2 296#define SBP2_BREAKAGE_INQUIRY_HACK 0x2
340 297
341
342struct sbp2scsi_host_info; 298struct sbp2scsi_host_info;
343 299
344
345/* 300/*
346 * Information needed on a per scsi id basis (one for each sbp2 device) 301 * Information needed on a per scsi id basis (one for each sbp2 device)
347 */ 302 */
@@ -379,7 +334,7 @@ struct scsi_id_instance_data {
379 u32 sbp2_command_set_spec_id; 334 u32 sbp2_command_set_spec_id;
380 u32 sbp2_command_set; 335 u32 sbp2_command_set;
381 u32 sbp2_unit_characteristics; 336 u32 sbp2_unit_characteristics;
382 u32 sbp2_device_type_and_lun; 337 u32 sbp2_lun;
383 u32 sbp2_firmware_revision; 338 u32 sbp2_firmware_revision;
384 339
385 /* 340 /*
@@ -411,7 +366,6 @@ struct scsi_id_instance_data {
411 u32 workarounds; 366 u32 workarounds;
412}; 367};
413 368
414
415/* Sbp2 host data structure (one per IEEE1394 host) */ 369/* Sbp2 host data structure (one per IEEE1394 host) */
416struct sbp2scsi_host_info { 370struct sbp2scsi_host_info {
417 struct hpsb_host *host; /* IEEE1394 host */ 371 struct hpsb_host *host; /* IEEE1394 host */
@@ -456,20 +410,12 @@ static int sbp2_logout_device(struct scsi_id_instance_data *scsi_id);
456static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int destid, 410static int sbp2_handle_status_write(struct hpsb_host *host, int nodeid, int destid,
457 quadlet_t *data, u64 addr, size_t length, u16 flags); 411 quadlet_t *data, u64 addr, size_t length, u16 flags);
458static int sbp2_agent_reset(struct scsi_id_instance_data *scsi_id, int wait); 412static int sbp2_agent_reset(struct scsi_id_instance_data *scsi_id, int wait);
459static int sbp2_create_command_orb(struct scsi_id_instance_data *scsi_id,
460 struct sbp2_command_info *command,
461 unchar *scsi_cmd,
462 unsigned int scsi_use_sg,
463 unsigned int scsi_request_bufflen,
464 void *scsi_request_buffer,
465 enum dma_data_direction dma_dir);
466static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id, 413static int sbp2_link_orb_command(struct scsi_id_instance_data *scsi_id,
467 struct sbp2_command_info *command); 414 struct sbp2_command_info *command);
468static int sbp2_send_command(struct scsi_id_instance_data *scsi_id, 415static int sbp2_send_command(struct scsi_id_instance_data *scsi_id,
469 struct scsi_cmnd *SCpnt, 416 struct scsi_cmnd *SCpnt,
470 void (*done)(struct scsi_cmnd *)); 417 void (*done)(struct scsi_cmnd *));
471static unsigned int sbp2_status_to_sense_data(unchar *sbp2_status, unchar *sense_data); 418static unsigned int sbp2_status_to_sense_data(unchar *sbp2_status, unchar *sense_data);
472static void sbp2_check_sbp2_command(struct scsi_id_instance_data *scsi_id, unchar *cmd);
473static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id, 419static void sbp2_check_sbp2_response(struct scsi_id_instance_data *scsi_id,
474 struct scsi_cmnd *SCpnt); 420 struct scsi_cmnd *SCpnt);
475static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id, 421static void sbp2_parse_unit_directory(struct scsi_id_instance_data *scsi_id,
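The removals above track a matching change in sbp2.c: the 256-entry per-opcode direction table is gone because the SCSI midlayer already hands the driver a data direction with every command, so keeping a parallel table only invited skew. A minimal sketch of the replacement idiom; this is illustrative, not the literal driver code, and sbp2_command_direction is a made-up name:

	#include <linux/dma-mapping.h>
	#include <scsi/scsi_cmnd.h>

	/* The midlayer sets sc_data_direction (DMA_TO_DEVICE,
	 * DMA_FROM_DEVICE, DMA_NONE or DMA_BIDIRECTIONAL) on every
	 * command, so the driver can pass it straight to the DMA
	 * mapping code instead of guessing from the opcode. */
	static enum dma_data_direction
	sbp2_command_direction(struct scsi_cmnd *SCpnt)
	{
		return SCpnt->sc_data_direction;
	}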
diff --git a/drivers/ieee1394/video1394.c b/drivers/ieee1394/video1394.c
index 23911da50154..608479b2df14 100644
--- a/drivers/ieee1394/video1394.c
+++ b/drivers/ieee1394/video1394.c
@@ -19,12 +19,6 @@
19 * 19 *
20 * NOTES: 20 * NOTES:
21 * 21 *
22 * jds -- add private data to file to keep track of iso contexts associated
23 * with each open -- so release won't kill all iso transfers.
24 *
25 * Damien Douxchamps: Fix failure when the number of DMA pages per frame is
26 * one.
27 *
28 * ioctl return codes: 22 * ioctl return codes:
29 * EFAULT is only for invalid address for the argp 23 * EFAULT is only for invalid address for the argp
30 * EINVAL for out of range values 24 * EINVAL for out of range values
@@ -34,12 +28,6 @@
34 * ENOTTY for unsupported ioctl request 28 * ENOTTY for unsupported ioctl request
35 * 29 *
36 */ 30 */
37
38/* Markus Tavenrath <speedygoo@speedygoo.de> :
39 - fixed checks for valid buffer-numbers in video1394_icotl
40 - changed the ways the dma prg's are used, now it's possible to use
41 even a single dma buffer
42*/
43#include <linux/config.h> 31#include <linux/config.h>
44#include <linux/kernel.h> 32#include <linux/kernel.h>
45#include <linux/list.h> 33#include <linux/list.h>
@@ -77,14 +65,6 @@
77 65
78#define ISO_CHANNELS 64 66#define ISO_CHANNELS 64
79 67
80#ifndef virt_to_page
81#define virt_to_page(x) MAP_NR(x)
82#endif
83
84#ifndef vmalloc_32
85#define vmalloc_32(x) vmalloc(x)
86#endif
87
88struct it_dma_prg { 68struct it_dma_prg {
89 struct dma_cmd begin; 69 struct dma_cmd begin;
90 quadlet_t data[4]; 70 quadlet_t data[4];
@@ -206,14 +186,12 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
206 struct dma_iso_ctx *d; 186 struct dma_iso_ctx *d;
207 int i; 187 int i;
208 188
209 d = kmalloc(sizeof(struct dma_iso_ctx), GFP_KERNEL); 189 d = kzalloc(sizeof(*d), GFP_KERNEL);
210 if (d == NULL) { 190 if (!d) {
211 PRINT(KERN_ERR, ohci->host->id, "Failed to allocate dma_iso_ctx"); 191 PRINT(KERN_ERR, ohci->host->id, "Failed to allocate dma_iso_ctx");
212 return NULL; 192 return NULL;
213 } 193 }
214 194
215 memset(d, 0, sizeof *d);
216
217 d->ohci = ohci; 195 d->ohci = ohci;
218 d->type = type; 196 d->type = type;
219 d->channel = channel; 197 d->channel = channel;
@@ -251,9 +229,8 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 	}
 	d->ctx = d->iso_tasklet.context;
 
-	d->prg_reg = kmalloc(d->num_desc * sizeof(struct dma_prog_region),
-			GFP_KERNEL);
-	if (d->prg_reg == NULL) {
+	d->prg_reg = kmalloc(d->num_desc * sizeof(*d->prg_reg), GFP_KERNEL);
+	if (!d->prg_reg) {
 		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate ir prg regs");
 		free_dma_iso_ctx(d);
 		return NULL;
@@ -268,15 +245,14 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 	d->cmdPtr = OHCI1394_IsoRcvCommandPtr+32*d->ctx;
 	d->ctxMatch = OHCI1394_IsoRcvContextMatch+32*d->ctx;
 
-	d->ir_prg = kmalloc(d->num_desc * sizeof(struct dma_cmd *),
+	d->ir_prg = kzalloc(d->num_desc * sizeof(*d->ir_prg),
 			    GFP_KERNEL);
 
-	if (d->ir_prg == NULL) {
+	if (!d->ir_prg) {
 		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate dma ir prg");
 		free_dma_iso_ctx(d);
 		return NULL;
 	}
-	memset(d->ir_prg, 0, d->num_desc * sizeof(struct dma_cmd *));
 
 	d->nb_cmd = d->buf_size / PAGE_SIZE + 1;
 	d->left_size = (d->frame_size % PAGE_SIZE) ?
@@ -297,16 +273,15 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 	d->ctrlClear = OHCI1394_IsoXmitContextControlClear+16*d->ctx;
 	d->cmdPtr = OHCI1394_IsoXmitCommandPtr+16*d->ctx;
 
-	d->it_prg = kmalloc(d->num_desc * sizeof(struct it_dma_prg *),
+	d->it_prg = kzalloc(d->num_desc * sizeof(*d->it_prg),
 			GFP_KERNEL);
 
-	if (d->it_prg == NULL) {
+	if (!d->it_prg) {
 		PRINT(KERN_ERR, ohci->host->id,
 		      "Failed to allocate dma it prg");
 		free_dma_iso_ctx(d);
 		return NULL;
 	}
-	memset(d->it_prg, 0, d->num_desc*sizeof(struct it_dma_prg *));
 
 	d->packet_size = packet_size;
 
@@ -337,47 +312,24 @@ alloc_dma_iso_ctx(struct ti_ohci *ohci, int type, int num_desc,
 		}
 	}
 
-	d->buffer_status = kmalloc(d->num_desc * sizeof(unsigned int),
-				   GFP_KERNEL);
-	d->buffer_prg_assignment = kmalloc(d->num_desc * sizeof(unsigned int),
-				   GFP_KERNEL);
-	d->buffer_time = kmalloc(d->num_desc * sizeof(struct timeval),
-				   GFP_KERNEL);
-	d->last_used_cmd = kmalloc(d->num_desc * sizeof(unsigned int),
-				   GFP_KERNEL);
-	d->next_buffer = kmalloc(d->num_desc * sizeof(int),
-				 GFP_KERNEL);
+	d->buffer_status =
+	    kzalloc(d->num_desc * sizeof(*d->buffer_status), GFP_KERNEL);
+	d->buffer_prg_assignment =
+	    kzalloc(d->num_desc * sizeof(*d->buffer_prg_assignment), GFP_KERNEL);
+	d->buffer_time =
+	    kzalloc(d->num_desc * sizeof(*d->buffer_time), GFP_KERNEL);
+	d->last_used_cmd =
+	    kzalloc(d->num_desc * sizeof(*d->last_used_cmd), GFP_KERNEL);
+	d->next_buffer =
+	    kzalloc(d->num_desc * sizeof(*d->next_buffer), GFP_KERNEL);
 
-	if (d->buffer_status == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate buffer_status");
-		free_dma_iso_ctx(d);
-		return NULL;
-	}
-	if (d->buffer_prg_assignment == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate buffer_prg_assignment");
-		free_dma_iso_ctx(d);
-		return NULL;
-	}
-	if (d->buffer_time == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate buffer_time");
-		free_dma_iso_ctx(d);
-		return NULL;
-	}
-	if (d->last_used_cmd == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate last_used_cmd");
-		free_dma_iso_ctx(d);
-		return NULL;
-	}
-	if (d->next_buffer == NULL) {
-		PRINT(KERN_ERR, ohci->host->id, "Failed to allocate next_buffer");
+	if (!d->buffer_status || !d->buffer_prg_assignment || !d->buffer_time ||
+	    !d->last_used_cmd || !d->next_buffer) {
+		PRINT(KERN_ERR, ohci->host->id,
+		      "Failed to allocate dma_iso_ctx member");
 		free_dma_iso_ctx(d);
 		return NULL;
 	}
-	memset(d->buffer_status, 0, d->num_desc * sizeof(unsigned int));
-	memset(d->buffer_prg_assignment, 0, d->num_desc * sizeof(unsigned int));
-	memset(d->buffer_time, 0, d->num_desc * sizeof(struct timeval));
-	memset(d->last_used_cmd, 0, d->num_desc * sizeof(unsigned int));
-	memset(d->next_buffer, -1, d->num_desc * sizeof(int));
 
 	spin_lock_init(&d->lock);
 
@@ -539,7 +491,7 @@ static void wakeup_dma_ir_ctx(unsigned long l)
539 if (d->ir_prg[i][d->nb_cmd-1].status & cpu_to_le32(0xFFFF0000)) { 491 if (d->ir_prg[i][d->nb_cmd-1].status & cpu_to_le32(0xFFFF0000)) {
540 reset_ir_status(d, i); 492 reset_ir_status(d, i);
541 d->buffer_status[d->buffer_prg_assignment[i]] = VIDEO1394_BUFFER_READY; 493 d->buffer_status[d->buffer_prg_assignment[i]] = VIDEO1394_BUFFER_READY;
542 do_gettimeofday(&d->buffer_time[i]); 494 do_gettimeofday(&d->buffer_time[d->buffer_prg_assignment[i]]);
543 } 495 }
544 } 496 }
545 497
@@ -1046,7 +998,6 @@ static int __video1394_ioctl(struct file *file,
1046 998
1047 /* set time of buffer */ 999 /* set time of buffer */
1048 v.filltime = d->buffer_time[v.buffer]; 1000 v.filltime = d->buffer_time[v.buffer];
1049// printk("Buffer %d time %d\n", v.buffer, (d->buffer_time[v.buffer]).tv_usec);
1050 1001
1051 /* 1002 /*
1052 * Look ahead to see how many more buffers have been received 1003 * Look ahead to see how many more buffers have been received
@@ -1085,7 +1036,7 @@ static int __video1394_ioctl(struct file *file,
1085 } 1036 }
1086 1037
1087 if (d->flags & VIDEO1394_VARIABLE_PACKET_SIZE) { 1038 if (d->flags & VIDEO1394_VARIABLE_PACKET_SIZE) {
1088 int buf_size = d->nb_cmd * sizeof(unsigned int); 1039 int buf_size = d->nb_cmd * sizeof(*psizes);
1089 struct video1394_queue_variable __user *p = argp; 1040 struct video1394_queue_variable __user *p = argp;
1090 unsigned int __user *qv; 1041 unsigned int __user *qv;
1091 1042
@@ -1104,7 +1055,7 @@ static int __video1394_ioctl(struct file *file,
1104 1055
1105 spin_lock_irqsave(&d->lock,flags); 1056 spin_lock_irqsave(&d->lock,flags);
1106 1057
1107 // last_buffer is last_prg 1058 /* last_buffer is last_prg */
1108 next_prg = (d->last_buffer + 1) % d->num_desc; 1059 next_prg = (d->last_buffer + 1) % d->num_desc;
1109 if (d->buffer_status[v.buffer]!=VIDEO1394_BUFFER_FREE) { 1060 if (d->buffer_status[v.buffer]!=VIDEO1394_BUFFER_FREE) {
1110 PRINT(KERN_ERR, ohci->host->id, 1061 PRINT(KERN_ERR, ohci->host->id,
@@ -1251,13 +1202,12 @@ static int video1394_open(struct inode *inode, struct file *file)
1251 if (ohci == NULL) 1202 if (ohci == NULL)
1252 return -EIO; 1203 return -EIO;
1253 1204
1254 ctx = kmalloc(sizeof(struct file_ctx), GFP_KERNEL); 1205 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1255 if (ctx == NULL) { 1206 if (!ctx) {
1256 PRINT(KERN_ERR, ohci->host->id, "Cannot malloc file_ctx"); 1207 PRINT(KERN_ERR, ohci->host->id, "Cannot malloc file_ctx");
1257 return -ENOMEM; 1208 return -ENOMEM;
1258 } 1209 }
1259 1210
1260 memset(ctx, 0, sizeof(struct file_ctx));
1261 ctx->ohci = ohci; 1211 ctx->ohci = ohci;
1262 INIT_LIST_HEAD(&ctx->context_list); 1212 INIT_LIST_HEAD(&ctx->context_list);
1263 ctx->current_ctx = NULL; 1213 ctx->current_ctx = NULL;
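Most of the churn in video1394.c above is the conversion from the two-step kmalloc() plus memset() pattern to kzalloc(), along with collapsing five separate NULL checks into one combined test. The idiom, as a generic sketch (struct foo and alloc_foo are placeholders, not driver names):

	#include <linux/slab.h>

	struct foo {
		int a;
		char name[16];
	};

	static struct foo *alloc_foo(void)
	{
		/* Replaces kmalloc() followed by memset(f, 0, ...):
		 * kzalloc() allocates and zeroes in one call, and
		 * sizeof(*f) stays correct if f's type ever changes. */
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		return f;	/* NULL on allocation failure */
	}

One behavioural detail visible in the hunk above: the old code initialized next_buffer with memset(..., -1, ...), while kzalloc() zero-fills, so that array now starts out at 0 rather than -1.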
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 9f2352bd8348..a1e660e3531d 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -157,7 +157,7 @@ struct input_event_compat {
157# define COMPAT_TEST test_thread_flag(TIF_IA32) 157# define COMPAT_TEST test_thread_flag(TIF_IA32)
158#elif defined(CONFIG_IA64) 158#elif defined(CONFIG_IA64)
159# define COMPAT_TEST IS_IA32_PROCESS(ia64_task_regs(current)) 159# define COMPAT_TEST IS_IA32_PROCESS(ia64_task_regs(current))
160#elif defined(CONFIG_ARCH_S390) 160#elif defined(CONFIG_S390)
161# define COMPAT_TEST test_thread_flag(TIF_31BIT) 161# define COMPAT_TEST test_thread_flag(TIF_31BIT)
162#elif defined(CONFIG_MIPS) 162#elif defined(CONFIG_MIPS)
163# define COMPAT_TEST (current->thread.mflags & MF_32BIT_ADDR) 163# define COMPAT_TEST (current->thread.mflags & MF_32BIT_ADDR)
diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index 02a3117ef92d..5ebfd1d138da 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -52,6 +52,7 @@ static char *sensor_location[3] = {NULL, NULL, NULL};
52 52
53static int limit_adjust = 0; 53static int limit_adjust = 0;
54static int fan_speed = -1; 54static int fan_speed = -1;
55static int verbose = 0;
55 56
56MODULE_AUTHOR("Colin Leroy <colin@colino.net>"); 57MODULE_AUTHOR("Colin Leroy <colin@colino.net>");
57MODULE_DESCRIPTION("Driver for ADT746x thermostat in iBook G4 and " 58MODULE_DESCRIPTION("Driver for ADT746x thermostat in iBook G4 and "
@@ -66,6 +67,10 @@ module_param(fan_speed, int, 0644);
66MODULE_PARM_DESC(fan_speed,"Specify starting fan speed (0-255) " 67MODULE_PARM_DESC(fan_speed,"Specify starting fan speed (0-255) "
67 "(default 64)"); 68 "(default 64)");
68 69
70module_param(verbose, bool, 0);
71MODULE_PARM_DESC(verbose,"Verbose log operations "
72 "(default 0)");
73
69struct thermostat { 74struct thermostat {
70 struct i2c_client clt; 75 struct i2c_client clt;
71 u8 temps[3]; 76 u8 temps[3];
@@ -149,13 +154,13 @@ detach_thermostat(struct i2c_adapter *adapter)
149 if (thread_therm != NULL) { 154 if (thread_therm != NULL) {
150 kthread_stop(thread_therm); 155 kthread_stop(thread_therm);
151 } 156 }
152 157
153 printk(KERN_INFO "adt746x: Putting max temperatures back from " 158 printk(KERN_INFO "adt746x: Putting max temperatures back from "
154 "%d, %d, %d to %d, %d, %d\n", 159 "%d, %d, %d to %d, %d, %d\n",
155 th->limits[0], th->limits[1], th->limits[2], 160 th->limits[0], th->limits[1], th->limits[2],
156 th->initial_limits[0], th->initial_limits[1], 161 th->initial_limits[0], th->initial_limits[1],
157 th->initial_limits[2]); 162 th->initial_limits[2]);
158 163
159 for (i = 0; i < 3; i++) 164 for (i = 0; i < 3; i++)
160 write_reg(th, LIMIT_REG[i], th->initial_limits[i]); 165 write_reg(th, LIMIT_REG[i], th->initial_limits[i]);
161 166
@@ -212,12 +217,14 @@ static void write_fan_speed(struct thermostat *th, int speed, int fan)
 		return;
 
 	if (th->last_speed[fan] != speed) {
-		if (speed == -1)
-			printk(KERN_DEBUG "adt746x: Setting speed to automatic "
-				"for %s fan.\n", sensor_location[fan+1]);
-		else
-			printk(KERN_DEBUG "adt746x: Setting speed to %d "
-				"for %s fan.\n", speed, sensor_location[fan+1]);
+		if (verbose) {
+			if (speed == -1)
+				printk(KERN_DEBUG "adt746x: Setting speed to automatic "
+					"for %s fan.\n", sensor_location[fan+1]);
+			else
+				printk(KERN_DEBUG "adt746x: Setting speed to %d "
+					"for %s fan.\n", speed, sensor_location[fan+1]);
+		}
 	} else
 		return;
 
@@ -298,10 +305,11 @@ static void update_fans_speed (struct thermostat *th)
 			if (new_speed > 255)
 				new_speed = 255;
 
-			printk(KERN_DEBUG "adt746x: setting fans speed to %d "
-				"(limit exceeded by %d on %s) \n",
-				new_speed, var,
-				sensor_location[fan_number+1]);
+			if (verbose)
+				printk(KERN_DEBUG "adt746x: Setting fans speed to %d "
+					"(limit exceeded by %d on %s) \n",
+					new_speed, var,
+					sensor_location[fan_number+1]);
 			write_both_fan_speed(th, new_speed);
 			th->last_var[fan_number] = var;
 		} else if (var < -2) {
@@ -309,8 +317,9 @@ static void update_fans_speed (struct thermostat *th)
 			 * so cold (lastvar >= -1) */
 			if (i == 2 && lastvar < -1) {
 				if (th->last_speed[fan_number] != 0)
-					printk(KERN_DEBUG "adt746x: Stopping "
-						"fans.\n");
+					if (verbose)
+						printk(KERN_DEBUG "adt746x: Stopping "
+							"fans.\n");
 				write_both_fan_speed(th, 0);
 			}
 		}
@@ -406,7 +415,7 @@ static int attach_one_thermostat(struct i2c_adapter *adapter, int addr,
406 th->initial_limits[i] = read_reg(th, LIMIT_REG[i]); 415 th->initial_limits[i] = read_reg(th, LIMIT_REG[i]);
407 set_limit(th, i); 416 set_limit(th, i);
408 } 417 }
409 418
410 printk(KERN_INFO "adt746x: Lowering max temperatures from %d, %d, %d" 419 printk(KERN_INFO "adt746x: Lowering max temperatures from %d, %d, %d"
411 " to %d, %d, %d\n", 420 " to %d, %d, %d\n",
412 th->initial_limits[0], th->initial_limits[1], 421 th->initial_limits[0], th->initial_limits[1],
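The new verbose parameter makes the driver's debug chatter opt-in instead of unconditional. The pattern, reduced to a sketch (report_fan_speed is a made-up helper for illustration):

	#include <linux/kernel.h>
	#include <linux/module.h>

	static int verbose = 0;
	module_param(verbose, bool, 0);
	MODULE_PARM_DESC(verbose, "Verbose log operations (default 0)");

	/* Debug printks fire only when the module was loaded with
	 * verbose=1; normal operation stays quiet. */
	static void report_fan_speed(int speed)
	{
		if (verbose)
			printk(KERN_DEBUG "adt746x: Setting speed to %d\n",
			       speed);
	}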
diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
index cf72b782f60f..8d0958c38b6b 100644
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -1988,18 +1988,13 @@ static void fcu_lookup_fans(struct device_node *fcu_node)
 
 static int fcu_of_probe(struct of_device* dev, const struct of_device_id *match)
 {
-	int rc;
-
 	state = state_detached;
 
 	/* Lookup the fans in the device tree */
 	fcu_lookup_fans(dev->node);
 
 	/* Add the driver */
-	rc = i2c_add_driver(&therm_pm72_driver);
-	if (rc < 0)
-		return rc;
-	return 0;
+	return i2c_add_driver(&therm_pm72_driver);
 }
 
 static int fcu_of_remove(struct of_device* dev)
diff --git a/drivers/macintosh/windfarm_lm75_sensor.c b/drivers/macintosh/windfarm_lm75_sensor.c
index fd16642d98ab..57460e46c89f 100644
--- a/drivers/macintosh/windfarm_lm75_sensor.c
+++ b/drivers/macintosh/windfarm_lm75_sensor.c
@@ -240,12 +240,7 @@ static int wf_lm75_detach(struct i2c_client *client)
 
 static int __init wf_lm75_sensor_init(void)
 {
-	int rc;
-
-	rc = i2c_add_driver(&wf_lm75_driver);
-	if (rc < 0)
-		return rc;
-	return 0;
+	return i2c_add_driver(&wf_lm75_driver);
 }
 
 static void __exit wf_lm75_sensor_exit(void)
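This cleanup, and the identical one in therm_pm72.c above, is safe only because i2c_add_driver() returns 0 on success or a negative errno, never a positive value, so the rc < 0 test and the trailing return 0 added nothing:

	/* before */
	int rc = i2c_add_driver(&wf_lm75_driver);
	if (rc < 0)
		return rc;
	return 0;

	/* after: behaviourally identical for a 0-or-negative callee */
	return i2c_add_driver(&wf_lm75_driver);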
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 252d55df9642..76a189ceb529 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -315,6 +315,8 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
315 if (bitmap->file == NULL) 315 if (bitmap->file == NULL)
316 return write_sb_page(bitmap->mddev, bitmap->offset, page, wait); 316 return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
317 317
318 flush_dcache_page(page); /* make sure visible to anyone reading the file */
319
318 if (wait) 320 if (wait)
319 lock_page(page); 321 lock_page(page);
320 else { 322 else {
@@ -341,7 +343,7 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
341 /* add to list to be waited for by daemon */ 343 /* add to list to be waited for by daemon */
342 struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO); 344 struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
343 item->page = page; 345 item->page = page;
344 page_cache_get(page); 346 get_page(page);
345 spin_lock(&bitmap->write_lock); 347 spin_lock(&bitmap->write_lock);
346 list_add(&item->list, &bitmap->complete_pages); 348 list_add(&item->list, &bitmap->complete_pages);
347 spin_unlock(&bitmap->write_lock); 349 spin_unlock(&bitmap->write_lock);
@@ -357,10 +359,10 @@ static struct page *read_page(struct file *file, unsigned long index,
357 struct inode *inode = file->f_mapping->host; 359 struct inode *inode = file->f_mapping->host;
358 struct page *page = NULL; 360 struct page *page = NULL;
359 loff_t isize = i_size_read(inode); 361 loff_t isize = i_size_read(inode);
360 unsigned long end_index = isize >> PAGE_CACHE_SHIFT; 362 unsigned long end_index = isize >> PAGE_SHIFT;
361 363
362 PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE, 364 PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_SIZE,
363 (unsigned long long)index << PAGE_CACHE_SHIFT); 365 (unsigned long long)index << PAGE_SHIFT);
364 366
365 page = read_cache_page(inode->i_mapping, index, 367 page = read_cache_page(inode->i_mapping, index,
366 (filler_t *)inode->i_mapping->a_ops->readpage, file); 368 (filler_t *)inode->i_mapping->a_ops->readpage, file);
@@ -368,7 +370,7 @@ static struct page *read_page(struct file *file, unsigned long index,
368 goto out; 370 goto out;
369 wait_on_page_locked(page); 371 wait_on_page_locked(page);
370 if (!PageUptodate(page) || PageError(page)) { 372 if (!PageUptodate(page) || PageError(page)) {
371 page_cache_release(page); 373 put_page(page);
372 page = ERR_PTR(-EIO); 374 page = ERR_PTR(-EIO);
373 goto out; 375 goto out;
374 } 376 }
@@ -376,14 +378,14 @@ static struct page *read_page(struct file *file, unsigned long index,
376 if (index > end_index) /* we have read beyond EOF */ 378 if (index > end_index) /* we have read beyond EOF */
377 *bytes_read = 0; 379 *bytes_read = 0;
378 else if (index == end_index) /* possible short read */ 380 else if (index == end_index) /* possible short read */
379 *bytes_read = isize & ~PAGE_CACHE_MASK; 381 *bytes_read = isize & ~PAGE_MASK;
380 else 382 else
381 *bytes_read = PAGE_CACHE_SIZE; /* got a full page */ 383 *bytes_read = PAGE_SIZE; /* got a full page */
382out: 384out:
383 if (IS_ERR(page)) 385 if (IS_ERR(page))
384 printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", 386 printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
385 (int)PAGE_CACHE_SIZE, 387 (int)PAGE_SIZE,
386 (unsigned long long)index << PAGE_CACHE_SHIFT, 388 (unsigned long long)index << PAGE_SHIFT,
387 PTR_ERR(page)); 389 PTR_ERR(page));
388 return page; 390 return page;
389} 391}
@@ -406,11 +408,11 @@ int bitmap_update_sb(struct bitmap *bitmap)
406 return 0; 408 return 0;
407 } 409 }
408 spin_unlock_irqrestore(&bitmap->lock, flags); 410 spin_unlock_irqrestore(&bitmap->lock, flags);
409 sb = (bitmap_super_t *)kmap(bitmap->sb_page); 411 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
410 sb->events = cpu_to_le64(bitmap->mddev->events); 412 sb->events = cpu_to_le64(bitmap->mddev->events);
411 if (!bitmap->mddev->degraded) 413 if (!bitmap->mddev->degraded)
412 sb->events_cleared = cpu_to_le64(bitmap->mddev->events); 414 sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
413 kunmap(bitmap->sb_page); 415 kunmap_atomic(sb, KM_USER0);
414 return write_page(bitmap, bitmap->sb_page, 1); 416 return write_page(bitmap, bitmap->sb_page, 1);
415} 417}
416 418
@@ -421,7 +423,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
421 423
422 if (!bitmap || !bitmap->sb_page) 424 if (!bitmap || !bitmap->sb_page)
423 return; 425 return;
424 sb = (bitmap_super_t *)kmap(bitmap->sb_page); 426 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
425 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 427 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
426 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 428 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
427 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); 429 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -440,7 +442,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
440 printk(KERN_DEBUG " sync size: %llu KB\n", 442 printk(KERN_DEBUG " sync size: %llu KB\n",
441 (unsigned long long)le64_to_cpu(sb->sync_size)/2); 443 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
442 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); 444 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
443 kunmap(bitmap->sb_page); 445 kunmap_atomic(sb, KM_USER0);
444} 446}
445 447
446/* read the superblock from the bitmap file and initialize some bitmap fields */ 448/* read the superblock from the bitmap file and initialize some bitmap fields */
@@ -466,7 +468,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
466 return err; 468 return err;
467 } 469 }
468 470
469 sb = (bitmap_super_t *)kmap(bitmap->sb_page); 471 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
470 472
471 if (bytes_read < sizeof(*sb)) { /* short read */ 473 if (bytes_read < sizeof(*sb)) { /* short read */
472 printk(KERN_INFO "%s: bitmap file superblock truncated\n", 474 printk(KERN_INFO "%s: bitmap file superblock truncated\n",
@@ -485,12 +487,12 @@ static int bitmap_read_sb(struct bitmap *bitmap)
485 else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || 487 else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
486 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) 488 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
487 reason = "unrecognized superblock version"; 489 reason = "unrecognized superblock version";
488 else if (chunksize < 512 || chunksize > (1024 * 1024 * 4)) 490 else if (chunksize < PAGE_SIZE)
489 reason = "bitmap chunksize out of range (512B - 4MB)"; 491 reason = "bitmap chunksize too small";
490 else if ((1 << ffz(~chunksize)) != chunksize) 492 else if ((1 << ffz(~chunksize)) != chunksize)
491 reason = "bitmap chunksize not a power of 2"; 493 reason = "bitmap chunksize not a power of 2";
492 else if (daemon_sleep < 1 || daemon_sleep > 15) 494 else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ)
493 reason = "daemon sleep period out of range (1-15s)"; 495 reason = "daemon sleep period out of range";
494 else if (write_behind > COUNTER_MAX) 496 else if (write_behind > COUNTER_MAX)
495 reason = "write-behind limit out of range (0 - 16383)"; 497 reason = "write-behind limit out of range (0 - 16383)";
496 if (reason) { 498 if (reason) {
@@ -535,7 +537,7 @@ success:
535 bitmap->events_cleared = bitmap->mddev->events; 537 bitmap->events_cleared = bitmap->mddev->events;
536 err = 0; 538 err = 0;
537out: 539out:
538 kunmap(bitmap->sb_page); 540 kunmap_atomic(sb, KM_USER0);
539 if (err) 541 if (err)
540 bitmap_print_sb(bitmap); 542 bitmap_print_sb(bitmap);
541 return err; 543 return err;
@@ -558,9 +560,9 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
558 spin_unlock_irqrestore(&bitmap->lock, flags); 560 spin_unlock_irqrestore(&bitmap->lock, flags);
559 return; 561 return;
560 } 562 }
561 page_cache_get(bitmap->sb_page); 563 get_page(bitmap->sb_page);
562 spin_unlock_irqrestore(&bitmap->lock, flags); 564 spin_unlock_irqrestore(&bitmap->lock, flags);
563 sb = (bitmap_super_t *)kmap(bitmap->sb_page); 565 sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
564 switch (op) { 566 switch (op) {
565 case MASK_SET: sb->state |= bits; 567 case MASK_SET: sb->state |= bits;
566 break; 568 break;
@@ -568,8 +570,8 @@ static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
568 break; 570 break;
569 default: BUG(); 571 default: BUG();
570 } 572 }
571 kunmap(bitmap->sb_page); 573 kunmap_atomic(sb, KM_USER0);
572 page_cache_release(bitmap->sb_page); 574 put_page(bitmap->sb_page);
573} 575}
574 576
575/* 577/*
@@ -622,12 +624,11 @@ static void bitmap_file_unmap(struct bitmap *bitmap)
622 624
623 while (pages--) 625 while (pages--)
624 if (map[pages]->index != 0) /* 0 is sb_page, release it below */ 626 if (map[pages]->index != 0) /* 0 is sb_page, release it below */
625 page_cache_release(map[pages]); 627 put_page(map[pages]);
626 kfree(map); 628 kfree(map);
627 kfree(attr); 629 kfree(attr);
628 630
629 if (sb_page) 631 safe_put_page(sb_page);
630 page_cache_release(sb_page);
631} 632}
632 633
633static void bitmap_stop_daemon(struct bitmap *bitmap); 634static void bitmap_stop_daemon(struct bitmap *bitmap);
@@ -654,7 +655,7 @@ static void drain_write_queues(struct bitmap *bitmap)
654 655
655 while ((item = dequeue_page(bitmap))) { 656 while ((item = dequeue_page(bitmap))) {
656 /* don't bother to wait */ 657 /* don't bother to wait */
657 page_cache_release(item->page); 658 put_page(item->page);
658 mempool_free(item, bitmap->write_pool); 659 mempool_free(item, bitmap->write_pool);
659 } 660 }
660 661
@@ -763,7 +764,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
763 764
764 /* make sure the page stays cached until it gets written out */ 765 /* make sure the page stays cached until it gets written out */
765 if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY)) 766 if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
766 page_cache_get(page); 767 get_page(page);
767 768
768 /* set the bit */ 769 /* set the bit */
769 kaddr = kmap_atomic(page, KM_USER0); 770 kaddr = kmap_atomic(page, KM_USER0);
@@ -854,6 +855,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
854 unsigned long bytes, offset, dummy; 855 unsigned long bytes, offset, dummy;
855 int outofdate; 856 int outofdate;
856 int ret = -ENOSPC; 857 int ret = -ENOSPC;
858 void *paddr;
857 859
858 chunks = bitmap->chunks; 860 chunks = bitmap->chunks;
859 file = bitmap->file; 861 file = bitmap->file;
@@ -887,12 +889,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
887 if (!bitmap->filemap) 889 if (!bitmap->filemap)
888 goto out; 890 goto out;
889 891
890 bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL); 892 bitmap->filemap_attr = kzalloc(sizeof(long) * num_pages, GFP_KERNEL);
891 if (!bitmap->filemap_attr) 893 if (!bitmap->filemap_attr)
892 goto out; 894 goto out;
893 895
894 memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages);
895
896 oldindex = ~0L; 896 oldindex = ~0L;
897 897
898 for (i = 0; i < chunks; i++) { 898 for (i = 0; i < chunks; i++) {
@@ -901,8 +901,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
901 bit = file_page_offset(i); 901 bit = file_page_offset(i);
902 if (index != oldindex) { /* this is a new page, read it in */ 902 if (index != oldindex) { /* this is a new page, read it in */
903 /* unmap the old page, we're done with it */ 903 /* unmap the old page, we're done with it */
904 if (oldpage != NULL)
905 kunmap(oldpage);
906 if (index == 0) { 904 if (index == 0) {
907 /* 905 /*
908 * if we're here then the superblock page 906 * if we're here then the superblock page
@@ -925,30 +923,32 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 
 			oldindex = index;
 			oldpage = page;
-			kmap(page);
 
 			if (outofdate) {
 				/*
 				 * if bitmap is out of date, dirty the
 				 * whole page and write it out
 				 */
-				memset(page_address(page) + offset, 0xff,
+				paddr = kmap_atomic(page, KM_USER0);
+				memset(paddr + offset, 0xff,
 				       PAGE_SIZE - offset);
+				kunmap_atomic(paddr, KM_USER0);
 				ret = write_page(bitmap, page, 1);
 				if (ret) {
-					kunmap(page);
 					/* release, page not in filemap yet */
-					page_cache_release(page);
+					put_page(page);
 					goto out;
 				}
 			}
 
 			bitmap->filemap[bitmap->file_pages++] = page;
 		}
+		paddr = kmap_atomic(page, KM_USER0);
 		if (bitmap->flags & BITMAP_HOSTENDIAN)
-			b = test_bit(bit, page_address(page));
+			b = test_bit(bit, paddr);
 		else
-			b = ext2_test_bit(bit, page_address(page));
+			b = ext2_test_bit(bit, paddr);
+		kunmap_atomic(paddr, KM_USER0);
 		if (b) {
 			/* if the disk bit is set, set the memory bit */
 			bitmap_set_memory_bits(bitmap, i << CHUNK_BLOCK_SHIFT(bitmap),
@@ -963,9 +963,6 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
963 ret = 0; 963 ret = 0;
964 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); 964 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
965 965
966 if (page) /* unmap the last page */
967 kunmap(page);
968
969 if (bit_cnt) { /* Kick recovery if any bits were set */ 966 if (bit_cnt) { /* Kick recovery if any bits were set */
970 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); 967 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
971 md_wakeup_thread(bitmap->mddev->thread); 968 md_wakeup_thread(bitmap->mddev->thread);
@@ -1021,6 +1018,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1021 int err = 0; 1018 int err = 0;
1022 int blocks; 1019 int blocks;
1023 int attr; 1020 int attr;
1021 void *paddr;
1024 1022
1025 if (bitmap == NULL) 1023 if (bitmap == NULL)
1026 return 0; 1024 return 0;
@@ -1043,7 +1041,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1043 /* skip this page unless it's marked as needing cleaning */ 1041 /* skip this page unless it's marked as needing cleaning */
1044 if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) { 1042 if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
1045 if (attr & BITMAP_PAGE_NEEDWRITE) { 1043 if (attr & BITMAP_PAGE_NEEDWRITE) {
1046 page_cache_get(page); 1044 get_page(page);
1047 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); 1045 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1048 } 1046 }
1049 spin_unlock_irqrestore(&bitmap->lock, flags); 1047 spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1057,13 +1055,13 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1057 default: 1055 default:
1058 bitmap_file_kick(bitmap); 1056 bitmap_file_kick(bitmap);
1059 } 1057 }
1060 page_cache_release(page); 1058 put_page(page);
1061 } 1059 }
1062 continue; 1060 continue;
1063 } 1061 }
1064 1062
1065 /* grab the new page, sync and release the old */ 1063 /* grab the new page, sync and release the old */
1066 page_cache_get(page); 1064 get_page(page);
1067 if (lastpage != NULL) { 1065 if (lastpage != NULL) {
1068 if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) { 1066 if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
1069 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1067 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
@@ -1077,14 +1075,12 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1077 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1075 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1078 spin_unlock_irqrestore(&bitmap->lock, flags); 1076 spin_unlock_irqrestore(&bitmap->lock, flags);
1079 } 1077 }
1080 kunmap(lastpage); 1078 put_page(lastpage);
1081 page_cache_release(lastpage);
1082 if (err) 1079 if (err)
1083 bitmap_file_kick(bitmap); 1080 bitmap_file_kick(bitmap);
1084 } else 1081 } else
1085 spin_unlock_irqrestore(&bitmap->lock, flags); 1082 spin_unlock_irqrestore(&bitmap->lock, flags);
1086 lastpage = page; 1083 lastpage = page;
1087 kmap(page);
1088/* 1084/*
1089 printk("bitmap clean at page %lu\n", j); 1085 printk("bitmap clean at page %lu\n", j);
1090*/ 1086*/
@@ -1107,10 +1103,12 @@ int bitmap_daemon_work(struct bitmap *bitmap)
 						  -1);
 
 			/* clear the bit */
+			paddr = kmap_atomic(page, KM_USER0);
 			if (bitmap->flags & BITMAP_HOSTENDIAN)
-				clear_bit(file_page_offset(j), page_address(page));
+				clear_bit(file_page_offset(j), paddr);
 			else
-				ext2_clear_bit(file_page_offset(j), page_address(page));
+				ext2_clear_bit(file_page_offset(j), paddr);
+			kunmap_atomic(paddr, KM_USER0);
 		}
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
@@ -1118,7 +1116,6 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1118 1116
1119 /* now sync the final page */ 1117 /* now sync the final page */
1120 if (lastpage != NULL) { 1118 if (lastpage != NULL) {
1121 kunmap(lastpage);
1122 spin_lock_irqsave(&bitmap->lock, flags); 1119 spin_lock_irqsave(&bitmap->lock, flags);
1123 if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) { 1120 if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
1124 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); 1121 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
@@ -1133,7 +1130,7 @@ int bitmap_daemon_work(struct bitmap *bitmap)
1133 spin_unlock_irqrestore(&bitmap->lock, flags); 1130 spin_unlock_irqrestore(&bitmap->lock, flags);
1134 } 1131 }
1135 1132
1136 page_cache_release(lastpage); 1133 put_page(lastpage);
1137 } 1134 }
1138 1135
1139 return err; 1136 return err;
@@ -1184,7 +1181,7 @@ static void bitmap_writeback_daemon(mddev_t *mddev)
1184 PRINTK("finished page writeback: %p\n", page); 1181 PRINTK("finished page writeback: %p\n", page);
1185 1182
1186 err = PageError(page); 1183 err = PageError(page);
1187 page_cache_release(page); 1184 put_page(page);
1188 if (err) { 1185 if (err) {
1189 printk(KERN_WARNING "%s: bitmap file writeback " 1186 printk(KERN_WARNING "%s: bitmap file writeback "
1190 "failed (page %lu): %d\n", 1187 "failed (page %lu): %d\n",
@@ -1530,6 +1527,8 @@ void bitmap_destroy(mddev_t *mddev)
1530 return; 1527 return;
1531 1528
1532 mddev->bitmap = NULL; /* disconnect from the md device */ 1529 mddev->bitmap = NULL; /* disconnect from the md device */
1530 if (mddev->thread)
1531 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1533 1532
1534 bitmap_free(bitmap); 1533 bitmap_free(bitmap);
1535} 1534}
@@ -1555,12 +1554,10 @@ int bitmap_create(mddev_t *mddev)
1555 1554
1556 BUG_ON(file && mddev->bitmap_offset); 1555 BUG_ON(file && mddev->bitmap_offset);
1557 1556
1558 bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL); 1557 bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
1559 if (!bitmap) 1558 if (!bitmap)
1560 return -ENOMEM; 1559 return -ENOMEM;
1561 1560
1562 memset(bitmap, 0, sizeof(*bitmap));
1563
1564 spin_lock_init(&bitmap->lock); 1561 spin_lock_init(&bitmap->lock);
1565 bitmap->mddev = mddev; 1562 bitmap->mddev = mddev;
1566 1563
@@ -1601,12 +1598,11 @@ int bitmap_create(mddev_t *mddev)
1601#ifdef INJECT_FATAL_FAULT_1 1598#ifdef INJECT_FATAL_FAULT_1
1602 bitmap->bp = NULL; 1599 bitmap->bp = NULL;
1603#else 1600#else
1604 bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); 1601 bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1605#endif 1602#endif
1606 err = -ENOMEM; 1603 err = -ENOMEM;
1607 if (!bitmap->bp) 1604 if (!bitmap->bp)
1608 goto error; 1605 goto error;
1609 memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
1610 1606
1611 bitmap->flags |= BITMAP_ACTIVE; 1607 bitmap->flags |= BITMAP_ACTIVE;
1612 1608
@@ -1636,6 +1632,8 @@ int bitmap_create(mddev_t *mddev)
1636 1632
1637 if (IS_ERR(bitmap->writeback_daemon)) 1633 if (IS_ERR(bitmap->writeback_daemon))
1638 return PTR_ERR(bitmap->writeback_daemon); 1634 return PTR_ERR(bitmap->writeback_daemon);
1635 mddev->thread->timeout = bitmap->daemon_sleep * HZ;
1636
1639 return bitmap_update_sb(bitmap); 1637 return bitmap_update_sb(bitmap);
1640 1638
1641 error: 1639 error:
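Three related conversions run through the bitmap changes above. page_cache_get()/page_cache_release() become get_page()/put_page(), and the PAGE_CACHE_* constants become PAGE_*; these are aliases with identical values, and the plain spellings fit here because the pages are addressed directly rather than through the page cache. More substantively, kmap() mappings that were held across long stretches of the daemon loop (kmap() can sleep and occupies a scarce global slot) are replaced by short-lived kmap_atomic() sections. The shape of the new pattern, as a sketch:

	#include <linux/highmem.h>
	#include <linux/string.h>

	/* Dirty the tail of a possibly-highmem page without holding a
	 * mapping across anything that can sleep. */
	static void dirty_page_tail(struct page *page, unsigned long offset)
	{
		void *paddr = kmap_atomic(page, KM_USER0);

		memset(paddr + offset, 0xff, PAGE_SIZE - offset);
		kunmap_atomic(paddr, KM_USER0);
		/* safe to sleep (e.g. in write_page()) from here on */
	}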
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index cf6631056683..a601a427885c 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -690,6 +690,8 @@ bad3:
690bad2: 690bad2:
691 crypto_free_tfm(tfm); 691 crypto_free_tfm(tfm);
692bad1: 692bad1:
693 /* Must zero key material before freeing */
694 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
693 kfree(cc); 695 kfree(cc);
694 return -EINVAL; 696 return -EINVAL;
695} 697}
@@ -706,6 +708,9 @@ static void crypt_dtr(struct dm_target *ti)
706 cc->iv_gen_ops->dtr(cc); 708 cc->iv_gen_ops->dtr(cc);
707 crypto_free_tfm(cc->tfm); 709 crypto_free_tfm(cc->tfm);
708 dm_put_device(ti, cc->dev); 710 dm_put_device(ti, cc->dev);
711
712 /* Must zero key material before freeing */
713 memset(cc, 0, sizeof(*cc) + cc->key_size * sizeof(u8));
709 kfree(cc); 714 kfree(cc);
710} 715}
711 716
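Wiping the config structure before kfree() keeps raw key material from lingering in freed memory where a later allocation could observe it. A sketch of the pattern, assuming a config struct that carries its key bytes in a flexible tail the way dm-crypt's does (the names here are illustrative):

	#include <linux/slab.h>
	#include <linux/string.h>
	#include <linux/types.h>

	struct crypt_config_sketch {
		unsigned int key_size;
		u8 key[0];		/* key_size bytes follow */
	};

	static void wipe_and_free(struct crypt_config_sketch *cc)
	{
		/* Must zero key material before freeing; the length
		 * has to match the allocation (struct plus tail) or
		 * part of the key survives the wipe. */
		memset(cc, 0, sizeof(*cc) + cc->key_size);
		kfree(cc);
	}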
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h
index 1a77f3265706..f9035bfd1a9f 100644
--- a/drivers/md/dm-io.h
+++ b/drivers/md/dm-io.h
@@ -9,9 +9,6 @@
9 9
10#include "dm.h" 10#include "dm.h"
11 11
12/* FIXME make this configurable */
13#define DM_MAX_IO_REGIONS 8
14
15struct io_region { 12struct io_region {
16 struct block_device *bdev; 13 struct block_device *bdev;
17 sector_t sector; 14 sector_t sector;
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 07d44e19536e..561bda5011e0 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -270,6 +270,7 @@ static int dm_hash_rename(const char *old, const char *new)
270{ 270{
271 char *new_name, *old_name; 271 char *new_name, *old_name;
272 struct hash_cell *hc; 272 struct hash_cell *hc;
273 struct dm_table *table;
273 274
274 /* 275 /*
275 * duplicate new. 276 * duplicate new.
@@ -317,6 +318,15 @@ static int dm_hash_rename(const char *old, const char *new)
317 /* rename the device node in devfs */ 318 /* rename the device node in devfs */
318 register_with_devfs(hc); 319 register_with_devfs(hc);
319 320
321 /*
322 * Wake up any dm event waiters.
323 */
324 table = dm_get_table(hc->md);
325 if (table) {
326 dm_table_event(table);
327 dm_table_put(table);
328 }
329
320 up_write(&_hash_lock); 330 up_write(&_hash_lock);
321 kfree(old_name); 331 kfree(old_name);
322 return 0; 332 return 0;
@@ -683,14 +693,18 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
683static int do_suspend(struct dm_ioctl *param) 693static int do_suspend(struct dm_ioctl *param)
684{ 694{
685 int r = 0; 695 int r = 0;
696 int do_lockfs = 1;
686 struct mapped_device *md; 697 struct mapped_device *md;
687 698
688 md = find_device(param); 699 md = find_device(param);
689 if (!md) 700 if (!md)
690 return -ENXIO; 701 return -ENXIO;
691 702
703 if (param->flags & DM_SKIP_LOCKFS_FLAG)
704 do_lockfs = 0;
705
692 if (!dm_suspended(md)) 706 if (!dm_suspended(md))
693 r = dm_suspend(md); 707 r = dm_suspend(md, do_lockfs);
694 708
695 if (!r) 709 if (!r)
696 r = __dev_status(md, param); 710 r = __dev_status(md, param);
@@ -702,6 +716,7 @@ static int do_suspend(struct dm_ioctl *param)
702static int do_resume(struct dm_ioctl *param) 716static int do_resume(struct dm_ioctl *param)
703{ 717{
704 int r = 0; 718 int r = 0;
719 int do_lockfs = 1;
705 struct hash_cell *hc; 720 struct hash_cell *hc;
706 struct mapped_device *md; 721 struct mapped_device *md;
707 struct dm_table *new_map; 722 struct dm_table *new_map;
@@ -727,8 +742,10 @@ static int do_resume(struct dm_ioctl *param)
727 /* Do we need to load a new map ? */ 742 /* Do we need to load a new map ? */
728 if (new_map) { 743 if (new_map) {
729 /* Suspend if it isn't already suspended */ 744 /* Suspend if it isn't already suspended */
745 if (param->flags & DM_SKIP_LOCKFS_FLAG)
746 do_lockfs = 0;
730 if (!dm_suspended(md)) 747 if (!dm_suspended(md))
731 dm_suspend(md); 748 dm_suspend(md, do_lockfs);
732 749
733 r = dm_swap_table(md, new_map); 750 r = dm_swap_table(md, new_map);
734 if (r) { 751 if (r) {
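The do_lockfs plumbing gives userspace a way to skip the filesystem freeze during suspend; freezing remains the default, so existing tools keep their old behaviour. The decode step both ioctl paths now share, as a sketch:

	/* DM_SKIP_LOCKFS_FLAG arrives in param->flags; its absence
	 * preserves the old always-freeze semantics. */
	int do_lockfs = !(param->flags & DM_SKIP_LOCKFS_FLAG);

	if (!dm_suspended(md))
		r = dm_suspend(md, do_lockfs);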
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index a76349cb10a5..efe4adf78530 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -573,7 +573,7 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region)
573 lc->sync_search); 573 lc->sync_search);
574 lc->sync_search = *region + 1; 574 lc->sync_search = *region + 1;
575 575
576 if (*region == lc->region_count) 576 if (*region >= lc->region_count)
577 return 0; 577 return 0;
578 578
579 } while (log_test_bit(lc->recovering_bits, *region)); 579 } while (log_test_bit(lc->recovering_bits, *region));
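The one-character fix above matters because find_next_zero_bit() signals "no zero bit found" with a return value that is at least the size it was given, and depending on how the bitmap rounds up to word boundaries it can exceed it, so the exact == test could miss the end condition. The corrected shape, sketched with the field names visible in the hunk (the first argument to find_next_zero_bit() is not shown there and is assumed here):

	*region = find_next_zero_bit(lc->sync_bits, lc->region_count,
				     lc->sync_search);
	lc->sync_search = *region + 1;

	if (*region >= lc->region_count)	/* nothing left to resync */
		return 0;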
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 6b0fc1670929..6cfa8d435d55 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -562,6 +562,8 @@ struct mirror_set {
562 region_t nr_regions; 562 region_t nr_regions;
563 int in_sync; 563 int in_sync;
564 564
565 struct mirror *default_mirror; /* Default mirror */
566
565 unsigned int nr_mirrors; 567 unsigned int nr_mirrors;
566 struct mirror mirror[0]; 568 struct mirror mirror[0];
567}; 569};
@@ -611,7 +613,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
611 unsigned long flags = 0; 613 unsigned long flags = 0;
612 614
613 /* fill in the source */ 615 /* fill in the source */
614 m = ms->mirror + DEFAULT_MIRROR; 616 m = ms->default_mirror;
615 from.bdev = m->dev->bdev; 617 from.bdev = m->dev->bdev;
616 from.sector = m->offset + region_to_sector(reg->rh, reg->key); 618 from.sector = m->offset + region_to_sector(reg->rh, reg->key);
617 if (reg->key == (ms->nr_regions - 1)) { 619 if (reg->key == (ms->nr_regions - 1)) {
@@ -627,7 +629,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
627 629
628 /* fill in the destinations */ 630 /* fill in the destinations */
629 for (i = 0, dest = to; i < ms->nr_mirrors; i++) { 631 for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
630 if (i == DEFAULT_MIRROR) 632 if (&ms->mirror[i] == ms->default_mirror)
631 continue; 633 continue;
632 634
633 m = ms->mirror + i; 635 m = ms->mirror + i;
@@ -682,7 +684,7 @@ static void do_recovery(struct mirror_set *ms)
682static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) 684static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
683{ 685{
684 /* FIXME: add read balancing */ 686 /* FIXME: add read balancing */
685 return ms->mirror + DEFAULT_MIRROR; 687 return ms->default_mirror;
686} 688}
687 689
688/* 690/*
@@ -709,7 +711,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
709 if (rh_in_sync(&ms->rh, region, 0)) 711 if (rh_in_sync(&ms->rh, region, 0))
710 m = choose_mirror(ms, bio->bi_sector); 712 m = choose_mirror(ms, bio->bi_sector);
711 else 713 else
712 m = ms->mirror + DEFAULT_MIRROR; 714 m = ms->default_mirror;
713 715
714 map_bio(ms, m, bio); 716 map_bio(ms, m, bio);
715 generic_make_request(bio); 717 generic_make_request(bio);
@@ -833,7 +835,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
833 rh_delay(&ms->rh, bio); 835 rh_delay(&ms->rh, bio);
834 836
835 while ((bio = bio_list_pop(&nosync))) { 837 while ((bio = bio_list_pop(&nosync))) {
836 map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio); 838 map_bio(ms, ms->default_mirror, bio);
837 generic_make_request(bio); 839 generic_make_request(bio);
838 } 840 }
839} 841}
@@ -900,6 +902,7 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
900 ms->nr_mirrors = nr_mirrors; 902 ms->nr_mirrors = nr_mirrors;
901 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 903 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
902 ms->in_sync = 0; 904 ms->in_sync = 0;
905 ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
903 906
904 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { 907 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
905 ti->error = "dm-mirror: Error creating dirty region hash"; 908 ti->error = "dm-mirror: Error creating dirty region hash";
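Storing a default_mirror pointer in the mirror set, instead of hard-coding the DEFAULT_MIRROR array index at every use, keeps the choice of primary leg in one place and leaves room to switch it at run time. The cost is that "skip the source leg" checks now compare pointers rather than indices, as in this sketch (add_destination is a made-up placeholder):

	unsigned int i;

	for (i = 0; i < ms->nr_mirrors; i++) {
		if (&ms->mirror[i] == ms->default_mirror)
			continue;	/* the source leg, not a target */
		add_destination(&ms->mirror[i]);
	}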
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index ab54f99b7c3b..4b9dd8fb1e5c 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -371,6 +371,20 @@ static inline ulong round_up(ulong n, ulong size)
371 return (n + size) & ~size; 371 return (n + size) & ~size;
372} 372}
373 373
374static void read_snapshot_metadata(struct dm_snapshot *s)
375{
376 if (s->have_metadata)
377 return;
378
379 if (s->store.read_metadata(&s->store)) {
380 down_write(&s->lock);
381 s->valid = 0;
382 up_write(&s->lock);
383 }
384
385 s->have_metadata = 1;
386}
387
374/* 388/*
375 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size> 389 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
376 */ 390 */
@@ -848,16 +862,7 @@ static void snapshot_resume(struct dm_target *ti)
848{ 862{
849 struct dm_snapshot *s = (struct dm_snapshot *) ti->private; 863 struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
850 864
851 if (s->have_metadata) 865 read_snapshot_metadata(s);
852 return;
853
854 if (s->store.read_metadata(&s->store)) {
855 down_write(&s->lock);
856 s->valid = 0;
857 up_write(&s->lock);
858 }
859
860 s->have_metadata = 1;
861} 866}
862 867
863static int snapshot_status(struct dm_target *ti, status_type_t type, 868static int snapshot_status(struct dm_target *ti, status_type_t type,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 930b9fc27953..0e481512f918 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -55,6 +55,7 @@ union map_info *dm_get_mapinfo(struct bio *bio)
55 */ 55 */
56#define DMF_BLOCK_IO 0 56#define DMF_BLOCK_IO 0
57#define DMF_SUSPENDED 1 57#define DMF_SUSPENDED 1
58#define DMF_FROZEN 2
58 59
59struct mapped_device { 60struct mapped_device {
60 struct rw_semaphore io_lock; 61 struct rw_semaphore io_lock;
@@ -97,7 +98,7 @@ struct mapped_device {
97 * freeze/thaw support require holding onto a super block 98 * freeze/thaw support require holding onto a super block
98 */ 99 */
99 struct super_block *frozen_sb; 100 struct super_block *frozen_sb;
100 struct block_device *frozen_bdev; 101 struct block_device *suspended_bdev;
101}; 102};
102 103
103#define MIN_IOS 256 104#define MIN_IOS 256
@@ -836,9 +837,9 @@ static void __set_size(struct mapped_device *md, sector_t size)
836{ 837{
837 set_capacity(md->disk, size); 838 set_capacity(md->disk, size);
838 839
839 down(&md->frozen_bdev->bd_inode->i_sem); 840 down(&md->suspended_bdev->bd_inode->i_sem);
840 i_size_write(md->frozen_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 841 i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
841 up(&md->frozen_bdev->bd_inode->i_sem); 842 up(&md->suspended_bdev->bd_inode->i_sem);
842} 843}
843 844
844static int __bind(struct mapped_device *md, struct dm_table *t) 845static int __bind(struct mapped_device *md, struct dm_table *t)
@@ -902,10 +903,9 @@ int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
902 return create_aux(minor, 1, result); 903 return create_aux(minor, 1, result);
903} 904}
904 905
905void *dm_get_mdptr(dev_t dev) 906static struct mapped_device *dm_find_md(dev_t dev)
906{ 907{
907 struct mapped_device *md; 908 struct mapped_device *md;
908 void *mdptr = NULL;
909 unsigned minor = MINOR(dev); 909 unsigned minor = MINOR(dev);
910 910
911 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 911 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
@@ -914,12 +914,32 @@ void *dm_get_mdptr(dev_t dev)
 	down(&_minor_lock);
 
 	md = idr_find(&_minor_idr, minor);
-
-	if (md && (dm_disk(md)->first_minor == minor))
-		mdptr = md->interface_ptr;
+	if (!md || (dm_disk(md)->first_minor != minor))
+		md = NULL;
 
 	up(&_minor_lock);
 
+	return md;
+}
+
+struct mapped_device *dm_get_md(dev_t dev)
+{
+	struct mapped_device *md = dm_find_md(dev);
+
+	if (md)
+		dm_get(md);
+
+	return md;
+}
+
+void *dm_get_mdptr(dev_t dev)
+{
+	struct mapped_device *md;
+	void *mdptr = NULL;
+
+	md = dm_find_md(dev);
+	if (md)
+		mdptr = md->interface_ptr;
 	return mdptr;
 }
 
@@ -991,43 +1011,33 @@ out:
  */
 static int lock_fs(struct mapped_device *md)
 {
-	int r = -ENOMEM;
-
-	md->frozen_bdev = bdget_disk(md->disk, 0);
-	if (!md->frozen_bdev) {
-		DMWARN("bdget failed in lock_fs");
-		goto out;
-	}
+	int r;
 
 	WARN_ON(md->frozen_sb);
 
-	md->frozen_sb = freeze_bdev(md->frozen_bdev);
+	md->frozen_sb = freeze_bdev(md->suspended_bdev);
 	if (IS_ERR(md->frozen_sb)) {
 		r = PTR_ERR(md->frozen_sb);
-		goto out_bdput;
+		md->frozen_sb = NULL;
+		return r;
 	}
 
+	set_bit(DMF_FROZEN, &md->flags);
+
 	/* don't bdput right now, we don't want the bdev
-	 * to go away while it is locked. We'll bdput
-	 * in unlock_fs
+	 * to go away while it is locked.
 	 */
 	return 0;
-
-out_bdput:
-	bdput(md->frozen_bdev);
-	md->frozen_sb = NULL;
-	md->frozen_bdev = NULL;
-out:
-	return r;
 }
 
 static void unlock_fs(struct mapped_device *md)
 {
-	thaw_bdev(md->frozen_bdev, md->frozen_sb);
-	bdput(md->frozen_bdev);
+	if (!test_bit(DMF_FROZEN, &md->flags))
+		return;
 
+	thaw_bdev(md->suspended_bdev, md->frozen_sb);
 	md->frozen_sb = NULL;
-	md->frozen_bdev = NULL;
+	clear_bit(DMF_FROZEN, &md->flags);
 }
 
 /*
@@ -1037,7 +1047,7 @@ static void unlock_fs(struct mapped_device *md)
1037 * dm_bind_table, dm_suspend must be called to flush any in 1047 * dm_bind_table, dm_suspend must be called to flush any in
1038 * flight bios and ensure that any further io gets deferred. 1048 * flight bios and ensure that any further io gets deferred.
1039 */ 1049 */
1040int dm_suspend(struct mapped_device *md) 1050int dm_suspend(struct mapped_device *md, int do_lockfs)
1041{ 1051{
1042 struct dm_table *map = NULL; 1052 struct dm_table *map = NULL;
1043 DECLARE_WAITQUEUE(wait, current); 1053 DECLARE_WAITQUEUE(wait, current);
@@ -1053,10 +1063,19 @@ int dm_suspend(struct mapped_device *md)
 	/* This does not get reverted if there's an error later. */
 	dm_table_presuspend_targets(map);
 
-	/* Flush I/O to the device. */
-	r = lock_fs(md);
-	if (r)
+	md->suspended_bdev = bdget_disk(md->disk, 0);
+	if (!md->suspended_bdev) {
+		DMWARN("bdget failed in dm_suspend");
+		r = -ENOMEM;
 		goto out;
+	}
+
+	/* Flush I/O to the device. */
+	if (do_lockfs) {
+		r = lock_fs(md);
+		if (r)
+			goto out;
+	}
 
 	/*
 	 * First we set the BLOCK_IO flag so no more ios will be mapped.
@@ -1105,6 +1124,11 @@ int dm_suspend(struct mapped_device *md)
1105 r = 0; 1124 r = 0;
1106 1125
1107out: 1126out:
1127 if (r && md->suspended_bdev) {
1128 bdput(md->suspended_bdev);
1129 md->suspended_bdev = NULL;
1130 }
1131
1108 dm_table_put(map); 1132 dm_table_put(map);
1109 up(&md->suspend_lock); 1133 up(&md->suspend_lock);
1110 return r; 1134 return r;
@@ -1135,6 +1159,9 @@ int dm_resume(struct mapped_device *md)
1135 1159
1136 unlock_fs(md); 1160 unlock_fs(md);
1137 1161
1162 bdput(md->suspended_bdev);
1163 md->suspended_bdev = NULL;
1164
1138 clear_bit(DMF_SUSPENDED, &md->flags); 1165 clear_bit(DMF_SUSPENDED, &md->flags);
1139 1166
1140 dm_table_unplug_all(map); 1167 dm_table_unplug_all(map);
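[Editor's note: the reworked dm.c suspend path above takes the block-device reference in dm_suspend() itself, freezes the filesystem only when do_lockfs is set, and drops the reference on every failure path; unlock_fs() now thaws only if DMF_FROZEN was recorded. A minimal userspace sketch of the same shape, with hypothetical stub names (get_ref/put_ref/freeze/thaw standing in for bdget_disk/bdput/freeze_bdev/thaw_bdev):

    #include <stdio.h>
    #include <stdlib.h>

    struct ref { int frozen; };                    /* toy mapped_device */

    static struct ref *get_ref(void)   { return calloc(1, sizeof(struct ref)); }
    static void put_ref(struct ref *r) { free(r); }
    static int  freeze(struct ref *r)  { r->frozen = 1; return 0; /* or -errno */ }
    static void thaw(struct ref *r)    { r->frozen = 0; }

    /* Mirrors dm_suspend(): take the reference first, freeze only when
     * asked, and release the reference on any failure so the caller
     * never sees a half-initialised state. */
    static int suspend(struct ref **out, int do_lockfs)
    {
        int r = 0;
        struct ref *ref = get_ref();

        if (!ref)
            return -1;                  /* -ENOMEM in the kernel */
        if (do_lockfs)
            r = freeze(ref);
        if (r) {
            put_ref(ref);               /* the new out-label cleanup */
            return r;
        }
        *out = ref;
        return 0;
    }

    /* Mirrors unlock_fs()+dm_resume(): thaw only if frozen, then release. */
    static void resume(struct ref *ref)
    {
        if (ref->frozen)
            thaw(ref);
        put_ref(ref);
    }

    int main(void)
    {
        struct ref *ref;

        if (suspend(&ref, 1) == 0) {
            resume(ref);
            puts("suspend/resume cycle ok");
        }
        return 0;
    }
]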
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e38c3fc1a1db..4eaf075da217 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -28,7 +28,7 @@
28 * in types.h. 28 * in types.h.
29 */ 29 */
30#ifdef CONFIG_LBD 30#ifdef CONFIG_LBD
31#define SECTOR_FORMAT "%Lu" 31#define SECTOR_FORMAT "%llu"
32#else 32#else
33#define SECTOR_FORMAT "%lu" 33#define SECTOR_FORMAT "%lu"
34#endif 34#endif
@@ -58,6 +58,7 @@ int dm_create(struct mapped_device **md);
58int dm_create_with_minor(unsigned int minor, struct mapped_device **md); 58int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
59void dm_set_mdptr(struct mapped_device *md, void *ptr); 59void dm_set_mdptr(struct mapped_device *md, void *ptr);
60void *dm_get_mdptr(dev_t dev); 60void *dm_get_mdptr(dev_t dev);
61struct mapped_device *dm_get_md(dev_t dev);
61 62
62/* 63/*
63 * Reference counting for md. 64 * Reference counting for md.
@@ -68,7 +69,7 @@ void dm_put(struct mapped_device *md);
68/* 69/*
69 * A device can still be used while suspended, but I/O is deferred. 70 * A device can still be used while suspended, but I/O is deferred.
70 */ 71 */
71int dm_suspend(struct mapped_device *md); 72int dm_suspend(struct mapped_device *md, int with_lockfs);
72int dm_resume(struct mapped_device *md); 73int dm_resume(struct mapped_device *md);
73 74
74/* 75/*
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 0248f8e7eac0..a7a5ab554338 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -316,9 +316,10 @@ static int stop(mddev_t *mddev)
316 return 0; 316 return 0;
317} 317}
318 318
319static mdk_personality_t faulty_personality = 319static struct mdk_personality faulty_personality =
320{ 320{
321 .name = "faulty", 321 .name = "faulty",
322 .level = LEVEL_FAULTY,
322 .owner = THIS_MODULE, 323 .owner = THIS_MODULE,
323 .make_request = make_request, 324 .make_request = make_request,
324 .run = run, 325 .run = run,
@@ -329,15 +330,17 @@ static mdk_personality_t faulty_personality =
329 330
330static int __init raid_init(void) 331static int __init raid_init(void)
331{ 332{
332 return register_md_personality(FAULTY, &faulty_personality); 333 return register_md_personality(&faulty_personality);
333} 334}
334 335
335static void raid_exit(void) 336static void raid_exit(void)
336{ 337{
337 unregister_md_personality(FAULTY); 338 unregister_md_personality(&faulty_personality);
338} 339}
339 340
340module_init(raid_init); 341module_init(raid_init);
341module_exit(raid_exit); 342module_exit(raid_exit);
342MODULE_LICENSE("GPL"); 343MODULE_LICENSE("GPL");
343MODULE_ALIAS("md-personality-10"); /* faulty */ 344MODULE_ALIAS("md-personality-10"); /* faulty */
345MODULE_ALIAS("md-faulty");
346MODULE_ALIAS("md-level--5");
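[Editor's note: with personalities now looked up by level, module autoloading (see the request_module() change in md.c further down) builds alias strings of the form "md-level-%d". For faulty, whose level is negative (the alias above implies LEVEL_FAULTY == -5), the rendered string carries a double dash, which explains the odd-looking MODULE_ALIAS("md-level--5"). A two-line check:

    #include <stdio.h>

    int main(void)
    {
        /* request_module("md-level-%d", level) produces these strings: */
        printf("md-level-%d\n", -5);    /* md-level--5  (faulty) */
        printf("md-level-%d\n", 1);     /* md-level-1   (raid1)  */
        return 0;
    }
]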
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
index eb7036485975..ca99979c868a 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/kcopyd.c
@@ -561,11 +561,13 @@ int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
561 * Cancels a kcopyd job, eg. someone might be deactivating a 561 * Cancels a kcopyd job, eg. someone might be deactivating a
562 * mirror. 562 * mirror.
563 */ 563 */
564#if 0
564int kcopyd_cancel(struct kcopyd_job *job, int block) 565int kcopyd_cancel(struct kcopyd_job *job, int block)
565{ 566{
566 /* FIXME: finish */ 567 /* FIXME: finish */
567 return -1; 568 return -1;
568} 569}
570#endif /* 0 */
569 571
570/*----------------------------------------------------------------- 572/*-----------------------------------------------------------------
571 * Unit setup 573 * Unit setup
@@ -684,4 +686,3 @@ void kcopyd_client_destroy(struct kcopyd_client *kc)
684EXPORT_SYMBOL(kcopyd_client_create); 686EXPORT_SYMBOL(kcopyd_client_create);
685EXPORT_SYMBOL(kcopyd_client_destroy); 687EXPORT_SYMBOL(kcopyd_client_destroy);
686EXPORT_SYMBOL(kcopyd_copy); 688EXPORT_SYMBOL(kcopyd_copy);
687EXPORT_SYMBOL(kcopyd_cancel);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 946efef3a8f5..777585458c85 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -121,11 +121,10 @@ static int linear_run (mddev_t *mddev)
121 sector_t curr_offset; 121 sector_t curr_offset;
122 struct list_head *tmp; 122 struct list_head *tmp;
123 123
124 conf = kmalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t), 124 conf = kzalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t),
125 GFP_KERNEL); 125 GFP_KERNEL);
126 if (!conf) 126 if (!conf)
127 goto out; 127 goto out;
128 memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
129 mddev->private = conf; 128 mddev->private = conf;
130 129
131 cnt = 0; 130 cnt = 0;
@@ -352,9 +351,10 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev)
352} 351}
353 352
354 353
355static mdk_personality_t linear_personality= 354static struct mdk_personality linear_personality =
356{ 355{
357 .name = "linear", 356 .name = "linear",
357 .level = LEVEL_LINEAR,
358 .owner = THIS_MODULE, 358 .owner = THIS_MODULE,
359 .make_request = linear_make_request, 359 .make_request = linear_make_request,
360 .run = linear_run, 360 .run = linear_run,
@@ -364,16 +364,18 @@ static mdk_personality_t linear_personality=
364 364
365static int __init linear_init (void) 365static int __init linear_init (void)
366{ 366{
367 return register_md_personality (LINEAR, &linear_personality); 367 return register_md_personality (&linear_personality);
368} 368}
369 369
370static void linear_exit (void) 370static void linear_exit (void)
371{ 371{
372 unregister_md_personality (LINEAR); 372 unregister_md_personality (&linear_personality);
373} 373}
374 374
375 375
376module_init(linear_init); 376module_init(linear_init);
377module_exit(linear_exit); 377module_exit(linear_exit);
378MODULE_LICENSE("GPL"); 378MODULE_LICENSE("GPL");
379MODULE_ALIAS("md-personality-1"); /* LINEAR */ 379MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
380MODULE_ALIAS("md-linear");
381MODULE_ALIAS("md-level--1");
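[Editor's note: this hunk and several below (mddev_find, md_import_device, md_register_thread, multipath) replace the kmalloc()+memset() pair with kzalloc(), which returns already-zeroed memory and cannot fall out of sync with the allocation size the way a hand-written memset length can. The userspace analogue, for illustration:

    #include <stdlib.h>
    #include <string.h>

    struct conf { int cnt; void *devs[8]; };

    int main(void)
    {
        /* Old style, as removed by the patch: allocate, then zero. */
        struct conf *a = malloc(sizeof(*a));
        if (a)
            memset(a, 0, sizeof(*a));

        /* New style: one call returning zeroed memory, the analogue
         * of kzalloc(size, GFP_KERNEL). */
        struct conf *b = calloc(1, sizeof(*b));

        free(a);
        free(b);
        return 0;
    }
]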
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8175a2a222da..1b76fb29fb70 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -42,6 +42,7 @@
42#include <linux/devfs_fs_kernel.h> 42#include <linux/devfs_fs_kernel.h>
43#include <linux/buffer_head.h> /* for invalidate_bdev */ 43#include <linux/buffer_head.h> /* for invalidate_bdev */
44#include <linux/suspend.h> 44#include <linux/suspend.h>
45#include <linux/poll.h>
45 46
46#include <linux/init.h> 47#include <linux/init.h>
47 48
@@ -67,7 +68,7 @@
67static void autostart_arrays (int part); 68static void autostart_arrays (int part);
68#endif 69#endif
69 70
70static mdk_personality_t *pers[MAX_PERSONALITY]; 71static LIST_HEAD(pers_list);
71static DEFINE_SPINLOCK(pers_lock); 72static DEFINE_SPINLOCK(pers_lock);
72 73
73/* 74/*
@@ -80,10 +81,22 @@ static DEFINE_SPINLOCK(pers_lock);
80 * idle IO detection. 81 * idle IO detection.
81 * 82 *
82 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 83 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
84 * or /sys/block/mdX/md/sync_speed_{min,max}
83 */ 85 */
84 86
85static int sysctl_speed_limit_min = 1000; 87static int sysctl_speed_limit_min = 1000;
86static int sysctl_speed_limit_max = 200000; 88static int sysctl_speed_limit_max = 200000;
89static inline int speed_min(mddev_t *mddev)
90{
91 return mddev->sync_speed_min ?
92 mddev->sync_speed_min : sysctl_speed_limit_min;
93}
94
95static inline int speed_max(mddev_t *mddev)
96{
97 return mddev->sync_speed_max ?
98 mddev->sync_speed_max : sysctl_speed_limit_max;
99}
87 100
88static struct ctl_table_header *raid_table_header; 101static struct ctl_table_header *raid_table_header;
89 102
@@ -134,6 +147,24 @@ static struct block_device_operations md_fops;
134static int start_readonly; 147static int start_readonly;
135 148
136/* 149/*
150 * We have a system wide 'event count' that is incremented
151 * on any 'interesting' event, and readers of /proc/mdstat
152 * can use 'poll' or 'select' to find out when the event
153 * count increases.
154 *
155 * Events are:
156 * start array, stop array, error, add device, remove device,
157 * start build, activate spare
158 */
159static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
160static atomic_t md_event_count;
161static void md_new_event(mddev_t *mddev)
162{
163 atomic_inc(&md_event_count);
164 wake_up(&md_event_waiters);
165}
166
167/*
137 * Enables to iterate over all existing md arrays 168 * Enables to iterate over all existing md arrays
138 * all_mddevs_lock protects this list. 169 * all_mddevs_lock protects this list.
139 */ 170 */
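[Editor's note: this event counter is the kernel half of the new /proc/mdstat poll support; mdstat_poll() further down flags POLLERR|POLLPRI whenever md_event_count has moved past the snapshot taken at the last read. A sketch of the intended userspace loop (monitoring only; re-reading from offset 0 refreshes the snapshot and rearms the poll):

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[4096];
        int fd = open("/proc/mdstat", O_RDONLY);

        if (fd < 0)
            return 1;
        if (read(fd, buf, sizeof(buf)) < 0)     /* snapshot event count */
            return 1;

        for (;;) {
            struct pollfd pfd = { .fd = fd, .events = POLLPRI };

            if (poll(&pfd, 1, -1) < 0)
                break;
            if (pfd.revents & (POLLPRI | POLLERR)) {
                lseek(fd, 0, SEEK_SET);              /* rewind ...      */
                if (read(fd, buf, sizeof(buf)) < 0)  /* ... and rearm   */
                    break;
                printf("md event\n");
            }
        }
        close(fd);
        return 0;
    }
]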
@@ -209,12 +240,10 @@ static mddev_t * mddev_find(dev_t unit)
209 } 240 }
210 spin_unlock(&all_mddevs_lock); 241 spin_unlock(&all_mddevs_lock);
211 242
212 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL); 243 new = kzalloc(sizeof(*new), GFP_KERNEL);
213 if (!new) 244 if (!new)
214 return NULL; 245 return NULL;
215 246
216 memset(new, 0, sizeof(*new));
217
218 new->unit = unit; 247 new->unit = unit;
219 if (MAJOR(unit) == MD_MAJOR) 248 if (MAJOR(unit) == MD_MAJOR)
220 new->md_minor = MINOR(unit); 249 new->md_minor = MINOR(unit);
@@ -262,7 +291,7 @@ static inline void mddev_unlock(mddev_t * mddev)
262 md_wakeup_thread(mddev->thread); 291 md_wakeup_thread(mddev->thread);
263} 292}
264 293
265mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 294static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
266{ 295{
267 mdk_rdev_t * rdev; 296 mdk_rdev_t * rdev;
268 struct list_head *tmp; 297 struct list_head *tmp;
@@ -286,6 +315,18 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
286 return NULL; 315 return NULL;
287} 316}
288 317
318static struct mdk_personality *find_pers(int level, char *clevel)
319{
320 struct mdk_personality *pers;
321 list_for_each_entry(pers, &pers_list, list) {
322 if (level != LEVEL_NONE && pers->level == level)
323 return pers;
324 if (strcmp(pers->name, clevel)==0)
325 return pers;
326 }
327 return NULL;
328}
329
289static inline sector_t calc_dev_sboffset(struct block_device *bdev) 330static inline sector_t calc_dev_sboffset(struct block_device *bdev)
290{ 331{
291 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 332 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
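[Editor's note: find_pers() is the replacement for indexing the old pers[MAX_PERSONALITY] array: registered personalities now live on a list and are matched either by numeric level or, failing that, by name (the new clevel string). A self-contained model of that lookup, with a plain pointer chain standing in for the kernel's list_head and a made-up sentinel for LEVEL_NONE:

    #include <stdio.h>
    #include <string.h>

    #define LEVEL_NONE (-1000000)   /* sentinel meaning "no numeric level" */

    struct pers {
        const char *name;
        int level;
        struct pers *next;          /* stand-in for the kernel's list_head */
    };

    /* Same matching rule as find_pers(): a numeric level wins when one
     * is set; otherwise compare the textual name. */
    static struct pers *find_pers(struct pers *head, int level,
                                  const char *clevel)
    {
        struct pers *p;

        for (p = head; p; p = p->next) {
            if (level != LEVEL_NONE && p->level == level)
                return p;
            if (strcmp(p->name, clevel) == 0)
                return p;
        }
        return NULL;
    }

    int main(void)
    {
        struct pers linear = { "linear", -1, NULL };
        struct pers raid1  = { "raid1",   1, &linear };

        printf("%s\n", find_pers(&raid1, 1, "")->name);                /* raid1  */
        printf("%s\n", find_pers(&raid1, LEVEL_NONE, "linear")->name); /* linear */
        return 0;
    }
]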
@@ -320,7 +361,7 @@ static int alloc_disk_sb(mdk_rdev_t * rdev)
320static void free_disk_sb(mdk_rdev_t * rdev) 361static void free_disk_sb(mdk_rdev_t * rdev)
321{ 362{
322 if (rdev->sb_page) { 363 if (rdev->sb_page) {
323 page_cache_release(rdev->sb_page); 364 put_page(rdev->sb_page);
324 rdev->sb_loaded = 0; 365 rdev->sb_loaded = 0;
325 rdev->sb_page = NULL; 366 rdev->sb_page = NULL;
326 rdev->sb_offset = 0; 367 rdev->sb_offset = 0;
@@ -461,6 +502,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
461 bio_put(bio); 502 bio_put(bio);
462 return ret; 503 return ret;
463} 504}
505EXPORT_SYMBOL_GPL(sync_page_io);
464 506
465static int read_disk_sb(mdk_rdev_t * rdev, int size) 507static int read_disk_sb(mdk_rdev_t * rdev, int size)
466{ 508{
@@ -665,6 +707,10 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
665 } 707 }
666 rdev->size = calc_dev_size(rdev, sb->chunk_size); 708 rdev->size = calc_dev_size(rdev, sb->chunk_size);
667 709
710 if (rdev->size < sb->size && sb->level > 1)
711 /* "this cannot possibly happen" ... */
712 ret = -EINVAL;
713
668 abort: 714 abort:
669 return ret; 715 return ret;
670} 716}
@@ -688,6 +734,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
688 mddev->ctime = sb->ctime; 734 mddev->ctime = sb->ctime;
689 mddev->utime = sb->utime; 735 mddev->utime = sb->utime;
690 mddev->level = sb->level; 736 mddev->level = sb->level;
737 mddev->clevel[0] = 0;
691 mddev->layout = sb->layout; 738 mddev->layout = sb->layout;
692 mddev->raid_disks = sb->raid_disks; 739 mddev->raid_disks = sb->raid_disks;
693 mddev->size = sb->size; 740 mddev->size = sb->size;
@@ -714,9 +761,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
714 761
715 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 762 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
716 mddev->bitmap_file == NULL) { 763 mddev->bitmap_file == NULL) {
717 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) { 764 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
765 && mddev->level != 10) {
718 /* FIXME use a better test */ 766 /* FIXME use a better test */
719 printk(KERN_WARNING "md: bitmaps only support for raid1\n"); 767 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
720 return -EINVAL; 768 return -EINVAL;
721 } 769 }
722 mddev->bitmap_offset = mddev->default_bitmap_offset; 770 mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -968,6 +1016,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
968 } 1016 }
969 rdev->preferred_minor = 0xffff; 1017 rdev->preferred_minor = 0xffff;
970 rdev->data_offset = le64_to_cpu(sb->data_offset); 1018 rdev->data_offset = le64_to_cpu(sb->data_offset);
1019 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
971 1020
972 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1021 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
973 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; 1022 bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
@@ -1006,6 +1055,9 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1006 rdev->size = le64_to_cpu(sb->data_size)/2; 1055 rdev->size = le64_to_cpu(sb->data_size)/2;
1007 if (le32_to_cpu(sb->chunksize)) 1056 if (le32_to_cpu(sb->chunksize))
1008 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1); 1057 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
1058
1059 if (le32_to_cpu(sb->size) > rdev->size*2)
1060 return -EINVAL;
1009 return 0; 1061 return 0;
1010} 1062}
1011 1063
@@ -1023,6 +1075,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1023 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1075 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1024 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1076 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1025 mddev->level = le32_to_cpu(sb->level); 1077 mddev->level = le32_to_cpu(sb->level);
1078 mddev->clevel[0] = 0;
1026 mddev->layout = le32_to_cpu(sb->layout); 1079 mddev->layout = le32_to_cpu(sb->layout);
1027 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1080 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1028 mddev->size = le64_to_cpu(sb->size)/2; 1081 mddev->size = le64_to_cpu(sb->size)/2;
@@ -1037,8 +1090,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1037 1090
1038 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1091 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1039 mddev->bitmap_file == NULL ) { 1092 mddev->bitmap_file == NULL ) {
1040 if (mddev->level != 1) { 1093 if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
1041 printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); 1094 && mddev->level != 10) {
1095 printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
1042 return -EINVAL; 1096 return -EINVAL;
1043 } 1097 }
1044 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1098 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
@@ -1105,6 +1159,8 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1105 else 1159 else
1106 sb->resync_offset = cpu_to_le64(0); 1160 sb->resync_offset = cpu_to_le64(0);
1107 1161
1162 sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
1163
1108 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1164 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1109 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1165 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1110 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1166 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
@@ -1187,6 +1243,14 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1187 MD_BUG(); 1243 MD_BUG();
1188 return -EINVAL; 1244 return -EINVAL;
1189 } 1245 }
1246 /* make sure rdev->size exceeds mddev->size */
1247 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1248 if (mddev->pers)
1249 /* Cannot change size, so fail */
1250 return -ENOSPC;
1251 else
1252 mddev->size = rdev->size;
1253 }
1190 same_pdev = match_dev_unit(mddev, rdev); 1254 same_pdev = match_dev_unit(mddev, rdev);
1191 if (same_pdev) 1255 if (same_pdev)
1192 printk(KERN_WARNING 1256 printk(KERN_WARNING
@@ -1496,6 +1560,26 @@ repeat:
1496 1560
1497} 1561}
1498 1562
 1563/* words written to sysfs files may, or may not, be \n terminated.
 1564 * We want to accept either case. For this we use cmd_match.
1565 */
1566static int cmd_match(const char *cmd, const char *str)
1567{
1568 /* See if cmd, written into a sysfs file, matches
1569 * str. They must either be the same, or cmd can
1570 * have a trailing newline
1571 */
1572 while (*cmd && *str && *cmd == *str) {
1573 cmd++;
1574 str++;
1575 }
1576 if (*cmd == '\n')
1577 cmd++;
1578 if (*str || *cmd)
1579 return 0;
1580 return 1;
1581}
1582
1499struct rdev_sysfs_entry { 1583struct rdev_sysfs_entry {
1500 struct attribute attr; 1584 struct attribute attr;
1501 ssize_t (*show)(mdk_rdev_t *, char *); 1585 ssize_t (*show)(mdk_rdev_t *, char *);
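[Editor's note: cmd_match() exists because strings written to sysfs files arrive with or without a trailing newline depending on how they were written (echo appends one, echo -n does not), so the open-coded double strcmp() it replaces had to test both spellings every time. The function compiles unchanged in userspace; a quick behavioural check:

    #include <stdio.h>

    /* Verbatim logic of cmd_match() from the hunk above: the strings
     * must match exactly, except that cmd may carry one trailing
     * newline. */
    static int cmd_match(const char *cmd, const char *str)
    {
        while (*cmd && *str && *cmd == *str) {
            cmd++;
            str++;
        }
        if (*cmd == '\n')
            cmd++;
        if (*str || *cmd)
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d\n", cmd_match("idle\n", "idle")); /* 1: newline accepted */
        printf("%d\n", cmd_match("idle",   "idle")); /* 1: exact match      */
        printf("%d\n", cmd_match("idler",  "idle")); /* 0: trailing junk    */
        printf("%d\n", cmd_match("idl",    "idle")); /* 0: prefix only      */
        return 0;
    }
]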
@@ -1538,9 +1622,113 @@ super_show(mdk_rdev_t *rdev, char *page)
1538} 1622}
1539static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); 1623static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
1540 1624
1625static ssize_t
1626errors_show(mdk_rdev_t *rdev, char *page)
1627{
1628 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
1629}
1630
1631static ssize_t
1632errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1633{
1634 char *e;
1635 unsigned long n = simple_strtoul(buf, &e, 10);
1636 if (*buf && (*e == 0 || *e == '\n')) {
1637 atomic_set(&rdev->corrected_errors, n);
1638 return len;
1639 }
1640 return -EINVAL;
1641}
1642static struct rdev_sysfs_entry rdev_errors =
1643__ATTR(errors, 0644, errors_show, errors_store);
1644
1645static ssize_t
1646slot_show(mdk_rdev_t *rdev, char *page)
1647{
1648 if (rdev->raid_disk < 0)
1649 return sprintf(page, "none\n");
1650 else
1651 return sprintf(page, "%d\n", rdev->raid_disk);
1652}
1653
1654static ssize_t
1655slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1656{
1657 char *e;
1658 int slot = simple_strtoul(buf, &e, 10);
1659 if (strncmp(buf, "none", 4)==0)
1660 slot = -1;
1661 else if (e==buf || (*e && *e!= '\n'))
1662 return -EINVAL;
1663 if (rdev->mddev->pers)
1664 /* Cannot set slot in active array (yet) */
1665 return -EBUSY;
1666 if (slot >= rdev->mddev->raid_disks)
1667 return -ENOSPC;
1668 rdev->raid_disk = slot;
1669 /* assume it is working */
1670 rdev->flags = 0;
1671 set_bit(In_sync, &rdev->flags);
1672 return len;
1673}
1674
1675
1676static struct rdev_sysfs_entry rdev_slot =
1677__ATTR(slot, 0644, slot_show, slot_store);
1678
1679static ssize_t
1680offset_show(mdk_rdev_t *rdev, char *page)
1681{
1682 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
1683}
1684
1685static ssize_t
1686offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1687{
1688 char *e;
1689 unsigned long long offset = simple_strtoull(buf, &e, 10);
1690 if (e==buf || (*e && *e != '\n'))
1691 return -EINVAL;
1692 if (rdev->mddev->pers)
1693 return -EBUSY;
1694 rdev->data_offset = offset;
1695 return len;
1696}
1697
1698static struct rdev_sysfs_entry rdev_offset =
1699__ATTR(offset, 0644, offset_show, offset_store);
1700
1701static ssize_t
1702rdev_size_show(mdk_rdev_t *rdev, char *page)
1703{
1704 return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
1705}
1706
1707static ssize_t
1708rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1709{
1710 char *e;
1711 unsigned long long size = simple_strtoull(buf, &e, 10);
1712 if (e==buf || (*e && *e != '\n'))
1713 return -EINVAL;
1714 if (rdev->mddev->pers)
1715 return -EBUSY;
1716 rdev->size = size;
1717 if (size < rdev->mddev->size || rdev->mddev->size == 0)
1718 rdev->mddev->size = size;
1719 return len;
1720}
1721
1722static struct rdev_sysfs_entry rdev_size =
1723__ATTR(size, 0644, rdev_size_show, rdev_size_store);
1724
1541static struct attribute *rdev_default_attrs[] = { 1725static struct attribute *rdev_default_attrs[] = {
1542 &rdev_state.attr, 1726 &rdev_state.attr,
1543 &rdev_super.attr, 1727 &rdev_super.attr,
1728 &rdev_errors.attr,
1729 &rdev_slot.attr,
1730 &rdev_offset.attr,
1731 &rdev_size.attr,
1544 NULL, 1732 NULL,
1545}; 1733};
1546static ssize_t 1734static ssize_t
@@ -1598,12 +1786,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
1598 mdk_rdev_t *rdev; 1786 mdk_rdev_t *rdev;
1599 sector_t size; 1787 sector_t size;
1600 1788
1601 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); 1789 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
1602 if (!rdev) { 1790 if (!rdev) {
1603 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 1791 printk(KERN_ERR "md: could not alloc mem for new device!\n");
1604 return ERR_PTR(-ENOMEM); 1792 return ERR_PTR(-ENOMEM);
1605 } 1793 }
1606 memset(rdev, 0, sizeof(*rdev));
1607 1794
1608 if ((err = alloc_disk_sb(rdev))) 1795 if ((err = alloc_disk_sb(rdev)))
1609 goto abort_free; 1796 goto abort_free;
@@ -1621,6 +1808,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
1621 rdev->data_offset = 0; 1808 rdev->data_offset = 0;
1622 atomic_set(&rdev->nr_pending, 0); 1809 atomic_set(&rdev->nr_pending, 0);
1623 atomic_set(&rdev->read_errors, 0); 1810 atomic_set(&rdev->read_errors, 0);
1811 atomic_set(&rdev->corrected_errors, 0);
1624 1812
1625 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 1813 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
1626 if (!size) { 1814 if (!size) {
@@ -1725,16 +1913,37 @@ static void analyze_sbs(mddev_t * mddev)
1725static ssize_t 1913static ssize_t
1726level_show(mddev_t *mddev, char *page) 1914level_show(mddev_t *mddev, char *page)
1727{ 1915{
1728 mdk_personality_t *p = mddev->pers; 1916 struct mdk_personality *p = mddev->pers;
1729 if (p == NULL && mddev->raid_disks == 0) 1917 if (p)
1730 return 0;
1731 if (mddev->level >= 0)
1732 return sprintf(page, "raid%d\n", mddev->level);
1733 else
1734 return sprintf(page, "%s\n", p->name); 1918 return sprintf(page, "%s\n", p->name);
1919 else if (mddev->clevel[0])
1920 return sprintf(page, "%s\n", mddev->clevel);
1921 else if (mddev->level != LEVEL_NONE)
1922 return sprintf(page, "%d\n", mddev->level);
1923 else
1924 return 0;
1925}
1926
1927static ssize_t
1928level_store(mddev_t *mddev, const char *buf, size_t len)
1929{
1930 int rv = len;
1931 if (mddev->pers)
1932 return -EBUSY;
1933 if (len == 0)
1934 return 0;
1935 if (len >= sizeof(mddev->clevel))
1936 return -ENOSPC;
1937 strncpy(mddev->clevel, buf, len);
1938 if (mddev->clevel[len-1] == '\n')
1939 len--;
1940 mddev->clevel[len] = 0;
1941 mddev->level = LEVEL_NONE;
1942 return rv;
1735} 1943}
1736 1944
1737static struct md_sysfs_entry md_level = __ATTR_RO(level); 1945static struct md_sysfs_entry md_level =
1946__ATTR(level, 0644, level_show, level_store);
1738 1947
1739static ssize_t 1948static ssize_t
1740raid_disks_show(mddev_t *mddev, char *page) 1949raid_disks_show(mddev_t *mddev, char *page)
@@ -1744,7 +1953,197 @@ raid_disks_show(mddev_t *mddev, char *page)
1744 return sprintf(page, "%d\n", mddev->raid_disks); 1953 return sprintf(page, "%d\n", mddev->raid_disks);
1745} 1954}
1746 1955
1747static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks); 1956static int update_raid_disks(mddev_t *mddev, int raid_disks);
1957
1958static ssize_t
1959raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
1960{
1961 /* can only set raid_disks if array is not yet active */
1962 char *e;
1963 int rv = 0;
1964 unsigned long n = simple_strtoul(buf, &e, 10);
1965
1966 if (!*buf || (*e && *e != '\n'))
1967 return -EINVAL;
1968
1969 if (mddev->pers)
1970 rv = update_raid_disks(mddev, n);
1971 else
1972 mddev->raid_disks = n;
1973 return rv ? rv : len;
1974}
1975static struct md_sysfs_entry md_raid_disks =
1976__ATTR(raid_disks, 0644, raid_disks_show, raid_disks_store);
1977
1978static ssize_t
1979chunk_size_show(mddev_t *mddev, char *page)
1980{
1981 return sprintf(page, "%d\n", mddev->chunk_size);
1982}
1983
1984static ssize_t
1985chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
1986{
1987 /* can only set chunk_size if array is not yet active */
1988 char *e;
1989 unsigned long n = simple_strtoul(buf, &e, 10);
1990
1991 if (mddev->pers)
1992 return -EBUSY;
1993 if (!*buf || (*e && *e != '\n'))
1994 return -EINVAL;
1995
1996 mddev->chunk_size = n;
1997 return len;
1998}
1999static struct md_sysfs_entry md_chunk_size =
2000__ATTR(chunk_size, 0644, chunk_size_show, chunk_size_store);
2001
2002static ssize_t
2003null_show(mddev_t *mddev, char *page)
2004{
2005 return -EINVAL;
2006}
2007
2008static ssize_t
2009new_dev_store(mddev_t *mddev, const char *buf, size_t len)
2010{
2011 /* buf must be %d:%d\n? giving major and minor numbers */
2012 /* The new device is added to the array.
2013 * If the array has a persistent superblock, we read the
2014 * superblock to initialise info and check validity.
2015 * Otherwise, only checking done is that in bind_rdev_to_array,
2016 * which mainly checks size.
2017 */
2018 char *e;
2019 int major = simple_strtoul(buf, &e, 10);
2020 int minor;
2021 dev_t dev;
2022 mdk_rdev_t *rdev;
2023 int err;
2024
2025 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
2026 return -EINVAL;
2027 minor = simple_strtoul(e+1, &e, 10);
2028 if (*e && *e != '\n')
2029 return -EINVAL;
2030 dev = MKDEV(major, minor);
2031 if (major != MAJOR(dev) ||
2032 minor != MINOR(dev))
2033 return -EOVERFLOW;
2034
2035
2036 if (mddev->persistent) {
2037 rdev = md_import_device(dev, mddev->major_version,
2038 mddev->minor_version);
2039 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
2040 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2041 mdk_rdev_t, same_set);
2042 err = super_types[mddev->major_version]
2043 .load_super(rdev, rdev0, mddev->minor_version);
2044 if (err < 0)
2045 goto out;
2046 }
2047 } else
2048 rdev = md_import_device(dev, -1, -1);
2049
2050 if (IS_ERR(rdev))
2051 return PTR_ERR(rdev);
2052 err = bind_rdev_to_array(rdev, mddev);
2053 out:
2054 if (err)
2055 export_rdev(rdev);
2056 return err ? err : len;
2057}
2058
2059static struct md_sysfs_entry md_new_device =
2060__ATTR(new_dev, 0200, null_show, new_dev_store);
2061
2062static ssize_t
2063size_show(mddev_t *mddev, char *page)
2064{
2065 return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
2066}
2067
2068static int update_size(mddev_t *mddev, unsigned long size);
2069
2070static ssize_t
2071size_store(mddev_t *mddev, const char *buf, size_t len)
2072{
2073 /* If array is inactive, we can reduce the component size, but
2074 * not increase it (except from 0).
2075 * If array is active, we can try an on-line resize
2076 */
2077 char *e;
2078 int err = 0;
2079 unsigned long long size = simple_strtoull(buf, &e, 10);
2080 if (!*buf || *buf == '\n' ||
2081 (*e && *e != '\n'))
2082 return -EINVAL;
2083
2084 if (mddev->pers) {
2085 err = update_size(mddev, size);
2086 md_update_sb(mddev);
2087 } else {
2088 if (mddev->size == 0 ||
2089 mddev->size > size)
2090 mddev->size = size;
2091 else
2092 err = -ENOSPC;
2093 }
2094 return err ? err : len;
2095}
2096
2097static struct md_sysfs_entry md_size =
2098__ATTR(component_size, 0644, size_show, size_store);
2099
2100
 2101/* Metadata version.
2102 * This is either 'none' for arrays with externally managed metadata,
2103 * or N.M for internally known formats
2104 */
2105static ssize_t
2106metadata_show(mddev_t *mddev, char *page)
2107{
2108 if (mddev->persistent)
2109 return sprintf(page, "%d.%d\n",
2110 mddev->major_version, mddev->minor_version);
2111 else
2112 return sprintf(page, "none\n");
2113}
2114
2115static ssize_t
2116metadata_store(mddev_t *mddev, const char *buf, size_t len)
2117{
2118 int major, minor;
2119 char *e;
2120 if (!list_empty(&mddev->disks))
2121 return -EBUSY;
2122
2123 if (cmd_match(buf, "none")) {
2124 mddev->persistent = 0;
2125 mddev->major_version = 0;
2126 mddev->minor_version = 90;
2127 return len;
2128 }
2129 major = simple_strtoul(buf, &e, 10);
2130 if (e==buf || *e != '.')
2131 return -EINVAL;
2132 buf = e+1;
2133 minor = simple_strtoul(buf, &e, 10);
2134 if (e==buf || *e != '\n')
2135 return -EINVAL;
2136 if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
2137 super_types[major].name == NULL)
2138 return -ENOENT;
2139 mddev->major_version = major;
2140 mddev->minor_version = minor;
2141 mddev->persistent = 1;
2142 return len;
2143}
2144
2145static struct md_sysfs_entry md_metadata =
2146__ATTR(metadata_version, 0644, metadata_show, metadata_store);
1748 2147
1749static ssize_t 2148static ssize_t
1750action_show(mddev_t *mddev, char *page) 2149action_show(mddev_t *mddev, char *page)
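[Editor's note: taken together, the attributes added in this hunk (level, raid_disks, chunk_size, component_size, metadata_version, new_dev) let a userspace tool describe and populate an array entirely through sysfs instead of the ioctl interface. A hedged sketch of the write side, assuming the conventional /sys/block/<dev>/md/ location and a hypothetical helper name:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Hypothetical helper: write one md attribute, e.g.
     *   write_md_attr("md0", "level", "raid1");
     *   write_md_attr("md0", "new_dev", "8:16");  // major:minor, per
     *                                             // new_dev_store() above
     */
    static int write_md_attr(const char *dev, const char *attr,
                             const char *val)
    {
        char path[256];
        int fd, ok;

        snprintf(path, sizeof(path), "/sys/block/%s/md/%s", dev, attr);
        fd = open(path, O_WRONLY);
        if (fd < 0)
            return -1;
        ok = write(fd, val, strlen(val)) == (ssize_t)strlen(val) ? 0 : -1;
        close(fd);
        return ok;
    }

    int main(void)
    {
        if (write_md_attr("md0", "level", "raid1") != 0)
            perror("write_md_attr");
        return 0;
    }
]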
@@ -1771,31 +2170,27 @@ action_store(mddev_t *mddev, const char *page, size_t len)
1771 if (!mddev->pers || !mddev->pers->sync_request) 2170 if (!mddev->pers || !mddev->pers->sync_request)
1772 return -EINVAL; 2171 return -EINVAL;
1773 2172
1774 if (strcmp(page, "idle")==0 || strcmp(page, "idle\n")==0) { 2173 if (cmd_match(page, "idle")) {
1775 if (mddev->sync_thread) { 2174 if (mddev->sync_thread) {
1776 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 2175 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1777 md_unregister_thread(mddev->sync_thread); 2176 md_unregister_thread(mddev->sync_thread);
1778 mddev->sync_thread = NULL; 2177 mddev->sync_thread = NULL;
1779 mddev->recovery = 0; 2178 mddev->recovery = 0;
1780 } 2179 }
1781 return len; 2180 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
1782 } 2181 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
1783
1784 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
1785 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
1786 return -EBUSY; 2182 return -EBUSY;
1787 if (strcmp(page, "resync")==0 || strcmp(page, "resync\n")==0 || 2183 else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
1788 strcmp(page, "recover")==0 || strcmp(page, "recover\n")==0)
1789 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2184 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1790 else { 2185 else {
1791 if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) 2186 if (cmd_match(page, "check"))
1792 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 2187 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 1793 else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) 2188 else if (!cmd_match(page, "repair"))
1794 return -EINVAL; 2189 return -EINVAL;
1795 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 2190 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
1796 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 2191 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
1797 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1798 } 2192 }
2193 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1799 md_wakeup_thread(mddev->thread); 2194 md_wakeup_thread(mddev->thread);
1800 return len; 2195 return len;
1801} 2196}
@@ -1814,15 +2209,107 @@ md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
1814static struct md_sysfs_entry 2209static struct md_sysfs_entry
1815md_mismatches = __ATTR_RO(mismatch_cnt); 2210md_mismatches = __ATTR_RO(mismatch_cnt);
1816 2211
2212static ssize_t
2213sync_min_show(mddev_t *mddev, char *page)
2214{
2215 return sprintf(page, "%d (%s)\n", speed_min(mddev),
2216 mddev->sync_speed_min ? "local": "system");
2217}
2218
2219static ssize_t
2220sync_min_store(mddev_t *mddev, const char *buf, size_t len)
2221{
2222 int min;
2223 char *e;
2224 if (strncmp(buf, "system", 6)==0) {
2225 mddev->sync_speed_min = 0;
2226 return len;
2227 }
2228 min = simple_strtoul(buf, &e, 10);
2229 if (buf == e || (*e && *e != '\n') || min <= 0)
2230 return -EINVAL;
2231 mddev->sync_speed_min = min;
2232 return len;
2233}
2234
2235static struct md_sysfs_entry md_sync_min =
2236__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
2237
2238static ssize_t
2239sync_max_show(mddev_t *mddev, char *page)
2240{
2241 return sprintf(page, "%d (%s)\n", speed_max(mddev),
2242 mddev->sync_speed_max ? "local": "system");
2243}
2244
2245static ssize_t
2246sync_max_store(mddev_t *mddev, const char *buf, size_t len)
2247{
2248 int max;
2249 char *e;
2250 if (strncmp(buf, "system", 6)==0) {
2251 mddev->sync_speed_max = 0;
2252 return len;
2253 }
2254 max = simple_strtoul(buf, &e, 10);
2255 if (buf == e || (*e && *e != '\n') || max <= 0)
2256 return -EINVAL;
2257 mddev->sync_speed_max = max;
2258 return len;
2259}
2260
2261static struct md_sysfs_entry md_sync_max =
2262__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
2263
2264
2265static ssize_t
2266sync_speed_show(mddev_t *mddev, char *page)
2267{
2268 unsigned long resync, dt, db;
2269 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2270 dt = ((jiffies - mddev->resync_mark) / HZ);
2271 if (!dt) dt++;
2272 db = resync - (mddev->resync_mark_cnt);
2273 return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
2274}
2275
2276static struct md_sysfs_entry
2277md_sync_speed = __ATTR_RO(sync_speed);
2278
2279static ssize_t
2280sync_completed_show(mddev_t *mddev, char *page)
2281{
2282 unsigned long max_blocks, resync;
2283
2284 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2285 max_blocks = mddev->resync_max_sectors;
2286 else
2287 max_blocks = mddev->size << 1;
2288
2289 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
2290 return sprintf(page, "%lu / %lu\n", resync, max_blocks);
2291}
2292
2293static struct md_sysfs_entry
2294md_sync_completed = __ATTR_RO(sync_completed);
2295
1817static struct attribute *md_default_attrs[] = { 2296static struct attribute *md_default_attrs[] = {
1818 &md_level.attr, 2297 &md_level.attr,
1819 &md_raid_disks.attr, 2298 &md_raid_disks.attr,
2299 &md_chunk_size.attr,
2300 &md_size.attr,
2301 &md_metadata.attr,
2302 &md_new_device.attr,
1820 NULL, 2303 NULL,
1821}; 2304};
1822 2305
1823static struct attribute *md_redundancy_attrs[] = { 2306static struct attribute *md_redundancy_attrs[] = {
1824 &md_scan_mode.attr, 2307 &md_scan_mode.attr,
1825 &md_mismatches.attr, 2308 &md_mismatches.attr,
2309 &md_sync_min.attr,
2310 &md_sync_max.attr,
2311 &md_sync_speed.attr,
2312 &md_sync_completed.attr,
1826 NULL, 2313 NULL,
1827}; 2314};
1828static struct attribute_group md_redundancy_group = { 2315static struct attribute_group md_redundancy_group = {
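[Editor's note: sync_speed_show() above reports db/dt/2, where db is resync progress in 512-byte sectors since resync_mark and dt is elapsed seconds (forced to at least 1); halving converts sectors per second to K per second. Worked example: 102400 sectors completed in 10 s gives 102400/10/2 = 5120 K/sec. sync_completed likewise reports raw sector counts as "done / total".]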
@@ -1937,14 +2424,16 @@ static void md_safemode_timeout(unsigned long data)
1937 md_wakeup_thread(mddev->thread); 2424 md_wakeup_thread(mddev->thread);
1938} 2425}
1939 2426
2427static int start_dirty_degraded;
1940 2428
1941static int do_md_run(mddev_t * mddev) 2429static int do_md_run(mddev_t * mddev)
1942{ 2430{
1943 int pnum, err; 2431 int err;
1944 int chunk_size; 2432 int chunk_size;
1945 struct list_head *tmp; 2433 struct list_head *tmp;
1946 mdk_rdev_t *rdev; 2434 mdk_rdev_t *rdev;
1947 struct gendisk *disk; 2435 struct gendisk *disk;
2436 struct mdk_personality *pers;
1948 char b[BDEVNAME_SIZE]; 2437 char b[BDEVNAME_SIZE];
1949 2438
1950 if (list_empty(&mddev->disks)) 2439 if (list_empty(&mddev->disks))
@@ -1961,20 +2450,8 @@ static int do_md_run(mddev_t * mddev)
1961 analyze_sbs(mddev); 2450 analyze_sbs(mddev);
1962 2451
1963 chunk_size = mddev->chunk_size; 2452 chunk_size = mddev->chunk_size;
1964 pnum = level_to_pers(mddev->level);
1965 2453
1966 if ((pnum != MULTIPATH) && (pnum != RAID1)) { 2454 if (chunk_size) {
1967 if (!chunk_size) {
1968 /*
1969 * 'default chunksize' in the old md code used to
1970 * be PAGE_SIZE, baaad.
1971 * we abort here to be on the safe side. We don't
1972 * want to continue the bad practice.
1973 */
1974 printk(KERN_ERR
1975 "no chunksize specified, see 'man raidtab'\n");
1976 return -EINVAL;
1977 }
1978 if (chunk_size > MAX_CHUNK_SIZE) { 2455 if (chunk_size > MAX_CHUNK_SIZE) {
1979 printk(KERN_ERR "too big chunk_size: %d > %d\n", 2456 printk(KERN_ERR "too big chunk_size: %d > %d\n",
1980 chunk_size, MAX_CHUNK_SIZE); 2457 chunk_size, MAX_CHUNK_SIZE);
@@ -2010,10 +2487,10 @@ static int do_md_run(mddev_t * mddev)
2010 } 2487 }
2011 2488
2012#ifdef CONFIG_KMOD 2489#ifdef CONFIG_KMOD
2013 if (!pers[pnum]) 2490 if (mddev->level != LEVEL_NONE)
2014 { 2491 request_module("md-level-%d", mddev->level);
2015 request_module("md-personality-%d", pnum); 2492 else if (mddev->clevel[0])
2016 } 2493 request_module("md-%s", mddev->clevel);
2017#endif 2494#endif
2018 2495
2019 /* 2496 /*
@@ -2035,30 +2512,39 @@ static int do_md_run(mddev_t * mddev)
2035 return -ENOMEM; 2512 return -ENOMEM;
2036 2513
2037 spin_lock(&pers_lock); 2514 spin_lock(&pers_lock);
2038 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) { 2515 pers = find_pers(mddev->level, mddev->clevel);
2516 if (!pers || !try_module_get(pers->owner)) {
2039 spin_unlock(&pers_lock); 2517 spin_unlock(&pers_lock);
2040 printk(KERN_WARNING "md: personality %d is not loaded!\n", 2518 if (mddev->level != LEVEL_NONE)
2041 pnum); 2519 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
2520 mddev->level);
2521 else
2522 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
2523 mddev->clevel);
2042 return -EINVAL; 2524 return -EINVAL;
2043 } 2525 }
2044 2526 mddev->pers = pers;
2045 mddev->pers = pers[pnum];
2046 spin_unlock(&pers_lock); 2527 spin_unlock(&pers_lock);
2528 mddev->level = pers->level;
2529 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2047 2530
2048 mddev->recovery = 0; 2531 mddev->recovery = 0;
2049 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 2532 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
2050 mddev->barriers_work = 1; 2533 mddev->barriers_work = 1;
2534 mddev->ok_start_degraded = start_dirty_degraded;
2051 2535
2052 if (start_readonly) 2536 if (start_readonly)
2053 mddev->ro = 2; /* read-only, but switch on first write */ 2537 mddev->ro = 2; /* read-only, but switch on first write */
2054 2538
2055 /* before we start the array running, initialise the bitmap */ 2539 err = mddev->pers->run(mddev);
2056 err = bitmap_create(mddev); 2540 if (!err && mddev->pers->sync_request) {
2057 if (err) 2541 err = bitmap_create(mddev);
2058 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 2542 if (err) {
2059 mdname(mddev), err); 2543 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
2060 else 2544 mdname(mddev), err);
2061 err = mddev->pers->run(mddev); 2545 mddev->pers->stop(mddev);
2546 }
2547 }
2062 if (err) { 2548 if (err) {
2063 printk(KERN_ERR "md: pers->run() failed ...\n"); 2549 printk(KERN_ERR "md: pers->run() failed ...\n");
2064 module_put(mddev->pers->owner); 2550 module_put(mddev->pers->owner);
@@ -2104,6 +2590,7 @@ static int do_md_run(mddev_t * mddev)
2104 mddev->queue->make_request_fn = mddev->pers->make_request; 2590 mddev->queue->make_request_fn = mddev->pers->make_request;
2105 2591
2106 mddev->changed = 1; 2592 mddev->changed = 1;
2593 md_new_event(mddev);
2107 return 0; 2594 return 0;
2108} 2595}
2109 2596
@@ -2231,6 +2718,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
2231 printk(KERN_INFO "md: %s switched to read-only mode.\n", 2718 printk(KERN_INFO "md: %s switched to read-only mode.\n",
2232 mdname(mddev)); 2719 mdname(mddev));
2233 err = 0; 2720 err = 0;
2721 md_new_event(mddev);
2234out: 2722out:
2235 return err; 2723 return err;
2236} 2724}
@@ -2668,12 +3156,6 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2668 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 3156 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
2669 set_bit(WriteMostly, &rdev->flags); 3157 set_bit(WriteMostly, &rdev->flags);
2670 3158
2671 err = bind_rdev_to_array(rdev, mddev);
2672 if (err) {
2673 export_rdev(rdev);
2674 return err;
2675 }
2676
2677 if (!mddev->persistent) { 3159 if (!mddev->persistent) {
2678 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 3160 printk(KERN_INFO "md: nonpersistent superblock ...\n");
2679 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 3161 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
@@ -2681,8 +3163,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2681 rdev->sb_offset = calc_dev_sboffset(rdev->bdev); 3163 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2682 rdev->size = calc_dev_size(rdev, mddev->chunk_size); 3164 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
2683 3165
2684 if (!mddev->size || (mddev->size > rdev->size)) 3166 err = bind_rdev_to_array(rdev, mddev);
2685 mddev->size = rdev->size; 3167 if (err) {
3168 export_rdev(rdev);
3169 return err;
3170 }
2686 } 3171 }
2687 3172
2688 return 0; 3173 return 0;
@@ -2705,6 +3190,7 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
2705 3190
2706 kick_rdev_from_array(rdev); 3191 kick_rdev_from_array(rdev);
2707 md_update_sb(mddev); 3192 md_update_sb(mddev);
3193 md_new_event(mddev);
2708 3194
2709 return 0; 3195 return 0;
2710busy: 3196busy:
@@ -2753,15 +3239,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
2753 size = calc_dev_size(rdev, mddev->chunk_size); 3239 size = calc_dev_size(rdev, mddev->chunk_size);
2754 rdev->size = size; 3240 rdev->size = size;
2755 3241
2756 if (size < mddev->size) {
2757 printk(KERN_WARNING
2758 "%s: disk size %llu blocks < array size %llu\n",
2759 mdname(mddev), (unsigned long long)size,
2760 (unsigned long long)mddev->size);
2761 err = -ENOSPC;
2762 goto abort_export;
2763 }
2764
2765 if (test_bit(Faulty, &rdev->flags)) { 3242 if (test_bit(Faulty, &rdev->flags)) {
2766 printk(KERN_WARNING 3243 printk(KERN_WARNING
2767 "md: can not hot-add faulty %s disk to %s!\n", 3244 "md: can not hot-add faulty %s disk to %s!\n",
@@ -2771,7 +3248,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
2771 } 3248 }
2772 clear_bit(In_sync, &rdev->flags); 3249 clear_bit(In_sync, &rdev->flags);
2773 rdev->desc_nr = -1; 3250 rdev->desc_nr = -1;
2774 bind_rdev_to_array(rdev, mddev); 3251 err = bind_rdev_to_array(rdev, mddev);
3252 if (err)
3253 goto abort_export;
2775 3254
2776 /* 3255 /*
2777 * The rest should better be atomic, we can have disk failures 3256 * The rest should better be atomic, we can have disk failures
@@ -2795,7 +3274,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
2795 */ 3274 */
2796 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3275 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2797 md_wakeup_thread(mddev->thread); 3276 md_wakeup_thread(mddev->thread);
2798 3277 md_new_event(mddev);
2799 return 0; 3278 return 0;
2800 3279
2801abort_unbind_export: 3280abort_unbind_export:
@@ -2942,6 +3421,81 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2942 return 0; 3421 return 0;
2943} 3422}
2944 3423
3424static int update_size(mddev_t *mddev, unsigned long size)
3425{
3426 mdk_rdev_t * rdev;
3427 int rv;
3428 struct list_head *tmp;
3429
3430 if (mddev->pers->resize == NULL)
3431 return -EINVAL;
3432 /* The "size" is the amount of each device that is used.
3433 * This can only make sense for arrays with redundancy.
3434 * linear and raid0 always use whatever space is available
3435 * We can only consider changing the size if no resync
3436 * or reconstruction is happening, and if the new size
3437 * is acceptable. It must fit before the sb_offset or,
3438 * if that is <data_offset, it must fit before the
3439 * size of each device.
3440 * If size is zero, we find the largest size that fits.
3441 */
3442 if (mddev->sync_thread)
3443 return -EBUSY;
3444 ITERATE_RDEV(mddev,rdev,tmp) {
3445 sector_t avail;
3446 int fit = (size == 0);
3447 if (rdev->sb_offset > rdev->data_offset)
3448 avail = (rdev->sb_offset*2) - rdev->data_offset;
3449 else
3450 avail = get_capacity(rdev->bdev->bd_disk)
3451 - rdev->data_offset;
3452 if (fit && (size == 0 || size > avail/2))
3453 size = avail/2;
3454 if (avail < ((sector_t)size << 1))
3455 return -ENOSPC;
3456 }
3457 rv = mddev->pers->resize(mddev, (sector_t)size *2);
3458 if (!rv) {
3459 struct block_device *bdev;
3460
3461 bdev = bdget_disk(mddev->gendisk, 0);
3462 if (bdev) {
3463 down(&bdev->bd_inode->i_sem);
3464 i_size_write(bdev->bd_inode, mddev->array_size << 10);
3465 up(&bdev->bd_inode->i_sem);
3466 bdput(bdev);
3467 }
3468 }
3469 return rv;
3470}
3471
3472static int update_raid_disks(mddev_t *mddev, int raid_disks)
3473{
3474 int rv;
3475 /* change the number of raid disks */
3476 if (mddev->pers->reshape == NULL)
3477 return -EINVAL;
3478 if (raid_disks <= 0 ||
3479 raid_disks >= mddev->max_disks)
3480 return -EINVAL;
3481 if (mddev->sync_thread)
3482 return -EBUSY;
3483 rv = mddev->pers->reshape(mddev, raid_disks);
3484 if (!rv) {
3485 struct block_device *bdev;
3486
3487 bdev = bdget_disk(mddev->gendisk, 0);
3488 if (bdev) {
3489 down(&bdev->bd_inode->i_sem);
3490 i_size_write(bdev->bd_inode, mddev->array_size << 10);
3491 up(&bdev->bd_inode->i_sem);
3492 bdput(bdev);
3493 }
3494 }
3495 return rv;
3496}
3497
3498
2945/* 3499/*
2946 * update_array_info is used to change the configuration of an 3500 * update_array_info is used to change the configuration of an
2947 * on-line array. 3501 * on-line array.
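[Editor's note: the sizing loop that update_size() above inherits from update_array_info (removed in the next hunk) has a subtle shape: a requested size of zero means "the largest size that fits on every member", computed while simultaneously rejecting explicit sizes any member cannot hold. A self-contained model of just that loop, sizes in 1K blocks and availability in sectors as in the original:

    #include <stdio.h>

    /* Model of update_size()'s device walk: avail[] holds each member's
     * available sectors; size is in 1K blocks (sectors/2); size == 0
     * asks for the largest value that fits everywhere. */
    static long long fit_size(const long long *avail, int n, long long size)
    {
        int fit = (size == 0);
        int i;

        for (i = 0; i < n; i++) {
            if (fit && (size == 0 || size > avail[i] / 2))
                size = avail[i] / 2;    /* shrink to this member */
            if (avail[i] < size * 2)
                return -1;              /* -ENOSPC: does not fit */
        }
        return size;
    }

    int main(void)
    {
        long long avail[] = { 2000, 1500, 1800 };  /* sectors per member */

        /* Auto-size: bounded by the smallest member, 1500/2 = 750. */
        printf("%lld\n", fit_size(avail, 3, 0));   /* 750 */

        /* Explicit request that one member cannot hold. */
        printf("%lld\n", fit_size(avail, 3, 900)); /* -1  */
        return 0;
    }
]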
@@ -2990,71 +3544,12 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
2990 else 3544 else
2991 return mddev->pers->reconfig(mddev, info->layout, -1); 3545 return mddev->pers->reconfig(mddev, info->layout, -1);
2992 } 3546 }
2993 if (mddev->size != info->size) { 3547 if (mddev->size != info->size)
2994 mdk_rdev_t * rdev; 3548 rv = update_size(mddev, info->size);
2995 struct list_head *tmp; 3549
2996 if (mddev->pers->resize == NULL) 3550 if (mddev->raid_disks != info->raid_disks)
2997 return -EINVAL; 3551 rv = update_raid_disks(mddev, info->raid_disks);
2998 /* The "size" is the amount of each device that is used. 3552
2999 * This can only make sense for arrays with redundancy.
3000 * linear and raid0 always use whatever space is available
3001 * We can only consider changing the size if no resync
3002 * or reconstruction is happening, and if the new size
3003 * is acceptable. It must fit before the sb_offset or,
3004 * if that is <data_offset, it must fit before the
3005 * size of each device.
3006 * If size is zero, we find the largest size that fits.
3007 */
3008 if (mddev->sync_thread)
3009 return -EBUSY;
3010 ITERATE_RDEV(mddev,rdev,tmp) {
3011 sector_t avail;
3012 int fit = (info->size == 0);
3013 if (rdev->sb_offset > rdev->data_offset)
3014 avail = (rdev->sb_offset*2) - rdev->data_offset;
3015 else
3016 avail = get_capacity(rdev->bdev->bd_disk)
3017 - rdev->data_offset;
3018 if (fit && (info->size == 0 || info->size > avail/2))
3019 info->size = avail/2;
3020 if (avail < ((sector_t)info->size << 1))
3021 return -ENOSPC;
3022 }
3023 rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
3024 if (!rv) {
3025 struct block_device *bdev;
3026
3027 bdev = bdget_disk(mddev->gendisk, 0);
3028 if (bdev) {
3029 down(&bdev->bd_inode->i_sem);
3030 i_size_write(bdev->bd_inode, mddev->array_size << 10);
3031 up(&bdev->bd_inode->i_sem);
3032 bdput(bdev);
3033 }
3034 }
3035 }
3036 if (mddev->raid_disks != info->raid_disks) {
3037 /* change the number of raid disks */
3038 if (mddev->pers->reshape == NULL)
3039 return -EINVAL;
3040 if (info->raid_disks <= 0 ||
3041 info->raid_disks >= mddev->max_disks)
3042 return -EINVAL;
3043 if (mddev->sync_thread)
3044 return -EBUSY;
3045 rv = mddev->pers->reshape(mddev, info->raid_disks);
3046 if (!rv) {
3047 struct block_device *bdev;
3048
3049 bdev = bdget_disk(mddev->gendisk, 0);
3050 if (bdev) {
3051 down(&bdev->bd_inode->i_sem);
3052 i_size_write(bdev->bd_inode, mddev->array_size << 10);
3053 up(&bdev->bd_inode->i_sem);
3054 bdput(bdev);
3055 }
3056 }
3057 }
3058 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 3553 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
3059 if (mddev->pers->quiesce == NULL) 3554 if (mddev->pers->quiesce == NULL)
3060 return -EINVAL; 3555 return -EINVAL;
@@ -3476,11 +3971,10 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
3476{ 3971{
3477 mdk_thread_t *thread; 3972 mdk_thread_t *thread;
3478 3973
3479 thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); 3974 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
3480 if (!thread) 3975 if (!thread)
3481 return NULL; 3976 return NULL;
3482 3977
3483 memset(thread, 0, sizeof(mdk_thread_t));
3484 init_waitqueue_head(&thread->wqueue); 3978 init_waitqueue_head(&thread->wqueue);
3485 3979
3486 thread->run = run; 3980 thread->run = run;
@@ -3524,6 +4018,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
3524 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4018 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3525 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4019 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3526 md_wakeup_thread(mddev->thread); 4020 md_wakeup_thread(mddev->thread);
4021 md_new_event(mddev);
3527} 4022}
3528 4023
3529/* seq_file implementation /proc/mdstat */ 4024/* seq_file implementation /proc/mdstat */
@@ -3664,24 +4159,29 @@ static void md_seq_stop(struct seq_file *seq, void *v)
3664 mddev_put(mddev); 4159 mddev_put(mddev);
3665} 4160}
3666 4161
4162struct mdstat_info {
4163 int event;
4164};
4165
3667static int md_seq_show(struct seq_file *seq, void *v) 4166static int md_seq_show(struct seq_file *seq, void *v)
3668{ 4167{
3669 mddev_t *mddev = v; 4168 mddev_t *mddev = v;
3670 sector_t size; 4169 sector_t size;
3671 struct list_head *tmp2; 4170 struct list_head *tmp2;
3672 mdk_rdev_t *rdev; 4171 mdk_rdev_t *rdev;
3673 int i; 4172 struct mdstat_info *mi = seq->private;
3674 struct bitmap *bitmap; 4173 struct bitmap *bitmap;
3675 4174
3676 if (v == (void*)1) { 4175 if (v == (void*)1) {
4176 struct mdk_personality *pers;
3677 seq_printf(seq, "Personalities : "); 4177 seq_printf(seq, "Personalities : ");
3678 spin_lock(&pers_lock); 4178 spin_lock(&pers_lock);
3679 for (i = 0; i < MAX_PERSONALITY; i++) 4179 list_for_each_entry(pers, &pers_list, list)
3680 if (pers[i]) 4180 seq_printf(seq, "[%s] ", pers->name);
3681 seq_printf(seq, "[%s] ", pers[i]->name);
3682 4181
3683 spin_unlock(&pers_lock); 4182 spin_unlock(&pers_lock);
3684 seq_printf(seq, "\n"); 4183 seq_printf(seq, "\n");
4184 mi->event = atomic_read(&md_event_count);
3685 return 0; 4185 return 0;
3686 } 4186 }
3687 if (v == (void*)2) { 4187 if (v == (void*)2) {
@@ -3790,47 +4290,68 @@ static struct seq_operations md_seq_ops = {
3790static int md_seq_open(struct inode *inode, struct file *file) 4290static int md_seq_open(struct inode *inode, struct file *file)
3791{ 4291{
3792 int error; 4292 int error;
4293 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
4294 if (mi == NULL)
4295 return -ENOMEM;
3793 4296
3794 error = seq_open(file, &md_seq_ops); 4297 error = seq_open(file, &md_seq_ops);
4298 if (error)
4299 kfree(mi);
4300 else {
4301 struct seq_file *p = file->private_data;
4302 p->private = mi;
4303 mi->event = atomic_read(&md_event_count);
4304 }
3795 return error; 4305 return error;
3796} 4306}
3797 4307
4308static int md_seq_release(struct inode *inode, struct file *file)
4309{
4310 struct seq_file *m = file->private_data;
4311 struct mdstat_info *mi = m->private;
4312 m->private = NULL;
4313 kfree(mi);
4314 return seq_release(inode, file);
4315}
4316
4317static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
4318{
4319 struct seq_file *m = filp->private_data;
4320 struct mdstat_info *mi = m->private;
4321 int mask;
4322
4323 poll_wait(filp, &md_event_waiters, wait);
4324
4325 /* always allow read */
4326 mask = POLLIN | POLLRDNORM;
4327
4328 if (mi->event != atomic_read(&md_event_count))
4329 mask |= POLLERR | POLLPRI;
4330 return mask;
4331}
4332
3798static struct file_operations md_seq_fops = { 4333static struct file_operations md_seq_fops = {
3799 .open = md_seq_open, 4334 .open = md_seq_open,
3800 .read = seq_read, 4335 .read = seq_read,
3801 .llseek = seq_lseek, 4336 .llseek = seq_lseek,
3802 .release = seq_release, 4337 .release = md_seq_release,
4338 .poll = mdstat_poll,
3803}; 4339};
3804 4340
3805int register_md_personality(int pnum, mdk_personality_t *p) 4341int register_md_personality(struct mdk_personality *p)
3806{ 4342{
3807 if (pnum >= MAX_PERSONALITY) {
3808 printk(KERN_ERR
3809 "md: tried to install personality %s as nr %d, but max is %lu\n",
3810 p->name, pnum, MAX_PERSONALITY-1);
3811 return -EINVAL;
3812 }
3813
3814 spin_lock(&pers_lock); 4343 spin_lock(&pers_lock);
3815 if (pers[pnum]) { 4344 list_add_tail(&p->list, &pers_list);
3816 spin_unlock(&pers_lock); 4345 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
3817 return -EBUSY;
3818 }
3819
3820 pers[pnum] = p;
3821 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3822 spin_unlock(&pers_lock); 4346 spin_unlock(&pers_lock);
3823 return 0; 4347 return 0;
3824} 4348}
3825 4349
3826int unregister_md_personality(int pnum) 4350int unregister_md_personality(struct mdk_personality *p)
3827{ 4351{
3828 if (pnum >= MAX_PERSONALITY) 4352 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
3829 return -EINVAL;
3830
3831 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3832 spin_lock(&pers_lock); 4353 spin_lock(&pers_lock);
3833 pers[pnum] = NULL; 4354 list_del_init(&p->list);
3834 spin_unlock(&pers_lock); 4355 spin_unlock(&pers_lock);
3835 return 0; 4356 return 0;
3836} 4357}
@@ -4012,10 +4533,10 @@ static void md_do_sync(mddev_t *mddev)
4012 4533
4013 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev)); 4534 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
4014 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:" 4535 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
4015 " %d KB/sec/disc.\n", sysctl_speed_limit_min); 4536 " %d KB/sec/disc.\n", speed_min(mddev));
4016 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 4537 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
4017 "(but not more than %d KB/sec) for reconstruction.\n", 4538 "(but not more than %d KB/sec) for reconstruction.\n",
4018 sysctl_speed_limit_max); 4539 speed_max(mddev));
4019 4540
4020 is_mddev_idle(mddev); /* this also initializes IO event counters */ 4541 is_mddev_idle(mddev); /* this also initializes IO event counters */
4021 /* we don't use the checkpoint if there's a bitmap */ 4542 /* we don't use the checkpoint if there's a bitmap */
@@ -4056,7 +4577,7 @@ static void md_do_sync(mddev_t *mddev)
4056 4577
4057 skipped = 0; 4578 skipped = 0;
4058 sectors = mddev->pers->sync_request(mddev, j, &skipped, 4579 sectors = mddev->pers->sync_request(mddev, j, &skipped,
4059 currspeed < sysctl_speed_limit_min); 4580 currspeed < speed_min(mddev));
4060 if (sectors == 0) { 4581 if (sectors == 0) {
4061 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 4582 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
4062 goto out; 4583 goto out;
@@ -4069,7 +4590,11 @@ static void md_do_sync(mddev_t *mddev)
4069 4590
4070 j += sectors; 4591 j += sectors;
4071 if (j>1) mddev->curr_resync = j; 4592 if (j>1) mddev->curr_resync = j;
4072 4593 if (last_check == 0)
 4594 /* this is the earliest that rebuild will be
4595 * visible in /proc/mdstat
4596 */
4597 md_new_event(mddev);
4073 4598
4074 if (last_check + window > io_sectors || j == max_sectors) 4599 if (last_check + window > io_sectors || j == max_sectors)
4075 continue; 4600 continue;
@@ -4117,8 +4642,8 @@ static void md_do_sync(mddev_t *mddev)
4117 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 4642 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
4118 /((jiffies-mddev->resync_mark)/HZ +1) +1; 4643 /((jiffies-mddev->resync_mark)/HZ +1) +1;
4119 4644
4120 if (currspeed > sysctl_speed_limit_min) { 4645 if (currspeed > speed_min(mddev)) {
4121 if ((currspeed > sysctl_speed_limit_max) || 4646 if ((currspeed > speed_max(mddev)) ||
4122 !is_mddev_idle(mddev)) { 4647 !is_mddev_idle(mddev)) {
4123 msleep(500); 4648 msleep(500);
4124 goto repeat; 4649 goto repeat;
@@ -4255,6 +4780,7 @@ void md_check_recovery(mddev_t *mddev)
4255 mddev->recovery = 0; 4780 mddev->recovery = 0;
4256 /* flag recovery needed just to double check */ 4781 /* flag recovery needed just to double check */
4257 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4782 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4783 md_new_event(mddev);
4258 goto unlock; 4784 goto unlock;
4259 } 4785 }
4260 /* Clear some bits that don't mean anything, but 4786 /* Clear some bits that don't mean anything, but
@@ -4292,6 +4818,7 @@ void md_check_recovery(mddev_t *mddev)
4292 sprintf(nm, "rd%d", rdev->raid_disk); 4818 sprintf(nm, "rd%d", rdev->raid_disk);
4293 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 4819 sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
4294 spares++; 4820 spares++;
4821 md_new_event(mddev);
4295 } else 4822 } else
4296 break; 4823 break;
4297 } 4824 }
@@ -4324,9 +4851,9 @@ void md_check_recovery(mddev_t *mddev)
4324 mdname(mddev)); 4851 mdname(mddev));
4325 /* leave the spares where they are, it shouldn't hurt */ 4852 /* leave the spares where they are, it shouldn't hurt */
4326 mddev->recovery = 0; 4853 mddev->recovery = 0;
4327 } else { 4854 } else
4328 md_wakeup_thread(mddev->sync_thread); 4855 md_wakeup_thread(mddev->sync_thread);
4329 } 4856 md_new_event(mddev);
4330 } 4857 }
4331 unlock: 4858 unlock:
4332 mddev_unlock(mddev); 4859 mddev_unlock(mddev);
@@ -4503,12 +5030,14 @@ static int set_ro(const char *val, struct kernel_param *kp)
4503 int num = simple_strtoul(val, &e, 10); 5030 int num = simple_strtoul(val, &e, 10);
4504 if (*val && (*e == '\0' || *e == '\n')) { 5031 if (*val && (*e == '\0' || *e == '\n')) {
4505 start_readonly = num; 5032 start_readonly = num;
4506 return 0;; 5033 return 0;
4507 } 5034 }
4508 return -EINVAL; 5035 return -EINVAL;
4509} 5036}
4510 5037
4511module_param_call(start_ro, set_ro, get_ro, NULL, 0600); 5038module_param_call(start_ro, set_ro, get_ro, NULL, 0600);
5039module_param(start_dirty_degraded, int, 0644);
5040
4512 5041
4513EXPORT_SYMBOL(register_md_personality); 5042EXPORT_SYMBOL(register_md_personality);
4514EXPORT_SYMBOL(unregister_md_personality); 5043EXPORT_SYMBOL(unregister_md_personality);
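
md_new_event(), sprinkled through the hunks above, is the hook that lets userspace notice state changes (mdadm --monitor, pollers of /proc/mdstat) without rescanning. A minimal sketch of such a notifier, assuming a global event counter and wait queue (the names are illustrative):

    static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
    static atomic_t md_event_count = ATOMIC_INIT(0);

    /* Bump the generation count and wake anyone sleeping in poll() */
    void md_new_event(mddev_t *mddev)
    {
            atomic_inc(&md_event_count);
            wake_up(&md_event_waiters);
    }

    /* A /proc/mdstat poll() method can then report POLLPRI whenever
     * md_event_count differs from the value cached at open time.
     */
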
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 145cdc5ad008..e6aa309a66d7 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -35,15 +35,10 @@
35#define NR_RESERVED_BUFS 32 35#define NR_RESERVED_BUFS 32
36 36
37 37
38static mdk_personality_t multipath_personality;
39
40
41static void *mp_pool_alloc(gfp_t gfp_flags, void *data) 38static void *mp_pool_alloc(gfp_t gfp_flags, void *data)
42{ 39{
43 struct multipath_bh *mpb; 40 struct multipath_bh *mpb;
44 mpb = kmalloc(sizeof(*mpb), gfp_flags); 41 mpb = kzalloc(sizeof(*mpb), gfp_flags);
45 if (mpb)
46 memset(mpb, 0, sizeof(*mpb));
47 return mpb; 42 return mpb;
48} 43}
49 44
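
This is the first of many hunks in the merge making the same mechanical cleanup: kzalloc() allocates and zeroes in one call, so the separate memset() (and the chance of forgetting it on some path) disappears. Side by side:

    /* before: allocate, then remember to zero */
    mpb = kmalloc(sizeof(*mpb), gfp_flags);
    if (mpb)
            memset(mpb, 0, sizeof(*mpb));

    /* after: identical semantics in one call */
    mpb = kzalloc(sizeof(*mpb), gfp_flags);
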
@@ -444,7 +439,7 @@ static int multipath_run (mddev_t *mddev)
444 * should be freed in multipath_stop()] 439 * should be freed in multipath_stop()]
445 */ 440 */
446 441
447 conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); 442 conf = kzalloc(sizeof(multipath_conf_t), GFP_KERNEL);
448 mddev->private = conf; 443 mddev->private = conf;
449 if (!conf) { 444 if (!conf) {
450 printk(KERN_ERR 445 printk(KERN_ERR
@@ -452,9 +447,8 @@ static int multipath_run (mddev_t *mddev)
452 mdname(mddev)); 447 mdname(mddev));
453 goto out; 448 goto out;
454 } 449 }
455 memset(conf, 0, sizeof(*conf));
456 450
457 conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, 451 conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
458 GFP_KERNEL); 452 GFP_KERNEL);
459 if (!conf->multipaths) { 453 if (!conf->multipaths) {
460 printk(KERN_ERR 454 printk(KERN_ERR
@@ -462,7 +456,6 @@ static int multipath_run (mddev_t *mddev)
462 mdname(mddev)); 456 mdname(mddev));
463 goto out_free_conf; 457 goto out_free_conf;
464 } 458 }
465 memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
466 459
467 conf->working_disks = 0; 460 conf->working_disks = 0;
468 ITERATE_RDEV(mddev,rdev,tmp) { 461 ITERATE_RDEV(mddev,rdev,tmp) {
@@ -557,9 +550,10 @@ static int multipath_stop (mddev_t *mddev)
557 return 0; 550 return 0;
558} 551}
559 552
560static mdk_personality_t multipath_personality= 553static struct mdk_personality multipath_personality =
561{ 554{
562 .name = "multipath", 555 .name = "multipath",
556 .level = LEVEL_MULTIPATH,
563 .owner = THIS_MODULE, 557 .owner = THIS_MODULE,
564 .make_request = multipath_make_request, 558 .make_request = multipath_make_request,
565 .run = multipath_run, 559 .run = multipath_run,
@@ -572,15 +566,17 @@ static mdk_personality_t multipath_personality=
572 566
573static int __init multipath_init (void) 567static int __init multipath_init (void)
574{ 568{
575 return register_md_personality (MULTIPATH, &multipath_personality); 569 return register_md_personality (&multipath_personality);
576} 570}
577 571
578static void __exit multipath_exit (void) 572static void __exit multipath_exit (void)
579{ 573{
580 unregister_md_personality (MULTIPATH); 574 unregister_md_personality (&multipath_personality);
581} 575}
582 576
583module_init(multipath_init); 577module_init(multipath_init);
584module_exit(multipath_exit); 578module_exit(multipath_exit);
585MODULE_LICENSE("GPL"); 579MODULE_LICENSE("GPL");
586MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ 580MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
581MODULE_ALIAS("md-multipath");
582MODULE_ALIAS("md-level--4");
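
The new md-multipath and md-level--4 aliases (multipath is level -4 in md's numbering) allow autoloading by RAID level rather than by the legacy personality number. The loading side presumably asks kmod for the matching alias when an array of an unknown level is started, along the lines of:

    /* sketch of the autoload path (the real call site is in md.c,
     * not in this hunk); request_module() matches MODULE_ALIAS strings
     */
    if (!mddev->pers)
            request_module("md-level-%d", mddev->level);
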
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index fece3277c2a5..abbca150202b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -113,21 +113,16 @@ static int create_strip_zones (mddev_t *mddev)
113 } 113 }
114 printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); 114 printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
115 115
116 conf->strip_zone = kmalloc(sizeof(struct strip_zone)* 116 conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
117 conf->nr_strip_zones, GFP_KERNEL); 117 conf->nr_strip_zones, GFP_KERNEL);
118 if (!conf->strip_zone) 118 if (!conf->strip_zone)
119 return 1; 119 return 1;
120 conf->devlist = kmalloc(sizeof(mdk_rdev_t*)* 120 conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
121 conf->nr_strip_zones*mddev->raid_disks, 121 conf->nr_strip_zones*mddev->raid_disks,
122 GFP_KERNEL); 122 GFP_KERNEL);
123 if (!conf->devlist) 123 if (!conf->devlist)
124 return 1; 124 return 1;
125 125
126 memset(conf->strip_zone, 0,sizeof(struct strip_zone)*
127 conf->nr_strip_zones);
128 memset(conf->devlist, 0,
129 sizeof(mdk_rdev_t*) * conf->nr_strip_zones * mddev->raid_disks);
130
131 /* The first zone must contain all devices, so here we check that 126 /* The first zone must contain all devices, so here we check that
132 * there is a proper alignment of slots to devices and find them all 127 * there is a proper alignment of slots to devices and find them all
133 */ 128 */
@@ -280,7 +275,11 @@ static int raid0_run (mddev_t *mddev)
280 mdk_rdev_t *rdev; 275 mdk_rdev_t *rdev;
281 struct list_head *tmp; 276 struct list_head *tmp;
282 277
283 printk("%s: setting max_sectors to %d, segment boundary to %d\n", 278 if (mddev->chunk_size == 0) {
279 printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
280 return -EINVAL;
281 }
282 printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
284 mdname(mddev), 283 mdname(mddev),
285 mddev->chunk_size >> 9, 284 mddev->chunk_size >> 9,
286 (mddev->chunk_size>>1)-1); 285 (mddev->chunk_size>>1)-1);
@@ -361,7 +360,7 @@ static int raid0_run (mddev_t *mddev)
361 * chunksize should be used in that case. 360 * chunksize should be used in that case.
362 */ 361 */
363 { 362 {
364 int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; 363 int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
365 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 364 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
366 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 365 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
367 } 366 }
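
Since PAGE_CACHE_SIZE and PAGE_SIZE are equal, the readahead hunk above is a clarification, not a behaviour change. The sizing itself deserves a worked example: with 4 disks and 64 KiB chunks on 4 KiB pages, a full stripe is 64 pages, so readahead is raised to at least 128 pages (512 KiB), i.e. two full stripes:

    #include <stdio.h>

    int main(void)
    {
            int raid_disks = 4;
            long chunk_size = 64 * 1024;   /* bytes */
            long page_size  = 4096;

            long stripe = raid_disks * chunk_size / page_size;
            printf("stripe = %ld pages, ra_pages >= %ld pages (%ld KiB)\n",
                   stripe, 2 * stripe, 2 * stripe * page_size / 1024);
            return 0;
    }
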
@@ -512,9 +511,10 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
512 return; 511 return;
513} 512}
514 513
515static mdk_personality_t raid0_personality= 514static struct mdk_personality raid0_personality=
516{ 515{
517 .name = "raid0", 516 .name = "raid0",
517 .level = 0,
518 .owner = THIS_MODULE, 518 .owner = THIS_MODULE,
519 .make_request = raid0_make_request, 519 .make_request = raid0_make_request,
520 .run = raid0_run, 520 .run = raid0_run,
@@ -524,15 +524,17 @@ static mdk_personality_t raid0_personality=
524 524
525static int __init raid0_init (void) 525static int __init raid0_init (void)
526{ 526{
527 return register_md_personality (RAID0, &raid0_personality); 527 return register_md_personality (&raid0_personality);
528} 528}
529 529
530static void raid0_exit (void) 530static void raid0_exit (void)
531{ 531{
532 unregister_md_personality (RAID0); 532 unregister_md_personality (&raid0_personality);
533} 533}
534 534
535module_init(raid0_init); 535module_init(raid0_init);
536module_exit(raid0_exit); 536module_exit(raid0_exit);
537MODULE_LICENSE("GPL"); 537MODULE_LICENSE("GPL");
538MODULE_ALIAS("md-personality-2"); /* RAID0 */ 538MODULE_ALIAS("md-personality-2"); /* RAID0 */
539MODULE_ALIAS("md-raid0");
540MODULE_ALIAS("md-level-0");
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 229d7b204297..a06ff91f27e2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -47,10 +47,11 @@
47 */ 47 */
48#define NR_RAID1_BIOS 256 48#define NR_RAID1_BIOS 256
49 49
50static mdk_personality_t raid1_personality;
51 50
52static void unplug_slaves(mddev_t *mddev); 51static void unplug_slaves(mddev_t *mddev);
53 52
53static void allow_barrier(conf_t *conf);
54static void lower_barrier(conf_t *conf);
54 55
55static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 56static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
56{ 57{
@@ -59,10 +60,8 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
59 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 60 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
60 61
61 /* allocate a r1bio with room for raid_disks entries in the bios array */ 62 /* allocate a r1bio with room for raid_disks entries in the bios array */
62 r1_bio = kmalloc(size, gfp_flags); 63 r1_bio = kzalloc(size, gfp_flags);
63 if (r1_bio) 64 if (!r1_bio)
64 memset(r1_bio, 0, size);
65 else
66 unplug_slaves(pi->mddev); 65 unplug_slaves(pi->mddev);
67 66
68 return r1_bio; 67 return r1_bio;
@@ -104,15 +103,30 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
104 } 103 }
105 /* 104 /*
106 * Allocate RESYNC_PAGES data pages and attach them to 105 * Allocate RESYNC_PAGES data pages and attach them to
107 * the first bio; 106 * the first bio.
107 * If this is a user-requested check/repair, allocate
108 * RESYNC_PAGES for each bio.
108 */ 109 */
109 bio = r1_bio->bios[0]; 110 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
110 for (i = 0; i < RESYNC_PAGES; i++) { 111 j = pi->raid_disks;
111 page = alloc_page(gfp_flags); 112 else
112 if (unlikely(!page)) 113 j = 1;
113 goto out_free_pages; 114 while(j--) {
114 115 bio = r1_bio->bios[j];
115 bio->bi_io_vec[i].bv_page = page; 116 for (i = 0; i < RESYNC_PAGES; i++) {
117 page = alloc_page(gfp_flags);
118 if (unlikely(!page))
119 goto out_free_pages;
120
121 bio->bi_io_vec[i].bv_page = page;
122 }
123 }
 124 /* If not user-requested, copy the page pointers to all bios */
125 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
126 for (i=0; i<RESYNC_PAGES ; i++)
127 for (j=1; j<pi->raid_disks; j++)
128 r1_bio->bios[j]->bi_io_vec[i].bv_page =
129 r1_bio->bios[0]->bi_io_vec[i].bv_page;
116 } 130 }
117 131
118 r1_bio->master_bio = NULL; 132 r1_bio->master_bio = NULL;
@@ -120,8 +134,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
120 return r1_bio; 134 return r1_bio;
121 135
122out_free_pages: 136out_free_pages:
123 for ( ; i > 0 ; i--) 137 for (i=0; i < RESYNC_PAGES ; i++)
124 __free_page(bio->bi_io_vec[i-1].bv_page); 138 for (j=0 ; j < pi->raid_disks; j++)
139 safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
140 j = -1;
125out_free_bio: 141out_free_bio:
126 while ( ++j < pi->raid_disks ) 142 while ( ++j < pi->raid_disks )
127 bio_put(r1_bio->bios[j]); 143 bio_put(r1_bio->bios[j]);
@@ -132,14 +148,16 @@ out_free_bio:
132static void r1buf_pool_free(void *__r1_bio, void *data) 148static void r1buf_pool_free(void *__r1_bio, void *data)
133{ 149{
134 struct pool_info *pi = data; 150 struct pool_info *pi = data;
135 int i; 151 int i,j;
136 r1bio_t *r1bio = __r1_bio; 152 r1bio_t *r1bio = __r1_bio;
137 struct bio *bio = r1bio->bios[0];
138 153
139 for (i = 0; i < RESYNC_PAGES; i++) { 154 for (i = 0; i < RESYNC_PAGES; i++)
140 __free_page(bio->bi_io_vec[i].bv_page); 155 for (j = pi->raid_disks; j-- ;) {
141 bio->bi_io_vec[i].bv_page = NULL; 156 if (j == 0 ||
142 } 157 r1bio->bios[j]->bi_io_vec[i].bv_page !=
158 r1bio->bios[0]->bi_io_vec[i].bv_page)
159 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
160 }
143 for (i=0 ; i < pi->raid_disks; i++) 161 for (i=0 ; i < pi->raid_disks; i++)
144 bio_put(r1bio->bios[i]); 162 bio_put(r1bio->bios[i]);
145 163
@@ -152,7 +170,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
152 170
153 for (i = 0; i < conf->raid_disks; i++) { 171 for (i = 0; i < conf->raid_disks; i++) {
154 struct bio **bio = r1_bio->bios + i; 172 struct bio **bio = r1_bio->bios + i;
155 if (*bio) 173 if (*bio && *bio != IO_BLOCKED)
156 bio_put(*bio); 174 bio_put(*bio);
157 *bio = NULL; 175 *bio = NULL;
158 } 176 }
@@ -160,20 +178,13 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
160 178
161static inline void free_r1bio(r1bio_t *r1_bio) 179static inline void free_r1bio(r1bio_t *r1_bio)
162{ 180{
163 unsigned long flags;
164
165 conf_t *conf = mddev_to_conf(r1_bio->mddev); 181 conf_t *conf = mddev_to_conf(r1_bio->mddev);
166 182
167 /* 183 /*
168 * Wake up any possible resync thread that waits for the device 184 * Wake up any possible resync thread that waits for the device
169 * to go idle. 185 * to go idle.
170 */ 186 */
171 spin_lock_irqsave(&conf->resync_lock, flags); 187 allow_barrier(conf);
172 if (!--conf->nr_pending) {
173 wake_up(&conf->wait_idle);
174 wake_up(&conf->wait_resume);
175 }
176 spin_unlock_irqrestore(&conf->resync_lock, flags);
177 188
178 put_all_bios(conf, r1_bio); 189 put_all_bios(conf, r1_bio);
179 mempool_free(r1_bio, conf->r1bio_pool); 190 mempool_free(r1_bio, conf->r1bio_pool);
@@ -182,22 +193,17 @@ static inline void free_r1bio(r1bio_t *r1_bio)
182static inline void put_buf(r1bio_t *r1_bio) 193static inline void put_buf(r1bio_t *r1_bio)
183{ 194{
184 conf_t *conf = mddev_to_conf(r1_bio->mddev); 195 conf_t *conf = mddev_to_conf(r1_bio->mddev);
185 unsigned long flags; 196 int i;
186 197
187 mempool_free(r1_bio, conf->r1buf_pool); 198 for (i=0; i<conf->raid_disks; i++) {
199 struct bio *bio = r1_bio->bios[i];
200 if (bio->bi_end_io)
201 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
202 }
188 203
189 spin_lock_irqsave(&conf->resync_lock, flags); 204 mempool_free(r1_bio, conf->r1buf_pool);
190 if (!conf->barrier)
191 BUG();
192 --conf->barrier;
193 wake_up(&conf->wait_resume);
194 wake_up(&conf->wait_idle);
195 205
196 if (!--conf->nr_pending) { 206 lower_barrier(conf);
197 wake_up(&conf->wait_idle);
198 wake_up(&conf->wait_resume);
199 }
200 spin_unlock_irqrestore(&conf->resync_lock, flags);
201} 207}
202 208
203static void reschedule_retry(r1bio_t *r1_bio) 209static void reschedule_retry(r1bio_t *r1_bio)
@@ -208,8 +214,10 @@ static void reschedule_retry(r1bio_t *r1_bio)
208 214
209 spin_lock_irqsave(&conf->device_lock, flags); 215 spin_lock_irqsave(&conf->device_lock, flags);
210 list_add(&r1_bio->retry_list, &conf->retry_list); 216 list_add(&r1_bio->retry_list, &conf->retry_list);
217 conf->nr_queued ++;
211 spin_unlock_irqrestore(&conf->device_lock, flags); 218 spin_unlock_irqrestore(&conf->device_lock, flags);
212 219
220 wake_up(&conf->wait_barrier);
213 md_wakeup_thread(mddev->thread); 221 md_wakeup_thread(mddev->thread);
214} 222}
215 223
@@ -261,9 +269,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
261 /* 269 /*
262 * this branch is our 'one mirror IO has finished' event handler: 270 * this branch is our 'one mirror IO has finished' event handler:
263 */ 271 */
264 if (!uptodate) 272 update_head_pos(mirror, r1_bio);
265 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 273
266 else 274 if (uptodate || conf->working_disks <= 1) {
267 /* 275 /*
268 * Set R1BIO_Uptodate in our master bio, so that 276 * Set R1BIO_Uptodate in our master bio, so that
269 * we will return a good error code for to the higher 277 * we will return a good error code for to the higher
@@ -273,16 +281,11 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
273 * user-side. So if something waits for IO, then it will 281 * user-side. So if something waits for IO, then it will
274 * wait for the 'master' bio. 282 * wait for the 'master' bio.
275 */ 283 */
276 set_bit(R1BIO_Uptodate, &r1_bio->state); 284 if (uptodate)
277 285 set_bit(R1BIO_Uptodate, &r1_bio->state);
278 update_head_pos(mirror, r1_bio);
279 286
280 /*
281 * we have only one bio on the read side
282 */
283 if (uptodate)
284 raid_end_bio_io(r1_bio); 287 raid_end_bio_io(r1_bio);
285 else { 288 } else {
286 /* 289 /*
287 * oops, read error: 290 * oops, read error:
288 */ 291 */
@@ -378,7 +381,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
378 /* free extra copy of the data pages */ 381 /* free extra copy of the data pages */
379 int i = bio->bi_vcnt; 382 int i = bio->bi_vcnt;
380 while (i--) 383 while (i--)
381 __free_page(bio->bi_io_vec[i].bv_page); 384 safe_put_page(bio->bi_io_vec[i].bv_page);
382 } 385 }
383 /* clear the bitmap if all writes complete successfully */ 386 /* clear the bitmap if all writes complete successfully */
384 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 387 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
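
safe_put_page(), adopted on error paths throughout this merge, is presumably a NULL-tolerant wrapper so that partially populated page arrays can be torn down without per-element checks, i.e. something like:

    /* sketch: drop a page reference, tolerating NULL */
    static inline void safe_put_page(struct page *p)
    {
            if (p)
                    put_page(p);
    }
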
@@ -433,11 +436,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
433 new_disk = 0; 436 new_disk = 0;
434 437
435 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 438 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
439 r1_bio->bios[new_disk] == IO_BLOCKED ||
436 !rdev || !test_bit(In_sync, &rdev->flags) 440 !rdev || !test_bit(In_sync, &rdev->flags)
437 || test_bit(WriteMostly, &rdev->flags); 441 || test_bit(WriteMostly, &rdev->flags);
438 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { 442 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
439 443
440 if (rdev && test_bit(In_sync, &rdev->flags)) 444 if (rdev && test_bit(In_sync, &rdev->flags) &&
445 r1_bio->bios[new_disk] != IO_BLOCKED)
441 wonly_disk = new_disk; 446 wonly_disk = new_disk;
442 447
443 if (new_disk == conf->raid_disks - 1) { 448 if (new_disk == conf->raid_disks - 1) {
@@ -451,11 +456,13 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
451 456
452 /* make sure the disk is operational */ 457 /* make sure the disk is operational */
453 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 458 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
459 r1_bio->bios[new_disk] == IO_BLOCKED ||
454 !rdev || !test_bit(In_sync, &rdev->flags) || 460 !rdev || !test_bit(In_sync, &rdev->flags) ||
455 test_bit(WriteMostly, &rdev->flags); 461 test_bit(WriteMostly, &rdev->flags);
456 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { 462 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
457 463
458 if (rdev && test_bit(In_sync, &rdev->flags)) 464 if (rdev && test_bit(In_sync, &rdev->flags) &&
465 r1_bio->bios[new_disk] != IO_BLOCKED)
459 wonly_disk = new_disk; 466 wonly_disk = new_disk;
460 467
461 if (new_disk <= 0) 468 if (new_disk <= 0)
@@ -492,7 +499,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
492 499
493 rdev = rcu_dereference(conf->mirrors[disk].rdev); 500 rdev = rcu_dereference(conf->mirrors[disk].rdev);
494 501
495 if (!rdev || 502 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
496 !test_bit(In_sync, &rdev->flags) || 503 !test_bit(In_sync, &rdev->flags) ||
497 test_bit(WriteMostly, &rdev->flags)) 504 test_bit(WriteMostly, &rdev->flags))
498 continue; 505 continue;
@@ -520,7 +527,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
520 /* cannot risk returning a device that failed 527 /* cannot risk returning a device that failed
521 * before we inc'ed nr_pending 528 * before we inc'ed nr_pending
522 */ 529 */
523 atomic_dec(&rdev->nr_pending); 530 rdev_dec_pending(rdev, conf->mddev);
524 goto retry; 531 goto retry;
525 } 532 }
526 conf->next_seq_sect = this_sector + sectors; 533 conf->next_seq_sect = this_sector + sectors;
@@ -593,42 +600,119 @@ static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
593 return ret; 600 return ret;
594} 601}
595 602
596/* 603/* Barriers....
597 * Throttle resync depth, so that we can both get proper overlapping of 604 * Sometimes we need to suspend IO while we do something else,
598 * requests, but are still able to handle normal requests quickly. 605 * either some resync/recovery, or reconfigure the array.
606 * To do this we raise a 'barrier'.
607 * The 'barrier' is a counter that can be raised multiple times
608 * to count how many activities are happening which preclude
609 * normal IO.
610 * We can only raise the barrier if there is no pending IO.
611 * i.e. if nr_pending == 0.
612 * We choose only to raise the barrier if no-one is waiting for the
613 * barrier to go down. This means that as soon as an IO request
614 * is ready, no other operations which require a barrier will start
615 * until the IO request has had a chance.
616 *
617 * So: regular IO calls 'wait_barrier'. When that returns there
 618 * is no background IO happening. It must arrange to call
 619 * allow_barrier when it has finished its IO.
 620 * background IO must call raise_barrier. Once that returns
 621 * there is no normal IO happening. It must arrange to call
622 * lower_barrier when the particular background IO completes.
599 */ 623 */
600#define RESYNC_DEPTH 32 624#define RESYNC_DEPTH 32
601 625
602static void device_barrier(conf_t *conf, sector_t sect) 626static void raise_barrier(conf_t *conf)
603{ 627{
604 spin_lock_irq(&conf->resync_lock); 628 spin_lock_irq(&conf->resync_lock);
605 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 629
606 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 630 /* Wait until no block IO is waiting */
607 631 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
608 if (!conf->barrier++) { 632 conf->resync_lock,
609 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 633 raid1_unplug(conf->mddev->queue));
610 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 634
611 if (conf->nr_pending) 635 /* block any new IO from starting */
612 BUG(); 636 conf->barrier++;
637
 638 /* Now wait for all pending IO to complete */
639 wait_event_lock_irq(conf->wait_barrier,
640 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
641 conf->resync_lock,
642 raid1_unplug(conf->mddev->queue));
643
644 spin_unlock_irq(&conf->resync_lock);
645}
646
647static void lower_barrier(conf_t *conf)
648{
649 unsigned long flags;
650 spin_lock_irqsave(&conf->resync_lock, flags);
651 conf->barrier--;
652 spin_unlock_irqrestore(&conf->resync_lock, flags);
653 wake_up(&conf->wait_barrier);
654}
655
656static void wait_barrier(conf_t *conf)
657{
658 spin_lock_irq(&conf->resync_lock);
659 if (conf->barrier) {
660 conf->nr_waiting++;
661 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
662 conf->resync_lock,
663 raid1_unplug(conf->mddev->queue));
664 conf->nr_waiting--;
613 } 665 }
614 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 666 conf->nr_pending++;
615 conf->resync_lock, raid1_unplug(conf->mddev->queue)); 667 spin_unlock_irq(&conf->resync_lock);
616 conf->next_resync = sect; 668}
669
670static void allow_barrier(conf_t *conf)
671{
672 unsigned long flags;
673 spin_lock_irqsave(&conf->resync_lock, flags);
674 conf->nr_pending--;
675 spin_unlock_irqrestore(&conf->resync_lock, flags);
676 wake_up(&conf->wait_barrier);
677}
678
679static void freeze_array(conf_t *conf)
680{
681 /* stop syncio and normal IO and wait for everything to
682 * go quite.
683 * We increment barrier and nr_waiting, and then
684 * wait until barrier+nr_pending match nr_queued+2
685 */
686 spin_lock_irq(&conf->resync_lock);
687 conf->barrier++;
688 conf->nr_waiting++;
689 wait_event_lock_irq(conf->wait_barrier,
690 conf->barrier+conf->nr_pending == conf->nr_queued+2,
691 conf->resync_lock,
692 raid1_unplug(conf->mddev->queue));
693 spin_unlock_irq(&conf->resync_lock);
694}
695static void unfreeze_array(conf_t *conf)
696{
697 /* reverse the effect of the freeze */
698 spin_lock_irq(&conf->resync_lock);
699 conf->barrier--;
700 conf->nr_waiting--;
701 wake_up(&conf->wait_barrier);
617 spin_unlock_irq(&conf->resync_lock); 702 spin_unlock_irq(&conf->resync_lock);
618} 703}
619 704
705
620/* duplicate the data pages for behind I/O */ 706/* duplicate the data pages for behind I/O */
621static struct page **alloc_behind_pages(struct bio *bio) 707static struct page **alloc_behind_pages(struct bio *bio)
622{ 708{
623 int i; 709 int i;
624 struct bio_vec *bvec; 710 struct bio_vec *bvec;
625 struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), 711 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
626 GFP_NOIO); 712 GFP_NOIO);
627 if (unlikely(!pages)) 713 if (unlikely(!pages))
628 goto do_sync_io; 714 goto do_sync_io;
629 715
630 memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
631
632 bio_for_each_segment(bvec, bio, i) { 716 bio_for_each_segment(bvec, bio, i) {
633 pages[i] = alloc_page(GFP_NOIO); 717 pages[i] = alloc_page(GFP_NOIO);
634 if (unlikely(!pages[i])) 718 if (unlikely(!pages[i]))
@@ -644,7 +728,7 @@ static struct page **alloc_behind_pages(struct bio *bio)
644do_sync_io: 728do_sync_io:
645 if (pages) 729 if (pages)
646 for (i = 0; i < bio->bi_vcnt && pages[i]; i++) 730 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
647 __free_page(pages[i]); 731 put_page(pages[i]);
648 kfree(pages); 732 kfree(pages);
649 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 733 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
650 return NULL; 734 return NULL;
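
The raise/lower/wait/allow quartet above is the core of the new synchronisation in this merge. Stripped of kernel plumbing (wait_event_lock_irq, queue unplugging), the same counting scheme can be written with a mutex and condition variable; this self-contained user-space sketch keeps the kernel's field names but is otherwise illustrative:

    #include <pthread.h>

    #define RESYNC_DEPTH 32

    struct conf {
            pthread_mutex_t lock;
            pthread_cond_t  wait_barrier;
            int barrier;     /* background activities (resync etc.) in progress */
            int nr_pending;  /* normal IO requests in flight */
            int nr_waiting;  /* normal IO requests blocked on the barrier */
    } conf = {
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .wait_barrier = PTHREAD_COND_INITIALIZER,
    };

    /* background IO side */
    void raise_barrier(struct conf *c)
    {
            pthread_mutex_lock(&c->lock);
            while (c->nr_waiting)              /* queued normal IO goes first */
                    pthread_cond_wait(&c->wait_barrier, &c->lock);
            c->barrier++;                      /* block new normal IO */
            while (c->nr_pending || c->barrier >= RESYNC_DEPTH)
                    pthread_cond_wait(&c->wait_barrier, &c->lock);
            pthread_mutex_unlock(&c->lock);
    }

    void lower_barrier(struct conf *c)
    {
            pthread_mutex_lock(&c->lock);
            c->barrier--;
            pthread_mutex_unlock(&c->lock);
            pthread_cond_broadcast(&c->wait_barrier);
    }

    /* normal IO side: pair each wait_barrier() with an allow_barrier();
     * wakeups come from allow_barrier()/lower_barrier(), as in the kernel
     */
    void wait_barrier(struct conf *c)
    {
            pthread_mutex_lock(&c->lock);
            if (c->barrier) {
                    c->nr_waiting++;
                    while (c->barrier)
                            pthread_cond_wait(&c->wait_barrier, &c->lock);
                    c->nr_waiting--;
            }
            c->nr_pending++;
            pthread_mutex_unlock(&c->lock);
    }

    void allow_barrier(struct conf *c)
    {
            pthread_mutex_lock(&c->lock);
            c->nr_pending--;
            pthread_mutex_unlock(&c->lock);
            pthread_cond_broadcast(&c->wait_barrier);
    }

Note how close_sync() further down uses wait_barrier() immediately followed by allow_barrier() as a pure flush: it cannot return until any in-progress barrier has been lowered, and the back-to-back pairing leaves the counters balanced.
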
@@ -678,10 +762,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
678 */ 762 */
679 md_write_start(mddev, bio); /* wait on superblock update early */ 763 md_write_start(mddev, bio); /* wait on superblock update early */
680 764
681 spin_lock_irq(&conf->resync_lock); 765 wait_barrier(conf);
682 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
683 conf->nr_pending++;
684 spin_unlock_irq(&conf->resync_lock);
685 766
686 disk_stat_inc(mddev->gendisk, ios[rw]); 767 disk_stat_inc(mddev->gendisk, ios[rw]);
687 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 768 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -749,7 +830,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
749 !test_bit(Faulty, &rdev->flags)) { 830 !test_bit(Faulty, &rdev->flags)) {
750 atomic_inc(&rdev->nr_pending); 831 atomic_inc(&rdev->nr_pending);
751 if (test_bit(Faulty, &rdev->flags)) { 832 if (test_bit(Faulty, &rdev->flags)) {
752 atomic_dec(&rdev->nr_pending); 833 rdev_dec_pending(rdev, mddev);
753 r1_bio->bios[i] = NULL; 834 r1_bio->bios[i] = NULL;
754 } else 835 } else
755 r1_bio->bios[i] = bio; 836 r1_bio->bios[i] = bio;
@@ -909,13 +990,8 @@ static void print_conf(conf_t *conf)
909 990
910static void close_sync(conf_t *conf) 991static void close_sync(conf_t *conf)
911{ 992{
912 spin_lock_irq(&conf->resync_lock); 993 wait_barrier(conf);
913 wait_event_lock_irq(conf->wait_resume, !conf->barrier, 994 allow_barrier(conf);
914 conf->resync_lock, raid1_unplug(conf->mddev->queue));
915 spin_unlock_irq(&conf->resync_lock);
916
917 if (conf->barrier) BUG();
918 if (waitqueue_active(&conf->wait_idle)) BUG();
919 995
920 mempool_destroy(conf->r1buf_pool); 996 mempool_destroy(conf->r1buf_pool);
921 conf->r1buf_pool = NULL; 997 conf->r1buf_pool = NULL;
@@ -1015,28 +1091,27 @@ abort:
1015 1091
1016static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 1092static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1017{ 1093{
1018 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1019 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1094 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1020 conf_t *conf = mddev_to_conf(r1_bio->mddev); 1095 int i;
1021 1096
1022 if (bio->bi_size) 1097 if (bio->bi_size)
1023 return 1; 1098 return 1;
1024 1099
1025 if (r1_bio->bios[r1_bio->read_disk] != bio) 1100 for (i=r1_bio->mddev->raid_disks; i--; )
1026 BUG(); 1101 if (r1_bio->bios[i] == bio)
1027 update_head_pos(r1_bio->read_disk, r1_bio); 1102 break;
1103 BUG_ON(i < 0);
1104 update_head_pos(i, r1_bio);
1028 /* 1105 /*
1029 * we have read a block, now it needs to be re-written, 1106 * we have read a block, now it needs to be re-written,
1030 * or re-read if the read failed. 1107 * or re-read if the read failed.
1031 * We don't do much here, just schedule handling by raid1d 1108 * We don't do much here, just schedule handling by raid1d
1032 */ 1109 */
1033 if (!uptodate) { 1110 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1034 md_error(r1_bio->mddev,
1035 conf->mirrors[r1_bio->read_disk].rdev);
1036 } else
1037 set_bit(R1BIO_Uptodate, &r1_bio->state); 1111 set_bit(R1BIO_Uptodate, &r1_bio->state);
1038 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); 1112
1039 reschedule_retry(r1_bio); 1113 if (atomic_dec_and_test(&r1_bio->remaining))
1114 reschedule_retry(r1_bio);
1040 return 0; 1115 return 0;
1041} 1116}
1042 1117
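
end_sync_read() now runs once per mirror rather than once per request, so completion is funnelled through the r1_bio->remaining counter: every finishing bio decrements it and only the last one schedules the retry work. The "last one turns off the lights" pattern in isolation (C11 atomics stand in for the kernel's atomic_t):

    #include <stdatomic.h>
    #include <stdio.h>

    struct request {
            atomic_int remaining;    /* one count per outstanding sub-IO */
    };

    /* each sub-IO completion handler calls this */
    void sub_io_done(struct request *r)
    {
            /* fetch_sub returns the previous value: 1 means we were last */
            if (atomic_fetch_sub(&r->remaining, 1) == 1)
                    printf("all sub-IOs complete: schedule follow-up work\n");
    }

    int main(void)
    {
            struct request r;
            atomic_init(&r.remaining, 3);   /* e.g. three read targets */
            sub_io_done(&r);
            sub_io_done(&r);
            sub_io_done(&r);                /* only this call fires */
            return 0;
    }
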
@@ -1066,7 +1141,6 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1066 md_done_sync(mddev, r1_bio->sectors, uptodate); 1141 md_done_sync(mddev, r1_bio->sectors, uptodate);
1067 put_buf(r1_bio); 1142 put_buf(r1_bio);
1068 } 1143 }
1069 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
1070 return 0; 1144 return 0;
1071} 1145}
1072 1146
@@ -1079,34 +1153,173 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1079 1153
1080 bio = r1_bio->bios[r1_bio->read_disk]; 1154 bio = r1_bio->bios[r1_bio->read_disk];
1081 1155
1082/* 1156
1083 if (r1_bio->sector == 0) printk("First sync write startss\n"); 1157 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1084*/ 1158 /* We have read all readable devices. If we haven't
1085 /* 1159 * got the block, then there is no hope left.
1086 * schedule writes 1160 * If we have, then we want to do a comparison
1087 */ 1161 * and skip the write if everything is the same.
1162 * If any blocks failed to read, then we need to
1163 * attempt an over-write
1164 */
1165 int primary;
1166 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1167 for (i=0; i<mddev->raid_disks; i++)
1168 if (r1_bio->bios[i]->bi_end_io == end_sync_read)
1169 md_error(mddev, conf->mirrors[i].rdev);
1170
1171 md_done_sync(mddev, r1_bio->sectors, 1);
1172 put_buf(r1_bio);
1173 return;
1174 }
1175 for (primary=0; primary<mddev->raid_disks; primary++)
1176 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1177 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1178 r1_bio->bios[primary]->bi_end_io = NULL;
1179 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1180 break;
1181 }
1182 r1_bio->read_disk = primary;
1183 for (i=0; i<mddev->raid_disks; i++)
1184 if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
1185 test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
1186 int j;
1187 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1188 struct bio *pbio = r1_bio->bios[primary];
1189 struct bio *sbio = r1_bio->bios[i];
1190 for (j = vcnt; j-- ; )
1191 if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
1192 page_address(sbio->bi_io_vec[j].bv_page),
1193 PAGE_SIZE))
1194 break;
1195 if (j >= 0)
1196 mddev->resync_mismatches += r1_bio->sectors;
1197 if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
1198 sbio->bi_end_io = NULL;
1199 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1200 } else {
1201 /* fixup the bio for reuse */
1202 sbio->bi_vcnt = vcnt;
1203 sbio->bi_size = r1_bio->sectors << 9;
1204 sbio->bi_idx = 0;
1205 sbio->bi_phys_segments = 0;
1206 sbio->bi_hw_segments = 0;
1207 sbio->bi_hw_front_size = 0;
1208 sbio->bi_hw_back_size = 0;
1209 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1210 sbio->bi_flags |= 1 << BIO_UPTODATE;
1211 sbio->bi_next = NULL;
1212 sbio->bi_sector = r1_bio->sector +
1213 conf->mirrors[i].rdev->data_offset;
1214 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1215 }
1216 }
1217 }
1088 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1218 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1089 /* 1219 /* ouch - failed to read all of that.
1090 * There is no point trying a read-for-reconstruct as 1220 * Try some synchronous reads of other devices to get
1091 * reconstruct is about to be aborted 1221 * good data, much like with normal read errors. Only
 1222 * read into the pages we already have so that we don't
1223 * need to re-issue the read request.
1224 * We don't need to freeze the array, because being in an
1225 * active sync request, there is no normal IO, and
1226 * no overlapping syncs.
1092 */ 1227 */
1093 char b[BDEVNAME_SIZE]; 1228 sector_t sect = r1_bio->sector;
1094 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1229 int sectors = r1_bio->sectors;
1095 " for block %llu\n", 1230 int idx = 0;
1096 bdevname(bio->bi_bdev,b), 1231
1097 (unsigned long long)r1_bio->sector); 1232 while(sectors) {
1098 md_done_sync(mddev, r1_bio->sectors, 0); 1233 int s = sectors;
1099 put_buf(r1_bio); 1234 int d = r1_bio->read_disk;
1100 return; 1235 int success = 0;
1236 mdk_rdev_t *rdev;
1237
1238 if (s > (PAGE_SIZE>>9))
1239 s = PAGE_SIZE >> 9;
1240 do {
1241 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1242 rdev = conf->mirrors[d].rdev;
1243 if (sync_page_io(rdev->bdev,
1244 sect + rdev->data_offset,
1245 s<<9,
1246 bio->bi_io_vec[idx].bv_page,
1247 READ)) {
1248 success = 1;
1249 break;
1250 }
1251 }
1252 d++;
1253 if (d == conf->raid_disks)
1254 d = 0;
1255 } while (!success && d != r1_bio->read_disk);
1256
1257 if (success) {
1258 int start = d;
1259 /* write it back and re-read */
1260 set_bit(R1BIO_Uptodate, &r1_bio->state);
1261 while (d != r1_bio->read_disk) {
1262 if (d == 0)
1263 d = conf->raid_disks;
1264 d--;
1265 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1266 continue;
1267 rdev = conf->mirrors[d].rdev;
1268 atomic_add(s, &rdev->corrected_errors);
1269 if (sync_page_io(rdev->bdev,
1270 sect + rdev->data_offset,
1271 s<<9,
1272 bio->bi_io_vec[idx].bv_page,
1273 WRITE) == 0)
1274 md_error(mddev, rdev);
1275 }
1276 d = start;
1277 while (d != r1_bio->read_disk) {
1278 if (d == 0)
1279 d = conf->raid_disks;
1280 d--;
1281 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1282 continue;
1283 rdev = conf->mirrors[d].rdev;
1284 if (sync_page_io(rdev->bdev,
1285 sect + rdev->data_offset,
1286 s<<9,
1287 bio->bi_io_vec[idx].bv_page,
1288 READ) == 0)
1289 md_error(mddev, rdev);
1290 }
1291 } else {
1292 char b[BDEVNAME_SIZE];
1293 /* Cannot read from anywhere, array is toast */
1294 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1295 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
1296 " for block %llu\n",
1297 bdevname(bio->bi_bdev,b),
1298 (unsigned long long)r1_bio->sector);
1299 md_done_sync(mddev, r1_bio->sectors, 0);
1300 put_buf(r1_bio);
1301 return;
1302 }
1303 sectors -= s;
1304 sect += s;
1305 idx ++;
1306 }
1101 } 1307 }
1102 1308
1309 /*
1310 * schedule writes
1311 */
1103 atomic_set(&r1_bio->remaining, 1); 1312 atomic_set(&r1_bio->remaining, 1);
1104 for (i = 0; i < disks ; i++) { 1313 for (i = 0; i < disks ; i++) {
1105 wbio = r1_bio->bios[i]; 1314 wbio = r1_bio->bios[i];
1106 if (wbio->bi_end_io != end_sync_write) 1315 if (wbio->bi_end_io == NULL ||
1316 (wbio->bi_end_io == end_sync_read &&
1317 (i == r1_bio->read_disk ||
1318 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1107 continue; 1319 continue;
1108 1320
1109 atomic_inc(&conf->mirrors[i].rdev->nr_pending); 1321 wbio->bi_rw = WRITE;
1322 wbio->bi_end_io = end_sync_write;
1110 atomic_inc(&r1_bio->remaining); 1323 atomic_inc(&r1_bio->remaining);
1111 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 1324 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1112 1325
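
For a user-requested check/repair pass, the rewritten sync_request_write() above reads every device and compares each resync page against the first good copy (the "primary"); identical copies are dropped from the write phase, mismatches bump resync_mismatches, and under MD_RECOVERY_CHECK nothing is rewritten at all. The comparison step is just a page-wise memcmp, as in this detached sketch (names and types are illustrative):

    #include <stddef.h>
    #include <string.h>

    /* Count pages of `secondary` that differ from the primary copy.
     * A repair pass would rewrite the device iff this returns nonzero.
     */
    static int count_mismatched_pages(char *const *primary,
                                      char *const *secondary,
                                      int vcnt, size_t page_size)
    {
            int j, mismatches = 0;

            for (j = 0; j < vcnt; j++)
                    if (memcmp(primary[j], secondary[j], page_size) != 0)
                            mismatches++;
            return mismatches;
    }
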
@@ -1167,6 +1380,7 @@ static void raid1d(mddev_t *mddev)
1167 break; 1380 break;
1168 r1_bio = list_entry(head->prev, r1bio_t, retry_list); 1381 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1169 list_del(head->prev); 1382 list_del(head->prev);
1383 conf->nr_queued--;
1170 spin_unlock_irqrestore(&conf->device_lock, flags); 1384 spin_unlock_irqrestore(&conf->device_lock, flags);
1171 1385
1172 mddev = r1_bio->mddev; 1386 mddev = r1_bio->mddev;
@@ -1206,6 +1420,86 @@ static void raid1d(mddev_t *mddev)
1206 } 1420 }
1207 } else { 1421 } else {
1208 int disk; 1422 int disk;
1423
1424 /* we got a read error. Maybe the drive is bad. Maybe just
1425 * the block and we can fix it.
1426 * We freeze all other IO, and try reading the block from
1427 * other devices. When we find one, we re-write
 1428 * and re-read, checking that this fixes the read error.
1429 * This is all done synchronously while the array is
1430 * frozen
1431 */
1432 sector_t sect = r1_bio->sector;
1433 int sectors = r1_bio->sectors;
1434 freeze_array(conf);
1435 if (mddev->ro == 0) while(sectors) {
1436 int s = sectors;
1437 int d = r1_bio->read_disk;
1438 int success = 0;
1439
1440 if (s > (PAGE_SIZE>>9))
1441 s = PAGE_SIZE >> 9;
1442
1443 do {
1444 rdev = conf->mirrors[d].rdev;
1445 if (rdev &&
1446 test_bit(In_sync, &rdev->flags) &&
1447 sync_page_io(rdev->bdev,
1448 sect + rdev->data_offset,
1449 s<<9,
1450 conf->tmppage, READ))
1451 success = 1;
1452 else {
1453 d++;
1454 if (d == conf->raid_disks)
1455 d = 0;
1456 }
1457 } while (!success && d != r1_bio->read_disk);
1458
1459 if (success) {
1460 /* write it back and re-read */
1461 int start = d;
1462 while (d != r1_bio->read_disk) {
1463 if (d==0)
1464 d = conf->raid_disks;
1465 d--;
1466 rdev = conf->mirrors[d].rdev;
1467 atomic_add(s, &rdev->corrected_errors);
1468 if (rdev &&
1469 test_bit(In_sync, &rdev->flags)) {
1470 if (sync_page_io(rdev->bdev,
1471 sect + rdev->data_offset,
1472 s<<9, conf->tmppage, WRITE) == 0)
1473 /* Well, this device is dead */
1474 md_error(mddev, rdev);
1475 }
1476 }
1477 d = start;
1478 while (d != r1_bio->read_disk) {
1479 if (d==0)
1480 d = conf->raid_disks;
1481 d--;
1482 rdev = conf->mirrors[d].rdev;
1483 if (rdev &&
1484 test_bit(In_sync, &rdev->flags)) {
1485 if (sync_page_io(rdev->bdev,
1486 sect + rdev->data_offset,
1487 s<<9, conf->tmppage, READ) == 0)
1488 /* Well, this device is dead */
1489 md_error(mddev, rdev);
1490 }
1491 }
1492 } else {
1493 /* Cannot read from anywhere -- bye bye array */
1494 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1495 break;
1496 }
1497 sectors -= s;
1498 sect += s;
1499 }
1500
1501 unfreeze_array(conf);
1502
1209 bio = r1_bio->bios[r1_bio->read_disk]; 1503 bio = r1_bio->bios[r1_bio->read_disk];
1210 if ((disk=read_balance(conf, r1_bio)) == -1) { 1504 if ((disk=read_balance(conf, r1_bio)) == -1) {
1211 printk(KERN_ALERT "raid1: %s: unrecoverable I/O" 1505 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1214,7 +1508,8 @@ static void raid1d(mddev_t *mddev)
1214 (unsigned long long)r1_bio->sector); 1508 (unsigned long long)r1_bio->sector);
1215 raid_end_bio_io(r1_bio); 1509 raid_end_bio_io(r1_bio);
1216 } else { 1510 } else {
1217 r1_bio->bios[r1_bio->read_disk] = NULL; 1511 r1_bio->bios[r1_bio->read_disk] =
1512 mddev->ro ? IO_BLOCKED : NULL;
1218 r1_bio->read_disk = disk; 1513 r1_bio->read_disk = disk;
1219 bio_put(bio); 1514 bio_put(bio);
1220 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1515 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
@@ -1269,14 +1564,13 @@ static int init_resync(conf_t *conf)
1269static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1564static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1270{ 1565{
1271 conf_t *conf = mddev_to_conf(mddev); 1566 conf_t *conf = mddev_to_conf(mddev);
1272 mirror_info_t *mirror;
1273 r1bio_t *r1_bio; 1567 r1bio_t *r1_bio;
1274 struct bio *bio; 1568 struct bio *bio;
1275 sector_t max_sector, nr_sectors; 1569 sector_t max_sector, nr_sectors;
1276 int disk; 1570 int disk = -1;
1277 int i; 1571 int i;
1278 int wonly; 1572 int wonly = -1;
1279 int write_targets = 0; 1573 int write_targets = 0, read_targets = 0;
1280 int sync_blocks; 1574 int sync_blocks;
1281 int still_degraded = 0; 1575 int still_degraded = 0;
1282 1576
@@ -1317,55 +1611,35 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1317 return sync_blocks; 1611 return sync_blocks;
1318 } 1612 }
1319 /* 1613 /*
1320 * If there is non-resync activity waiting for us then 1614 * If there is non-resync activity waiting for a turn,
1321 * put in a delay to throttle resync. 1615 * and resync is going fast enough,
 1616 * then let it through before starting on this new sync request.
1322 */ 1617 */
1323 if (!go_faster && waitqueue_active(&conf->wait_resume)) 1618 if (!go_faster && conf->nr_waiting)
1324 msleep_interruptible(1000); 1619 msleep_interruptible(1000);
1325 device_barrier(conf, sector_nr + RESYNC_SECTORS);
1326
1327 /*
1328 * If reconstructing, and >1 working disc,
1329 * could dedicate one to rebuild and others to
1330 * service read requests ..
1331 */
1332 disk = conf->last_used;
1333 /* make sure disk is operational */
1334 wonly = disk;
1335 while (conf->mirrors[disk].rdev == NULL ||
1336 !test_bit(In_sync, &conf->mirrors[disk].rdev->flags) ||
1337 test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
1338 ) {
1339 if (conf->mirrors[disk].rdev &&
1340 test_bit(In_sync, &conf->mirrors[disk].rdev->flags))
1341 wonly = disk;
1342 if (disk <= 0)
1343 disk = conf->raid_disks;
1344 disk--;
1345 if (disk == conf->last_used) {
1346 disk = wonly;
1347 break;
1348 }
1349 }
1350 conf->last_used = disk;
1351 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
1352 1620
1621 raise_barrier(conf);
1353 1622
1354 mirror = conf->mirrors + disk; 1623 conf->next_resync = sector_nr;
1355 1624
1356 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); 1625 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1357 1626 rcu_read_lock();
1358 spin_lock_irq(&conf->resync_lock); 1627 /*
1359 conf->nr_pending++; 1628 * If we get a correctably read error during resync or recovery,
1360 spin_unlock_irq(&conf->resync_lock); 1629 * we might want to read from a different device. So we
1630 * flag all drives that could conceivably be read from for READ,
1631 * and any others (which will be non-In_sync devices) for WRITE.
1632 * If a read fails, we try reading from something else for which READ
1633 * is OK.
1634 */
1361 1635
1362 r1_bio->mddev = mddev; 1636 r1_bio->mddev = mddev;
1363 r1_bio->sector = sector_nr; 1637 r1_bio->sector = sector_nr;
1364 r1_bio->state = 0; 1638 r1_bio->state = 0;
1365 set_bit(R1BIO_IsSync, &r1_bio->state); 1639 set_bit(R1BIO_IsSync, &r1_bio->state);
1366 r1_bio->read_disk = disk;
1367 1640
1368 for (i=0; i < conf->raid_disks; i++) { 1641 for (i=0; i < conf->raid_disks; i++) {
1642 mdk_rdev_t *rdev;
1369 bio = r1_bio->bios[i]; 1643 bio = r1_bio->bios[i];
1370 1644
1371 /* take from bio_init */ 1645 /* take from bio_init */
@@ -1380,35 +1654,49 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1380 bio->bi_end_io = NULL; 1654 bio->bi_end_io = NULL;
1381 bio->bi_private = NULL; 1655 bio->bi_private = NULL;
1382 1656
1383 if (i == disk) { 1657 rdev = rcu_dereference(conf->mirrors[i].rdev);
1384 bio->bi_rw = READ; 1658 if (rdev == NULL ||
1385 bio->bi_end_io = end_sync_read; 1659 test_bit(Faulty, &rdev->flags)) {
1386 } else if (conf->mirrors[i].rdev == NULL ||
1387 test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
1388 still_degraded = 1; 1660 still_degraded = 1;
1389 continue; 1661 continue;
1390 } else if (!test_bit(In_sync, &conf->mirrors[i].rdev->flags) || 1662 } else if (!test_bit(In_sync, &rdev->flags)) {
1391 sector_nr + RESYNC_SECTORS > mddev->recovery_cp ||
1392 test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1393 bio->bi_rw = WRITE; 1663 bio->bi_rw = WRITE;
1394 bio->bi_end_io = end_sync_write; 1664 bio->bi_end_io = end_sync_write;
1395 write_targets ++; 1665 write_targets ++;
1396 } else 1666 } else {
1397 /* no need to read or write here */ 1667 /* may need to read from here */
1398 continue; 1668 bio->bi_rw = READ;
1399 bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset; 1669 bio->bi_end_io = end_sync_read;
1400 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1670 if (test_bit(WriteMostly, &rdev->flags)) {
1671 if (wonly < 0)
1672 wonly = i;
1673 } else {
1674 if (disk < 0)
1675 disk = i;
1676 }
1677 read_targets++;
1678 }
1679 atomic_inc(&rdev->nr_pending);
1680 bio->bi_sector = sector_nr + rdev->data_offset;
1681 bio->bi_bdev = rdev->bdev;
1401 bio->bi_private = r1_bio; 1682 bio->bi_private = r1_bio;
1402 } 1683 }
1684 rcu_read_unlock();
1685 if (disk < 0)
1686 disk = wonly;
1687 r1_bio->read_disk = disk;
1688
1689 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1690 /* extra read targets are also write targets */
1691 write_targets += read_targets-1;
1403 1692
1404 if (write_targets == 0) { 1693 if (write_targets == 0 || read_targets == 0) {
1405 /* There is nowhere to write, so all non-sync 1694 /* There is nowhere to write, so all non-sync
1406 * drives must be failed - so we are finished 1695 * drives must be failed - so we are finished
1407 */ 1696 */
1408 sector_t rv = max_sector - sector_nr; 1697 sector_t rv = max_sector - sector_nr;
1409 *skipped = 1; 1698 *skipped = 1;
1410 put_buf(r1_bio); 1699 put_buf(r1_bio);
1411 rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
1412 return rv; 1700 return rv;
1413 } 1701 }
1414 1702
@@ -1436,10 +1724,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1436 for (i=0 ; i < conf->raid_disks; i++) { 1724 for (i=0 ; i < conf->raid_disks; i++) {
1437 bio = r1_bio->bios[i]; 1725 bio = r1_bio->bios[i];
1438 if (bio->bi_end_io) { 1726 if (bio->bi_end_io) {
1439 page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page; 1727 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1440 if (bio_add_page(bio, page, len, 0) == 0) { 1728 if (bio_add_page(bio, page, len, 0) == 0) {
1441 /* stop here */ 1729 /* stop here */
1442 r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page; 1730 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1443 while (i > 0) { 1731 while (i > 0) {
1444 i--; 1732 i--;
1445 bio = r1_bio->bios[i]; 1733 bio = r1_bio->bios[i];
@@ -1459,12 +1747,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1459 sync_blocks -= (len>>9); 1747 sync_blocks -= (len>>9);
1460 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 1748 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1461 bio_full: 1749 bio_full:
1462 bio = r1_bio->bios[disk];
1463 r1_bio->sectors = nr_sectors; 1750 r1_bio->sectors = nr_sectors;
1464 1751
1465 md_sync_acct(mirror->rdev->bdev, nr_sectors); 1752 /* For a user-requested sync, we read all readable devices and do a
1753 * compare
1754 */
1755 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1756 atomic_set(&r1_bio->remaining, read_targets);
1757 for (i=0; i<conf->raid_disks; i++) {
1758 bio = r1_bio->bios[i];
1759 if (bio->bi_end_io == end_sync_read) {
1760 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
1761 generic_make_request(bio);
1762 }
1763 }
1764 } else {
1765 atomic_set(&r1_bio->remaining, 1);
1766 bio = r1_bio->bios[r1_bio->read_disk];
1767 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
1768 nr_sectors);
1769 generic_make_request(bio);
1466 1770
1467 generic_make_request(bio); 1771 }
1468 1772
1469 return nr_sectors; 1773 return nr_sectors;
1470} 1774}
@@ -1487,18 +1791,19 @@ static int run(mddev_t *mddev)
1487 * bookkeeping area. [whatever we allocate in run(), 1791 * bookkeeping area. [whatever we allocate in run(),
1488 * should be freed in stop()] 1792 * should be freed in stop()]
1489 */ 1793 */
1490 conf = kmalloc(sizeof(conf_t), GFP_KERNEL); 1794 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1491 mddev->private = conf; 1795 mddev->private = conf;
1492 if (!conf) 1796 if (!conf)
1493 goto out_no_mem; 1797 goto out_no_mem;
1494 1798
1495 memset(conf, 0, sizeof(*conf)); 1799 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1496 conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1497 GFP_KERNEL); 1800 GFP_KERNEL);
1498 if (!conf->mirrors) 1801 if (!conf->mirrors)
1499 goto out_no_mem; 1802 goto out_no_mem;
1500 1803
1501 memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); 1804 conf->tmppage = alloc_page(GFP_KERNEL);
1805 if (!conf->tmppage)
1806 goto out_no_mem;
1502 1807
1503 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1808 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1504 if (!conf->poolinfo) 1809 if (!conf->poolinfo)
@@ -1542,8 +1847,7 @@ static int run(mddev_t *mddev)
1542 mddev->recovery_cp = MaxSector; 1847 mddev->recovery_cp = MaxSector;
1543 1848
1544 spin_lock_init(&conf->resync_lock); 1849 spin_lock_init(&conf->resync_lock);
1545 init_waitqueue_head(&conf->wait_idle); 1850 init_waitqueue_head(&conf->wait_barrier);
1546 init_waitqueue_head(&conf->wait_resume);
1547 1851
1548 bio_list_init(&conf->pending_bio_list); 1852 bio_list_init(&conf->pending_bio_list);
1549 bio_list_init(&conf->flushing_bio_list); 1853 bio_list_init(&conf->flushing_bio_list);
@@ -1583,7 +1887,6 @@ static int run(mddev_t *mddev)
1583 mdname(mddev)); 1887 mdname(mddev));
1584 goto out_free_conf; 1888 goto out_free_conf;
1585 } 1889 }
1586 if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1587 1890
1588 printk(KERN_INFO 1891 printk(KERN_INFO
1589 "raid1: raid set %s active with %d out of %d mirrors\n", 1892 "raid1: raid set %s active with %d out of %d mirrors\n",
@@ -1608,6 +1911,7 @@ out_free_conf:
1608 if (conf->r1bio_pool) 1911 if (conf->r1bio_pool)
1609 mempool_destroy(conf->r1bio_pool); 1912 mempool_destroy(conf->r1bio_pool);
1610 kfree(conf->mirrors); 1913 kfree(conf->mirrors);
1914 safe_put_page(conf->tmppage);
1611 kfree(conf->poolinfo); 1915 kfree(conf->poolinfo);
1612 kfree(conf); 1916 kfree(conf);
1613 mddev->private = NULL; 1917 mddev->private = NULL;
@@ -1706,19 +2010,14 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1706 kfree(newpoolinfo); 2010 kfree(newpoolinfo);
1707 return -ENOMEM; 2011 return -ENOMEM;
1708 } 2012 }
1709 newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL); 2013 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
1710 if (!newmirrors) { 2014 if (!newmirrors) {
1711 kfree(newpoolinfo); 2015 kfree(newpoolinfo);
1712 mempool_destroy(newpool); 2016 mempool_destroy(newpool);
1713 return -ENOMEM; 2017 return -ENOMEM;
1714 } 2018 }
1715 memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
1716 2019
1717 spin_lock_irq(&conf->resync_lock); 2020 raise_barrier(conf);
1718 conf->barrier++;
1719 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1720 conf->resync_lock, raid1_unplug(mddev->queue));
1721 spin_unlock_irq(&conf->resync_lock);
1722 2021
1723 /* ok, everything is stopped */ 2022 /* ok, everything is stopped */
1724 oldpool = conf->r1bio_pool; 2023 oldpool = conf->r1bio_pool;
@@ -1738,12 +2037,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1738 conf->raid_disks = mddev->raid_disks = raid_disks; 2037 conf->raid_disks = mddev->raid_disks = raid_disks;
1739 2038
1740 conf->last_used = 0; /* just make sure it is in-range */ 2039 conf->last_used = 0; /* just make sure it is in-range */
1741 spin_lock_irq(&conf->resync_lock); 2040 lower_barrier(conf);
1742 conf->barrier--;
1743 spin_unlock_irq(&conf->resync_lock);
1744 wake_up(&conf->wait_resume);
1745 wake_up(&conf->wait_idle);
1746
1747 2041
1748 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2042 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1749 md_wakeup_thread(mddev->thread); 2043 md_wakeup_thread(mddev->thread);
@@ -1758,33 +2052,19 @@ static void raid1_quiesce(mddev_t *mddev, int state)
1758 2052
1759 switch(state) { 2053 switch(state) {
1760 case 1: 2054 case 1:
1761 spin_lock_irq(&conf->resync_lock); 2055 raise_barrier(conf);
1762 conf->barrier++;
1763 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1764 conf->resync_lock, raid1_unplug(mddev->queue));
1765 spin_unlock_irq(&conf->resync_lock);
1766 break; 2056 break;
1767 case 0: 2057 case 0:
1768 spin_lock_irq(&conf->resync_lock); 2058 lower_barrier(conf);
1769 conf->barrier--;
1770 spin_unlock_irq(&conf->resync_lock);
1771 wake_up(&conf->wait_resume);
1772 wake_up(&conf->wait_idle);
1773 break; 2059 break;
1774 } 2060 }
1775 if (mddev->thread) {
1776 if (mddev->bitmap)
1777 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1778 else
1779 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
1780 md_wakeup_thread(mddev->thread);
1781 }
1782} 2061}
1783 2062
1784 2063
1785static mdk_personality_t raid1_personality = 2064static struct mdk_personality raid1_personality =
1786{ 2065{
1787 .name = "raid1", 2066 .name = "raid1",
2067 .level = 1,
1788 .owner = THIS_MODULE, 2068 .owner = THIS_MODULE,
1789 .make_request = make_request, 2069 .make_request = make_request,
1790 .run = run, 2070 .run = run,
@@ -1802,15 +2082,17 @@ static mdk_personality_t raid1_personality =
1802 2082
1803static int __init raid_init(void) 2083static int __init raid_init(void)
1804{ 2084{
1805 return register_md_personality(RAID1, &raid1_personality); 2085 return register_md_personality(&raid1_personality);
1806} 2086}
1807 2087
1808static void raid_exit(void) 2088static void raid_exit(void)
1809{ 2089{
1810 unregister_md_personality(RAID1); 2090 unregister_md_personality(&raid1_personality);
1811} 2091}
1812 2092
1813module_init(raid_init); 2093module_init(raid_init);
1814module_exit(raid_exit); 2094module_exit(raid_exit);
1815MODULE_LICENSE("GPL"); 2095MODULE_LICENSE("GPL");
1816MODULE_ALIAS("md-personality-3"); /* RAID1 */ 2096MODULE_ALIAS("md-personality-3"); /* RAID1 */
2097MODULE_ALIAS("md-raid1");
2098MODULE_ALIAS("md-level-1");
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 713dc9c2c730..9e658e519a27 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,7 +18,9 @@
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21#include "dm-bio-list.h"
21#include <linux/raid/raid10.h> 22#include <linux/raid/raid10.h>
23#include <linux/raid/bitmap.h>
22 24
23/* 25/*
24 * RAID10 provides a combination of RAID0 and RAID1 functionality. 26 * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -47,6 +49,9 @@
47 49
48static void unplug_slaves(mddev_t *mddev); 50static void unplug_slaves(mddev_t *mddev);
49 51
52static void allow_barrier(conf_t *conf);
53static void lower_barrier(conf_t *conf);
54
50static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 55static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
51{ 56{
52 conf_t *conf = data; 57 conf_t *conf = data;
@@ -54,10 +59,8 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
54 int size = offsetof(struct r10bio_s, devs[conf->copies]); 59 int size = offsetof(struct r10bio_s, devs[conf->copies]);
55 60
56 /* allocate a r10bio with room for raid_disks entries in the bios array */ 61 /* allocate a r10bio with room for raid_disks entries in the bios array */
57 r10_bio = kmalloc(size, gfp_flags); 62 r10_bio = kzalloc(size, gfp_flags);
58 if (r10_bio) 63 if (!r10_bio)
59 memset(r10_bio, 0, size);
60 else
61 unplug_slaves(conf->mddev); 64 unplug_slaves(conf->mddev);
62 65
63 return r10_bio; 66 return r10_bio;
@@ -129,10 +132,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
129 132
130out_free_pages: 133out_free_pages:
131 for ( ; i > 0 ; i--) 134 for ( ; i > 0 ; i--)
132 __free_page(bio->bi_io_vec[i-1].bv_page); 135 safe_put_page(bio->bi_io_vec[i-1].bv_page);
133 while (j--) 136 while (j--)
134 for (i = 0; i < RESYNC_PAGES ; i++) 137 for (i = 0; i < RESYNC_PAGES ; i++)
135 __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 138 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
136 j = -1; 139 j = -1;
137out_free_bio: 140out_free_bio:
138 while ( ++j < nalloc ) 141 while ( ++j < nalloc )
@@ -152,7 +155,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data)
152 struct bio *bio = r10bio->devs[j].bio; 155 struct bio *bio = r10bio->devs[j].bio;
153 if (bio) { 156 if (bio) {
154 for (i = 0; i < RESYNC_PAGES; i++) { 157 for (i = 0; i < RESYNC_PAGES; i++) {
155 __free_page(bio->bi_io_vec[i].bv_page); 158 safe_put_page(bio->bi_io_vec[i].bv_page);
156 bio->bi_io_vec[i].bv_page = NULL; 159 bio->bi_io_vec[i].bv_page = NULL;
157 } 160 }
158 bio_put(bio); 161 bio_put(bio);
@@ -167,7 +170,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
167 170
168 for (i = 0; i < conf->copies; i++) { 171 for (i = 0; i < conf->copies; i++) {
169 struct bio **bio = & r10_bio->devs[i].bio; 172 struct bio **bio = & r10_bio->devs[i].bio;
170 if (*bio) 173 if (*bio && *bio != IO_BLOCKED)
171 bio_put(*bio); 174 bio_put(*bio);
172 *bio = NULL; 175 *bio = NULL;
173 } 176 }
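IO_BLOCKED, tested above, is a sentinel: a small non-NULL pointer constant stored in a bio slot so that cleanup and read-balancing code can tell "no bio here" apart from "this slot is blocked, do not read from it". A minimal stand-alone sketch of the pattern (put_slot and the bio_put callback are illustrative, not kernel API):

struct bio;                                     /* opaque for this sketch */

/* Sentinel: a pointer value no allocator will ever return. */
#define IO_BLOCKED ((struct bio *)1)

static void put_slot(struct bio **slot, void (*bio_put)(struct bio *))
{
        if (*slot && *slot != IO_BLOCKED)       /* only real bios are released */
                bio_put(*slot);
        *slot = NULL;                           /* the sentinel is cleared too */
}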
@@ -175,20 +178,13 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
175 178
176static inline void free_r10bio(r10bio_t *r10_bio) 179static inline void free_r10bio(r10bio_t *r10_bio)
177{ 180{
178 unsigned long flags;
179
180 conf_t *conf = mddev_to_conf(r10_bio->mddev); 181 conf_t *conf = mddev_to_conf(r10_bio->mddev);
181 182
182 /* 183 /*
183 * Wake up any possible resync thread that waits for the device 184 * Wake up any possible resync thread that waits for the device
184 * to go idle. 185 * to go idle.
185 */ 186 */
186 spin_lock_irqsave(&conf->resync_lock, flags); 187 allow_barrier(conf);
187 if (!--conf->nr_pending) {
188 wake_up(&conf->wait_idle);
189 wake_up(&conf->wait_resume);
190 }
191 spin_unlock_irqrestore(&conf->resync_lock, flags);
192 188
193 put_all_bios(conf, r10_bio); 189 put_all_bios(conf, r10_bio);
194 mempool_free(r10_bio, conf->r10bio_pool); 190 mempool_free(r10_bio, conf->r10bio_pool);
@@ -197,22 +193,10 @@ static inline void free_r10bio(r10bio_t *r10_bio)
197static inline void put_buf(r10bio_t *r10_bio) 193static inline void put_buf(r10bio_t *r10_bio)
198{ 194{
199 conf_t *conf = mddev_to_conf(r10_bio->mddev); 195 conf_t *conf = mddev_to_conf(r10_bio->mddev);
200 unsigned long flags;
201 196
202 mempool_free(r10_bio, conf->r10buf_pool); 197 mempool_free(r10_bio, conf->r10buf_pool);
203 198
204 spin_lock_irqsave(&conf->resync_lock, flags); 199 lower_barrier(conf);
205 if (!conf->barrier)
206 BUG();
207 --conf->barrier;
208 wake_up(&conf->wait_resume);
209 wake_up(&conf->wait_idle);
210
211 if (!--conf->nr_pending) {
212 wake_up(&conf->wait_idle);
213 wake_up(&conf->wait_resume);
214 }
215 spin_unlock_irqrestore(&conf->resync_lock, flags);
216} 200}
217 201
218static void reschedule_retry(r10bio_t *r10_bio) 202static void reschedule_retry(r10bio_t *r10_bio)
@@ -223,6 +207,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
223 207
224 spin_lock_irqsave(&conf->device_lock, flags); 208 spin_lock_irqsave(&conf->device_lock, flags);
225 list_add(&r10_bio->retry_list, &conf->retry_list); 209 list_add(&r10_bio->retry_list, &conf->retry_list);
210 conf->nr_queued ++;
226 spin_unlock_irqrestore(&conf->device_lock, flags); 211 spin_unlock_irqrestore(&conf->device_lock, flags);
227 212
228 md_wakeup_thread(mddev->thread); 213 md_wakeup_thread(mddev->thread);
@@ -268,9 +253,9 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
268 /* 253 /*
269 * this branch is our 'one mirror IO has finished' event handler: 254 * this branch is our 'one mirror IO has finished' event handler:
270 */ 255 */
271 if (!uptodate) 256 update_head_pos(slot, r10_bio);
272 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 257
273 else 258 if (uptodate) {
274 /* 259 /*
275 * Set R10BIO_Uptodate in our master bio, so that 260 * Set R10BIO_Uptodate in our master bio, so that
276 * we will return a good error code to the higher 261 * we will return a good error code to the higher
@@ -281,15 +266,8 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
281 * wait for the 'master' bio. 266 * wait for the 'master' bio.
282 */ 267 */
283 set_bit(R10BIO_Uptodate, &r10_bio->state); 268 set_bit(R10BIO_Uptodate, &r10_bio->state);
284
285 update_head_pos(slot, r10_bio);
286
287 /*
288 * we have only one bio on the read side
289 */
290 if (uptodate)
291 raid_end_bio_io(r10_bio); 269 raid_end_bio_io(r10_bio);
292 else { 270 } else {
293 /* 271 /*
294 * oops, read error: 272 * oops, read error:
295 */ 273 */
@@ -322,9 +300,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
322 /* 300 /*
323 * this branch is our 'one mirror IO has finished' event handler: 301 * this branch is our 'one mirror IO has finished' event handler:
324 */ 302 */
325 if (!uptodate) 303 if (!uptodate) {
326 md_error(r10_bio->mddev, conf->mirrors[dev].rdev); 304 md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
327 else 305 /* an I/O failed, we can't clear the bitmap */
306 set_bit(R10BIO_Degraded, &r10_bio->state);
307 } else
328 /* 308 /*
329 * Set R10BIO_Uptodate in our master bio, so that 309 * Set R10BIO_Uptodate in our master bio, so that
 330 * we will return a good error code to the higher 310 * we will return a good error code to the higher
@@ -344,6 +324,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
344 * already. 324 * already.
345 */ 325 */
346 if (atomic_dec_and_test(&r10_bio->remaining)) { 326 if (atomic_dec_and_test(&r10_bio->remaining)) {
327 /* clear the bitmap if all writes complete successfully */
328 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
329 r10_bio->sectors,
330 !test_bit(R10BIO_Degraded, &r10_bio->state),
331 0);
347 md_write_end(r10_bio->mddev); 332 md_write_end(r10_bio->mddev);
348 raid_end_bio_io(r10_bio); 333 raid_end_bio_io(r10_bio);
349 } 334 }
@@ -502,8 +487,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
502 rcu_read_lock(); 487 rcu_read_lock();
503 /* 488 /*
504 * Check if we can balance. We can balance on the whole 489 * Check if we can balance. We can balance on the whole
505 * device if no resync is going on, or below the resync window. 490 * device if no resync is going on (recovery is ok), or below
506 * We take the first readable disk when above the resync window. 491 * the resync window. We take the first readable disk when
492 * above the resync window.
507 */ 493 */
508 if (conf->mddev->recovery_cp < MaxSector 494 if (conf->mddev->recovery_cp < MaxSector
509 && (this_sector + sectors >= conf->next_resync)) { 495 && (this_sector + sectors >= conf->next_resync)) {
@@ -512,6 +498,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
512 disk = r10_bio->devs[slot].devnum; 498 disk = r10_bio->devs[slot].devnum;
513 499
514 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL || 500 while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
501 r10_bio->devs[slot].bio == IO_BLOCKED ||
515 !test_bit(In_sync, &rdev->flags)) { 502 !test_bit(In_sync, &rdev->flags)) {
516 slot++; 503 slot++;
517 if (slot == conf->copies) { 504 if (slot == conf->copies) {
@@ -529,6 +516,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
529 slot = 0; 516 slot = 0;
530 disk = r10_bio->devs[slot].devnum; 517 disk = r10_bio->devs[slot].devnum;
531 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL || 518 while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
519 r10_bio->devs[slot].bio == IO_BLOCKED ||
532 !test_bit(In_sync, &rdev->flags)) { 520 !test_bit(In_sync, &rdev->flags)) {
533 slot ++; 521 slot ++;
534 if (slot == conf->copies) { 522 if (slot == conf->copies) {
@@ -549,6 +537,7 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
549 537
550 538
551 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL || 539 if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
540 r10_bio->devs[nslot].bio == IO_BLOCKED ||
552 !test_bit(In_sync, &rdev->flags)) 541 !test_bit(In_sync, &rdev->flags))
553 continue; 542 continue;
554 543
@@ -607,7 +596,10 @@ static void unplug_slaves(mddev_t *mddev)
607 596
608static void raid10_unplug(request_queue_t *q) 597static void raid10_unplug(request_queue_t *q)
609{ 598{
599 mddev_t *mddev = q->queuedata;
600
610 unplug_slaves(q->queuedata); 601 unplug_slaves(q->queuedata);
602 md_wakeup_thread(mddev->thread);
611} 603}
612 604
613static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk, 605static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -640,27 +632,107 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
640 return ret; 632 return ret;
641} 633}
642 634
643/* 635/* Barriers....
644 * Throttle resync depth, so that we can both get proper overlapping of 636 * Sometimes we need to suspend IO while we do something else,
645 * requests, but are still able to handle normal requests quickly. 637 * either some resync/recovery, or reconfigure the array.
638 * To do this we raise a 'barrier'.
639 * The 'barrier' is a counter that can be raised multiple times
640 * to count how many activities are happening which preclude
641 * normal IO.
642 * We can only raise the barrier if there is no pending IO.
643 * i.e. if nr_pending == 0.
644 * We choose only to raise the barrier if no-one is waiting for the
645 * barrier to go down. This means that as soon as an IO request
646 * is ready, no other operations which require a barrier will start
647 * until the IO request has had a chance.
648 *
649 * So: regular IO calls 'wait_barrier'. When that returns there
 650 * is no background IO happening. It must arrange to call
 651 * allow_barrier when it has finished its IO.
 652 * Background IO must call raise_barrier. Once that returns
 653 * there is no normal IO happening. It must arrange to call
654 * lower_barrier when the particular background IO completes.
646 */ 655 */
647#define RESYNC_DEPTH 32 656#define RESYNC_DEPTH 32
648 657
649static void device_barrier(conf_t *conf, sector_t sect) 658static void raise_barrier(conf_t *conf, int force)
659{
660 BUG_ON(force && !conf->barrier);
661 spin_lock_irq(&conf->resync_lock);
662
663 /* Wait until no block IO is waiting (unless 'force') */
664 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
665 conf->resync_lock,
666 raid10_unplug(conf->mddev->queue));
667
668 /* block any new IO from starting */
669 conf->barrier++;
670
 671 /* Now wait for all pending IO to complete */
672 wait_event_lock_irq(conf->wait_barrier,
673 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
674 conf->resync_lock,
675 raid10_unplug(conf->mddev->queue));
676
677 spin_unlock_irq(&conf->resync_lock);
678}
679
680static void lower_barrier(conf_t *conf)
681{
682 unsigned long flags;
683 spin_lock_irqsave(&conf->resync_lock, flags);
684 conf->barrier--;
685 spin_unlock_irqrestore(&conf->resync_lock, flags);
686 wake_up(&conf->wait_barrier);
687}
688
689static void wait_barrier(conf_t *conf)
650{ 690{
651 spin_lock_irq(&conf->resync_lock); 691 spin_lock_irq(&conf->resync_lock);
652 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 692 if (conf->barrier) {
653 conf->resync_lock, unplug_slaves(conf->mddev)); 693 conf->nr_waiting++;
654 694 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
655 if (!conf->barrier++) { 695 conf->resync_lock,
656 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 696 raid10_unplug(conf->mddev->queue));
657 conf->resync_lock, unplug_slaves(conf->mddev)); 697 conf->nr_waiting--;
658 if (conf->nr_pending)
659 BUG();
660 } 698 }
661 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 699 conf->nr_pending++;
662 conf->resync_lock, unplug_slaves(conf->mddev)); 700 spin_unlock_irq(&conf->resync_lock);
663 conf->next_resync = sect; 701}
702
703static void allow_barrier(conf_t *conf)
704{
705 unsigned long flags;
706 spin_lock_irqsave(&conf->resync_lock, flags);
707 conf->nr_pending--;
708 spin_unlock_irqrestore(&conf->resync_lock, flags);
709 wake_up(&conf->wait_barrier);
710}
711
712static void freeze_array(conf_t *conf)
713{
 714 /* stop sync IO and normal IO and wait for everything to
 715 * go quiet.
716 * We increment barrier and nr_waiting, and then
717 * wait until barrier+nr_pending match nr_queued+2
718 */
719 spin_lock_irq(&conf->resync_lock);
720 conf->barrier++;
721 conf->nr_waiting++;
722 wait_event_lock_irq(conf->wait_barrier,
723 conf->barrier+conf->nr_pending == conf->nr_queued+2,
724 conf->resync_lock,
725 raid10_unplug(conf->mddev->queue));
726 spin_unlock_irq(&conf->resync_lock);
727}
728
729static void unfreeze_array(conf_t *conf)
730{
731 /* reverse the effect of the freeze */
732 spin_lock_irq(&conf->resync_lock);
733 conf->barrier--;
734 conf->nr_waiting--;
735 wake_up(&conf->wait_barrier);
664 spin_unlock_irq(&conf->resync_lock); 736 spin_unlock_irq(&conf->resync_lock);
665} 737}
666 738
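The protocol described in the "Barriers" comment above can be exercised outside the kernel. The following is a minimal pthread model of the four entry points, with a mutex and condition variable standing in for resync_lock and wait_event_lock_irq; the RESYNC_DEPTH throttle and the 'force' path are omitted, and all names are illustrative, not kernel API:

#include <pthread.h>

/* User-space model of the raid10 barrier counters. */
struct barrier_model {
        pthread_mutex_t lock;   /* plays the role of resync_lock */
        pthread_cond_t  wait;   /* plays the role of wait_barrier */
        int barrier;            /* background passes currently running */
        int nr_pending;         /* regular IO in flight */
        int nr_waiting;         /* regular IO queued behind a barrier */
};

static struct barrier_model bm = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .wait = PTHREAD_COND_INITIALIZER,
};

/* Background IO: let queued regular IO go first, then drain in-flight IO. */
static void model_raise_barrier(struct barrier_model *b)
{
        pthread_mutex_lock(&b->lock);
        while (b->nr_waiting)
                pthread_cond_wait(&b->wait, &b->lock);
        b->barrier++;                   /* from here, new regular IO blocks */
        while (b->nr_pending)
                pthread_cond_wait(&b->wait, &b->lock);
        pthread_mutex_unlock(&b->lock);
}

static void model_lower_barrier(struct barrier_model *b)
{
        pthread_mutex_lock(&b->lock);
        b->barrier--;
        pthread_mutex_unlock(&b->lock);
        pthread_cond_broadcast(&b->wait);
}

/* Regular IO: wait out any barrier, then count ourselves as pending. */
static void model_wait_barrier(struct barrier_model *b)
{
        pthread_mutex_lock(&b->lock);
        b->nr_waiting++;
        while (b->barrier)
                pthread_cond_wait(&b->wait, &b->lock);
        b->nr_waiting--;
        b->nr_pending++;
        pthread_mutex_unlock(&b->lock);
}

static void model_allow_barrier(struct barrier_model *b)
{
        pthread_mutex_lock(&b->lock);
        b->nr_pending--;
        pthread_mutex_unlock(&b->lock);
        pthread_cond_broadcast(&b->wait);
}

The fairness property falls out of the counters: raise_barrier yields to already-queued regular IO (nr_waiting), while wait_barrier blocks whenever any background pass holds the barrier.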
@@ -674,6 +746,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
674 int i; 746 int i;
675 int chunk_sects = conf->chunk_mask + 1; 747 int chunk_sects = conf->chunk_mask + 1;
676 const int rw = bio_data_dir(bio); 748 const int rw = bio_data_dir(bio);
749 struct bio_list bl;
750 unsigned long flags;
677 751
678 if (unlikely(bio_barrier(bio))) { 752 if (unlikely(bio_barrier(bio))) {
679 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 753 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -719,10 +793,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
719 * thread has put up a bar for new requests. 793 * thread has put up a bar for new requests.
720 * Continue immediately if no resync is active currently. 794 * Continue immediately if no resync is active currently.
721 */ 795 */
722 spin_lock_irq(&conf->resync_lock); 796 wait_barrier(conf);
723 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
724 conf->nr_pending++;
725 spin_unlock_irq(&conf->resync_lock);
726 797
727 disk_stat_inc(mddev->gendisk, ios[rw]); 798 disk_stat_inc(mddev->gendisk, ios[rw]);
728 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 799 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
@@ -734,6 +805,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
734 805
735 r10_bio->mddev = mddev; 806 r10_bio->mddev = mddev;
736 r10_bio->sector = bio->bi_sector; 807 r10_bio->sector = bio->bi_sector;
808 r10_bio->state = 0;
737 809
738 if (rw == READ) { 810 if (rw == READ) {
739 /* 811 /*
@@ -778,13 +850,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
778 !test_bit(Faulty, &rdev->flags)) { 850 !test_bit(Faulty, &rdev->flags)) {
779 atomic_inc(&rdev->nr_pending); 851 atomic_inc(&rdev->nr_pending);
780 r10_bio->devs[i].bio = bio; 852 r10_bio->devs[i].bio = bio;
781 } else 853 } else {
782 r10_bio->devs[i].bio = NULL; 854 r10_bio->devs[i].bio = NULL;
855 set_bit(R10BIO_Degraded, &r10_bio->state);
856 }
783 } 857 }
784 rcu_read_unlock(); 858 rcu_read_unlock();
785 859
786 atomic_set(&r10_bio->remaining, 1); 860 atomic_set(&r10_bio->remaining, 0);
787 861
862 bio_list_init(&bl);
788 for (i = 0; i < conf->copies; i++) { 863 for (i = 0; i < conf->copies; i++) {
789 struct bio *mbio; 864 struct bio *mbio;
790 int d = r10_bio->devs[i].devnum; 865 int d = r10_bio->devs[i].devnum;
@@ -802,13 +877,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
802 mbio->bi_private = r10_bio; 877 mbio->bi_private = r10_bio;
803 878
804 atomic_inc(&r10_bio->remaining); 879 atomic_inc(&r10_bio->remaining);
805 generic_make_request(mbio); 880 bio_list_add(&bl, mbio);
806 } 881 }
807 882
808 if (atomic_dec_and_test(&r10_bio->remaining)) { 883 bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
809 md_write_end(mddev); 884 spin_lock_irqsave(&conf->device_lock, flags);
810 raid_end_bio_io(r10_bio); 885 bio_list_merge(&conf->pending_bio_list, &bl);
811 } 886 blk_plug_device(mddev->queue);
887 spin_unlock_irqrestore(&conf->device_lock, flags);
812 888
813 return 0; 889 return 0;
814} 890}
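With this change a write request no longer submits its mirror bios directly: they are accumulated on conf->pending_bio_list under device_lock, the queue is plugged, and raid10d later flushes the bitmap once and submits the whole batch (see the raid10d hunk below). The hand-off pattern, modeled in user space with a mutex-protected singly linked list (fake_bio, queue_write and flush_pending are illustrative names):

#include <pthread.h>
#include <stddef.h>

struct fake_bio { struct fake_bio *next; };     /* stand-in for struct bio */

static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fake_bio *pending_head;
static struct fake_bio **pending_tail = &pending_head;

/* Producer side: queue the write instead of submitting it immediately. */
static void queue_write(struct fake_bio *bio)
{
        bio->next = NULL;
        pthread_mutex_lock(&pending_lock);
        *pending_tail = bio;
        pending_tail = &bio->next;
        pthread_mutex_unlock(&pending_lock);
}

/* Daemon side: take the whole list, flush metadata once, then submit. */
static void flush_pending(void (*flush_bitmap)(void),
                          void (*submit)(struct fake_bio *))
{
        pthread_mutex_lock(&pending_lock);
        struct fake_bio *bio = pending_head;
        pending_head = NULL;
        pending_tail = &pending_head;
        pthread_mutex_unlock(&pending_lock);

        flush_bitmap();         /* intent bits hit disk before any data write */
        while (bio) {
                struct fake_bio *next = bio->next;
                bio->next = NULL;
                submit(bio);
                bio = next;
        }
}

Batching both amortizes the bitmap flush across many writes and guarantees the intent bits are on disk before any data write they cover.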
@@ -897,13 +973,8 @@ static void print_conf(conf_t *conf)
897 973
898static void close_sync(conf_t *conf) 974static void close_sync(conf_t *conf)
899{ 975{
900 spin_lock_irq(&conf->resync_lock); 976 wait_barrier(conf);
901 wait_event_lock_irq(conf->wait_resume, !conf->barrier, 977 allow_barrier(conf);
902 conf->resync_lock, unplug_slaves(conf->mddev));
903 spin_unlock_irq(&conf->resync_lock);
904
905 if (conf->barrier) BUG();
906 if (waitqueue_active(&conf->wait_idle)) BUG();
907 978
908 mempool_destroy(conf->r10buf_pool); 979 mempool_destroy(conf->r10buf_pool);
909 conf->r10buf_pool = NULL; 980 conf->r10buf_pool = NULL;
@@ -971,7 +1042,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
971 if (!enough(conf)) 1042 if (!enough(conf))
972 return 0; 1043 return 0;
973 1044
974 for (mirror=0; mirror < mddev->raid_disks; mirror++) 1045 if (rdev->saved_raid_disk >= 0 &&
1046 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1047 mirror = rdev->saved_raid_disk;
1048 else
1049 mirror = 0;
1050 for ( ; mirror < mddev->raid_disks; mirror++)
975 if ( !(p=conf->mirrors+mirror)->rdev) { 1051 if ( !(p=conf->mirrors+mirror)->rdev) {
976 1052
977 blk_queue_stack_limits(mddev->queue, 1053 blk_queue_stack_limits(mddev->queue,
@@ -987,6 +1063,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
987 p->head_position = 0; 1063 p->head_position = 0;
988 rdev->raid_disk = mirror; 1064 rdev->raid_disk = mirror;
989 found = 1; 1065 found = 1;
1066 if (rdev->saved_raid_disk != mirror)
1067 conf->fullsync = 1;
990 rcu_assign_pointer(p->rdev, rdev); 1068 rcu_assign_pointer(p->rdev, rdev);
991 break; 1069 break;
992 } 1070 }
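The hot-add path now remembers where a returning device used to live: if rdev->saved_raid_disk still points at an empty slot it is tried first, and only a device that lands somewhere other than its old slot forces conf->fullsync. The slot choice reduces to a small helper like this (a sketch, not the kernel code):

/* Prefer a device's previous slot so its bitmap history stays usable. */
static int pick_slot(void *const slots[], int nslots, int saved)
{
        int i = (saved >= 0 && saved < nslots && slots[saved] == NULL)
                ? saved : 0;
        for (; i < nslots; i++)
                if (slots[i] == NULL)
                        return i;       /* caller sets fullsync if i != saved */
        return -1;                      /* no free slot */
}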
@@ -1027,7 +1105,6 @@ abort:
1027 1105
1028static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 1106static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1029{ 1107{
1030 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1031 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1108 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1032 conf_t *conf = mddev_to_conf(r10_bio->mddev); 1109 conf_t *conf = mddev_to_conf(r10_bio->mddev);
1033 int i,d; 1110 int i,d;
@@ -1042,9 +1119,16 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1042 BUG(); 1119 BUG();
1043 update_head_pos(i, r10_bio); 1120 update_head_pos(i, r10_bio);
1044 d = r10_bio->devs[i].devnum; 1121 d = r10_bio->devs[i].devnum;
1045 if (!uptodate) 1122
1046 md_error(r10_bio->mddev, 1123 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1047 conf->mirrors[d].rdev); 1124 set_bit(R10BIO_Uptodate, &r10_bio->state);
1125 else {
1126 atomic_add(r10_bio->sectors,
1127 &conf->mirrors[d].rdev->corrected_errors);
1128 if (!test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
1129 md_error(r10_bio->mddev,
1130 conf->mirrors[d].rdev);
1131 }
1048 1132
1049 /* for reconstruct, we always reschedule after a read. 1133 /* for reconstruct, we always reschedule after a read.
1050 * for resync, only after all reads 1134 * for resync, only after all reads
@@ -1132,23 +1216,32 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1132 fbio = r10_bio->devs[i].bio; 1216 fbio = r10_bio->devs[i].bio;
1133 1217
1134 /* now find blocks with errors */ 1218 /* now find blocks with errors */
1135 for (i=first+1 ; i < conf->copies ; i++) { 1219 for (i=0 ; i < conf->copies ; i++) {
1136 int vcnt, j, d; 1220 int j, d;
1221 int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1137 1222
1138 if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1139 continue;
1140 /* We know that the bi_io_vec layout is the same for
1141 * both 'first' and 'i', so we just compare them.
1142 * All vec entries are PAGE_SIZE;
1143 */
1144 tbio = r10_bio->devs[i].bio; 1223 tbio = r10_bio->devs[i].bio;
1145 vcnt = r10_bio->sectors >> (PAGE_SHIFT-9); 1224
1146 for (j = 0; j < vcnt; j++) 1225 if (tbio->bi_end_io != end_sync_read)
1147 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), 1226 continue;
1148 page_address(tbio->bi_io_vec[j].bv_page), 1227 if (i == first)
1149 PAGE_SIZE)) 1228 continue;
1150 break; 1229 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
1151 if (j == vcnt) 1230 /* We know that the bi_io_vec layout is the same for
1231 * both 'first' and 'i', so we just compare them.
1232 * All vec entries are PAGE_SIZE;
1233 */
1234 for (j = 0; j < vcnt; j++)
1235 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1236 page_address(tbio->bi_io_vec[j].bv_page),
1237 PAGE_SIZE))
1238 break;
1239 if (j == vcnt)
1240 continue;
1241 mddev->resync_mismatches += r10_bio->sectors;
1242 }
1243 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
1244 /* Don't fix anything. */
1152 continue; 1245 continue;
1153 /* Ok, we need to write this bio 1246 /* Ok, we need to write this bio
1154 * First we need to fixup bv_offset, bv_len and 1247 * First we need to fixup bv_offset, bv_len and
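The rewritten comparison loop in sync_request_write only considers copies whose read actually ran and completed (bi_end_io == end_sync_read and BIO_UPTODATE), skips the reference copy itself, counts differing windows in mddev->resync_mismatches, and in 'check' mode stops there without repairing. Since every bio_vec entry is a full page, the comparison itself is a page-wise memcmp; as a stand-alone sketch assuming 4 KiB pages:

#include <string.h>

#define MODEL_PAGE_SIZE 4096

/* Compare two copies of a resync window, page by page.
 * 'vcnt' is the window size in pages (sectors >> (PAGE_SHIFT - 9)).
 * Returns 1 as soon as any page differs, else 0. */
static int copies_differ(char *const first[], char *const other[], int vcnt)
{
        for (int j = 0; j < vcnt; j++)
                if (memcmp(first[j], other[j], MODEL_PAGE_SIZE))
                        return 1;
        return 0;
}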
@@ -1227,7 +1320,10 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1227 1320
1228 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 1321 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1229 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 1322 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1230 generic_make_request(wbio); 1323 if (test_bit(R10BIO_Uptodate, &r10_bio->state))
1324 generic_make_request(wbio);
1325 else
1326 bio_endio(wbio, wbio->bi_size, -EIO);
1231} 1327}
1232 1328
1233 1329
@@ -1254,10 +1350,31 @@ static void raid10d(mddev_t *mddev)
1254 for (;;) { 1350 for (;;) {
1255 char b[BDEVNAME_SIZE]; 1351 char b[BDEVNAME_SIZE];
1256 spin_lock_irqsave(&conf->device_lock, flags); 1352 spin_lock_irqsave(&conf->device_lock, flags);
1353
1354 if (conf->pending_bio_list.head) {
1355 bio = bio_list_get(&conf->pending_bio_list);
1356 blk_remove_plug(mddev->queue);
1357 spin_unlock_irqrestore(&conf->device_lock, flags);
1358 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1359 if (bitmap_unplug(mddev->bitmap) != 0)
1360 printk("%s: bitmap file write failed!\n", mdname(mddev));
1361
1362 while (bio) { /* submit pending writes */
1363 struct bio *next = bio->bi_next;
1364 bio->bi_next = NULL;
1365 generic_make_request(bio);
1366 bio = next;
1367 }
1368 unplug = 1;
1369
1370 continue;
1371 }
1372
1257 if (list_empty(head)) 1373 if (list_empty(head))
1258 break; 1374 break;
1259 r10_bio = list_entry(head->prev, r10bio_t, retry_list); 1375 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1260 list_del(head->prev); 1376 list_del(head->prev);
1377 conf->nr_queued--;
1261 spin_unlock_irqrestore(&conf->device_lock, flags); 1378 spin_unlock_irqrestore(&conf->device_lock, flags);
1262 1379
1263 mddev = r10_bio->mddev; 1380 mddev = r10_bio->mddev;
@@ -1270,8 +1387,96 @@ static void raid10d(mddev_t *mddev)
1270 unplug = 1; 1387 unplug = 1;
1271 } else { 1388 } else {
1272 int mirror; 1389 int mirror;
 1390 /* we got a read error. Maybe the drive is bad, maybe just
 1391 * this block, and we can fix it.
 1392 * We freeze all other IO, and try reading the block from
 1393 * other devices. When we find one, we re-write the block
 1394 * and re-read to check that this fixes the read error.
1395 * This is all done synchronously while the array is
1396 * frozen.
1397 */
1398 int sect = 0; /* Offset from r10_bio->sector */
1399 int sectors = r10_bio->sectors;
1400 freeze_array(conf);
1401 if (mddev->ro == 0) while(sectors) {
1402 int s = sectors;
1403 int sl = r10_bio->read_slot;
1404 int success = 0;
1405
1406 if (s > (PAGE_SIZE>>9))
1407 s = PAGE_SIZE >> 9;
1408
1409 do {
1410 int d = r10_bio->devs[sl].devnum;
1411 rdev = conf->mirrors[d].rdev;
1412 if (rdev &&
1413 test_bit(In_sync, &rdev->flags) &&
1414 sync_page_io(rdev->bdev,
1415 r10_bio->devs[sl].addr +
1416 sect + rdev->data_offset,
1417 s<<9,
1418 conf->tmppage, READ))
1419 success = 1;
1420 else {
1421 sl++;
1422 if (sl == conf->copies)
1423 sl = 0;
1424 }
1425 } while (!success && sl != r10_bio->read_slot);
1426
1427 if (success) {
1428 int start = sl;
1429 /* write it back and re-read */
1430 while (sl != r10_bio->read_slot) {
1431 int d;
1432 if (sl==0)
1433 sl = conf->copies;
1434 sl--;
1435 d = r10_bio->devs[sl].devnum;
1436 rdev = conf->mirrors[d].rdev;
 1437 if (rdev &&
 1438 test_bit(In_sync, &rdev->flags)) {
 1439 atomic_add(s, &rdev->corrected_errors);
1440 if (sync_page_io(rdev->bdev,
1441 r10_bio->devs[sl].addr +
1442 sect + rdev->data_offset,
1443 s<<9, conf->tmppage, WRITE) == 0)
1444 /* Well, this device is dead */
1445 md_error(mddev, rdev);
1446 }
1447 }
1448 sl = start;
1449 while (sl != r10_bio->read_slot) {
1450 int d;
1451 if (sl==0)
1452 sl = conf->copies;
1453 sl--;
1454 d = r10_bio->devs[sl].devnum;
1455 rdev = conf->mirrors[d].rdev;
1456 if (rdev &&
1457 test_bit(In_sync, &rdev->flags)) {
1458 if (sync_page_io(rdev->bdev,
1459 r10_bio->devs[sl].addr +
1460 sect + rdev->data_offset,
1461 s<<9, conf->tmppage, READ) == 0)
1462 /* Well, this device is dead */
1463 md_error(mddev, rdev);
1464 }
1465 }
1466 } else {
1467 /* Cannot read from anywhere -- bye bye array */
1468 md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
1469 break;
1470 }
1471 sectors -= s;
1472 sect += s;
1473 }
1474
1475 unfreeze_array(conf);
1476
1273 bio = r10_bio->devs[r10_bio->read_slot].bio; 1477 bio = r10_bio->devs[r10_bio->read_slot].bio;
1274 r10_bio->devs[r10_bio->read_slot].bio = NULL; 1478 r10_bio->devs[r10_bio->read_slot].bio =
1479 mddev->ro ? IO_BLOCKED : NULL;
1275 bio_put(bio); 1480 bio_put(bio);
1276 mirror = read_balance(conf, r10_bio); 1481 mirror = read_balance(conf, r10_bio);
1277 if (mirror == -1) { 1482 if (mirror == -1) {
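The retry ladder above works in chunks of at most one page: starting at the failed slot, it tries each remaining mirror until one read succeeds, then walks back over the other copies writing the good data and re-reading to verify, failing any device whose write or verify read fails. The first stage, condensed to a user-space sketch (read_fn stands in for sync_page_io):

/* Try each of 'copies' mirrors in turn, starting at 'first'.
 * Returns the slot that produced good data, or -1 if none did,
 * in which case the caller must fail the whole array. */
static int find_good_copy(int first, int copies,
                          int (*read_fn)(int slot, void *buf, int len),
                          void *buf, int len)
{
        int sl = first;
        do {
                if (read_fn(sl, buf, len))
                        return sl;
                if (++sl == copies)
                        sl = 0;
        } while (sl != first);
        return -1;
}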
@@ -1360,6 +1565,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1360 sector_t max_sector, nr_sectors; 1565 sector_t max_sector, nr_sectors;
1361 int disk; 1566 int disk;
1362 int i; 1567 int i;
1568 int max_sync;
1569 int sync_blocks;
1363 1570
1364 sector_t sectors_skipped = 0; 1571 sector_t sectors_skipped = 0;
1365 int chunks_skipped = 0; 1572 int chunks_skipped = 0;
@@ -1373,6 +1580,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1373 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 1580 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1374 max_sector = mddev->resync_max_sectors; 1581 max_sector = mddev->resync_max_sectors;
1375 if (sector_nr >= max_sector) { 1582 if (sector_nr >= max_sector) {
1583 /* If we aborted, we need to abort the
 1584 * sync on the 'current' bitmap chunks (there can
 1585 * be several when recovering multiple devices),
 1586 * as we may have started syncing them but not finished.
1587 * We can find the current address in
1588 * mddev->curr_resync, but for recovery,
1589 * we need to convert that to several
1590 * virtual addresses.
1591 */
1592 if (mddev->curr_resync < max_sector) { /* aborted */
1593 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1594 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1595 &sync_blocks, 1);
1596 else for (i=0; i<conf->raid_disks; i++) {
1597 sector_t sect =
1598 raid10_find_virt(conf, mddev->curr_resync, i);
1599 bitmap_end_sync(mddev->bitmap, sect,
1600 &sync_blocks, 1);
1601 }
1602 } else /* completed sync */
1603 conf->fullsync = 0;
1604
1605 bitmap_close_sync(mddev->bitmap);
1376 close_sync(conf); 1606 close_sync(conf);
1377 *skipped = 1; 1607 *skipped = 1;
1378 return sectors_skipped; 1608 return sectors_skipped;
@@ -1395,9 +1625,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1395 * If there is non-resync activity waiting for us then 1625 * If there is non-resync activity waiting for us then
1396 * put in a delay to throttle resync. 1626 * put in a delay to throttle resync.
1397 */ 1627 */
1398 if (!go_faster && waitqueue_active(&conf->wait_resume)) 1628 if (!go_faster && conf->nr_waiting)
1399 msleep_interruptible(1000); 1629 msleep_interruptible(1000);
1400 device_barrier(conf, sector_nr + RESYNC_SECTORS);
1401 1630
1402 /* Again, very different code for resync and recovery. 1631 /* Again, very different code for resync and recovery.
1403 * Both must result in an r10bio with a list of bios that 1632 * Both must result in an r10bio with a list of bios that
@@ -1414,6 +1643,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1414 * end_sync_write if we will want to write. 1643 * end_sync_write if we will want to write.
1415 */ 1644 */
1416 1645
1646 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
1417 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1647 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1418 /* recovery... the complicated one */ 1648 /* recovery... the complicated one */
1419 int i, j, k; 1649 int i, j, k;
@@ -1422,14 +1652,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1422 for (i=0 ; i<conf->raid_disks; i++) 1652 for (i=0 ; i<conf->raid_disks; i++)
1423 if (conf->mirrors[i].rdev && 1653 if (conf->mirrors[i].rdev &&
1424 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { 1654 !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
1655 int still_degraded = 0;
1425 /* want to reconstruct this device */ 1656 /* want to reconstruct this device */
1426 r10bio_t *rb2 = r10_bio; 1657 r10bio_t *rb2 = r10_bio;
1658 sector_t sect = raid10_find_virt(conf, sector_nr, i);
1659 int must_sync;
1660 /* Unless we are doing a full sync, we only need
1661 * to recover the block if it is set in the bitmap
1662 */
1663 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1664 &sync_blocks, 1);
1665 if (sync_blocks < max_sync)
1666 max_sync = sync_blocks;
1667 if (!must_sync &&
1668 !conf->fullsync) {
1669 /* yep, skip the sync_blocks here, but don't assume
1670 * that there will never be anything to do here
1671 */
1672 chunks_skipped = -1;
1673 continue;
1674 }
1427 1675
1428 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 1676 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1429 spin_lock_irq(&conf->resync_lock); 1677 raise_barrier(conf, rb2 != NULL);
1430 conf->nr_pending++;
1431 if (rb2) conf->barrier++;
1432 spin_unlock_irq(&conf->resync_lock);
1433 atomic_set(&r10_bio->remaining, 0); 1678 atomic_set(&r10_bio->remaining, 0);
1434 1679
1435 r10_bio->master_bio = (struct bio*)rb2; 1680 r10_bio->master_bio = (struct bio*)rb2;
@@ -1437,8 +1682,23 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1437 atomic_inc(&rb2->remaining); 1682 atomic_inc(&rb2->remaining);
1438 r10_bio->mddev = mddev; 1683 r10_bio->mddev = mddev;
1439 set_bit(R10BIO_IsRecover, &r10_bio->state); 1684 set_bit(R10BIO_IsRecover, &r10_bio->state);
1440 r10_bio->sector = raid10_find_virt(conf, sector_nr, i); 1685 r10_bio->sector = sect;
1686
1441 raid10_find_phys(conf, r10_bio); 1687 raid10_find_phys(conf, r10_bio);
1688 /* Need to check if this section will still be
1689 * degraded
1690 */
1691 for (j=0; j<conf->copies;j++) {
1692 int d = r10_bio->devs[j].devnum;
1693 if (conf->mirrors[d].rdev == NULL ||
1694 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) {
1695 still_degraded = 1;
1696 break;
1697 }
1698 }
1699 must_sync = bitmap_start_sync(mddev->bitmap, sect,
1700 &sync_blocks, still_degraded);
1701
1442 for (j=0; j<conf->copies;j++) { 1702 for (j=0; j<conf->copies;j++) {
1443 int d = r10_bio->devs[j].devnum; 1703 int d = r10_bio->devs[j].devnum;
1444 if (conf->mirrors[d].rdev && 1704 if (conf->mirrors[d].rdev &&
@@ -1498,14 +1758,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1498 } else { 1758 } else {
1499 /* resync. Schedule a read for every block at this virt offset */ 1759 /* resync. Schedule a read for every block at this virt offset */
1500 int count = 0; 1760 int count = 0;
1501 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1502 1761
1503 spin_lock_irq(&conf->resync_lock); 1762 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1504 conf->nr_pending++; 1763 &sync_blocks, mddev->degraded) &&
1505 spin_unlock_irq(&conf->resync_lock); 1764 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1765 /* We can skip this block */
1766 *skipped = 1;
1767 return sync_blocks + sectors_skipped;
1768 }
1769 if (sync_blocks < max_sync)
1770 max_sync = sync_blocks;
1771 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1506 1772
1507 r10_bio->mddev = mddev; 1773 r10_bio->mddev = mddev;
1508 atomic_set(&r10_bio->remaining, 0); 1774 atomic_set(&r10_bio->remaining, 0);
1775 raise_barrier(conf, 0);
1776 conf->next_resync = sector_nr;
1509 1777
1510 r10_bio->master_bio = NULL; 1778 r10_bio->master_bio = NULL;
1511 r10_bio->sector = sector_nr; 1779 r10_bio->sector = sector_nr;
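Both the recovery and resync branches now ask the write-intent bitmap before doing any IO: bitmap_start_sync reports whether the window at the current sector is dirty and how far that answer extends (sync_blocks), and max_sync clamps the request accordingly; a clean window is skipped outright unless a full sync or a user-requested pass (MD_RECOVERY_REQUESTED) is in progress. A toy model of that decision, with one bit per 64-sector chunk (the chunk size is an assumption of the sketch):

#include <stdint.h>

/* Returns the number of sectors to sync at 'sector', or 0 to skip. */
static unsigned need_sync(const uint8_t *bitmap, uint64_t sector,
                          unsigned max_sync, int fullsync, int requested)
{
        uint64_t chunk = sector / 64;
        int dirty = (bitmap[chunk / 8] >> (chunk % 8)) & 1;

        if (!dirty && !fullsync && !requested)
                return 0;                       /* caller sets *skipped = 1 */

        unsigned left = 64 - (unsigned)(sector % 64);   /* rest of this chunk */
        return left < max_sync ? left : max_sync;
}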
@@ -1558,6 +1826,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1558 } 1826 }
1559 1827
1560 nr_sectors = 0; 1828 nr_sectors = 0;
1829 if (sector_nr + max_sync < max_sector)
1830 max_sector = sector_nr + max_sync;
1561 do { 1831 do {
1562 struct page *page; 1832 struct page *page;
1563 int len = PAGE_SIZE; 1833 int len = PAGE_SIZE;
@@ -1632,11 +1902,11 @@ static int run(mddev_t *mddev)
1632 int nc, fc; 1902 int nc, fc;
1633 sector_t stride, size; 1903 sector_t stride, size;
1634 1904
1635 if (mddev->level != 10) { 1905 if (mddev->chunk_size == 0) {
1636 printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n", 1906 printk(KERN_ERR "md/raid10: non-zero chunk size required.\n");
1637 mdname(mddev), mddev->level); 1907 return -EINVAL;
1638 goto out;
1639 } 1908 }
1909
1640 nc = mddev->layout & 255; 1910 nc = mddev->layout & 255;
1641 fc = (mddev->layout >> 8) & 255; 1911 fc = (mddev->layout >> 8) & 255;
1642 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 1912 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
@@ -1650,22 +1920,24 @@ static int run(mddev_t *mddev)
1650 * bookkeeping area. [whatever we allocate in run(), 1920 * bookkeeping area. [whatever we allocate in run(),
1651 * should be freed in stop()] 1921 * should be freed in stop()]
1652 */ 1922 */
1653 conf = kmalloc(sizeof(conf_t), GFP_KERNEL); 1923 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1654 mddev->private = conf; 1924 mddev->private = conf;
1655 if (!conf) { 1925 if (!conf) {
1656 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 1926 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1657 mdname(mddev)); 1927 mdname(mddev));
1658 goto out; 1928 goto out;
1659 } 1929 }
1660 memset(conf, 0, sizeof(*conf)); 1930 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1661 conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1662 GFP_KERNEL); 1931 GFP_KERNEL);
1663 if (!conf->mirrors) { 1932 if (!conf->mirrors) {
1664 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 1933 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1665 mdname(mddev)); 1934 mdname(mddev));
1666 goto out_free_conf; 1935 goto out_free_conf;
1667 } 1936 }
1668 memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); 1937
1938 conf->tmppage = alloc_page(GFP_KERNEL);
1939 if (!conf->tmppage)
1940 goto out_free_conf;
1669 1941
1670 conf->near_copies = nc; 1942 conf->near_copies = nc;
1671 conf->far_copies = fc; 1943 conf->far_copies = fc;
@@ -1713,8 +1985,7 @@ static int run(mddev_t *mddev)
1713 INIT_LIST_HEAD(&conf->retry_list); 1985 INIT_LIST_HEAD(&conf->retry_list);
1714 1986
1715 spin_lock_init(&conf->resync_lock); 1987 spin_lock_init(&conf->resync_lock);
1716 init_waitqueue_head(&conf->wait_idle); 1988 init_waitqueue_head(&conf->wait_barrier);
1717 init_waitqueue_head(&conf->wait_resume);
1718 1989
1719 /* need to check that every block has at least one working mirror */ 1990 /* need to check that every block has at least one working mirror */
1720 if (!enough(conf)) { 1991 if (!enough(conf)) {
@@ -1763,7 +2034,7 @@ static int run(mddev_t *mddev)
1763 * maybe... 2034 * maybe...
1764 */ 2035 */
1765 { 2036 {
1766 int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE; 2037 int stripe = conf->raid_disks * mddev->chunk_size / PAGE_SIZE;
1767 stripe /= conf->near_copies; 2038 stripe /= conf->near_copies;
1768 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) 2039 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
1769 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2040 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
@@ -1776,6 +2047,7 @@ static int run(mddev_t *mddev)
1776out_free_conf: 2047out_free_conf:
1777 if (conf->r10bio_pool) 2048 if (conf->r10bio_pool)
1778 mempool_destroy(conf->r10bio_pool); 2049 mempool_destroy(conf->r10bio_pool);
2050 safe_put_page(conf->tmppage);
1779 kfree(conf->mirrors); 2051 kfree(conf->mirrors);
1780 kfree(conf); 2052 kfree(conf);
1781 mddev->private = NULL; 2053 mddev->private = NULL;
@@ -1798,10 +2070,31 @@ static int stop(mddev_t *mddev)
1798 return 0; 2070 return 0;
1799} 2071}
1800 2072
2073static void raid10_quiesce(mddev_t *mddev, int state)
2074{
2075 conf_t *conf = mddev_to_conf(mddev);
2076
2077 switch(state) {
2078 case 1:
2079 raise_barrier(conf, 0);
2080 break;
2081 case 0:
2082 lower_barrier(conf);
2083 break;
2084 }
2085 if (mddev->thread) {
2086 if (mddev->bitmap)
2087 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2088 else
2089 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2090 md_wakeup_thread(mddev->thread);
2091 }
2092}
1801 2093
1802static mdk_personality_t raid10_personality = 2094static struct mdk_personality raid10_personality =
1803{ 2095{
1804 .name = "raid10", 2096 .name = "raid10",
2097 .level = 10,
1805 .owner = THIS_MODULE, 2098 .owner = THIS_MODULE,
1806 .make_request = make_request, 2099 .make_request = make_request,
1807 .run = run, 2100 .run = run,
@@ -1812,19 +2105,22 @@ static mdk_personality_t raid10_personality =
1812 .hot_remove_disk= raid10_remove_disk, 2105 .hot_remove_disk= raid10_remove_disk,
1813 .spare_active = raid10_spare_active, 2106 .spare_active = raid10_spare_active,
1814 .sync_request = sync_request, 2107 .sync_request = sync_request,
2108 .quiesce = raid10_quiesce,
1815}; 2109};
1816 2110
1817static int __init raid_init(void) 2111static int __init raid_init(void)
1818{ 2112{
1819 return register_md_personality(RAID10, &raid10_personality); 2113 return register_md_personality(&raid10_personality);
1820} 2114}
1821 2115
1822static void raid_exit(void) 2116static void raid_exit(void)
1823{ 2117{
1824 unregister_md_personality(RAID10); 2118 unregister_md_personality(&raid10_personality);
1825} 2119}
1826 2120
1827module_init(raid_init); 2121module_init(raid_init);
1828module_exit(raid_exit); 2122module_exit(raid_exit);
1829MODULE_LICENSE("GPL"); 2123MODULE_LICENSE("GPL");
1830MODULE_ALIAS("md-personality-9"); /* RAID10 */ 2124MODULE_ALIAS("md-personality-9"); /* RAID10 */
2125MODULE_ALIAS("md-raid10");
2126MODULE_ALIAS("md-level-10");
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fafc4bc045f7..54f4a9847e38 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -35,12 +35,10 @@
35#define STRIPE_SHIFT (PAGE_SHIFT - 9) 35#define STRIPE_SHIFT (PAGE_SHIFT - 9)
36#define STRIPE_SECTORS (STRIPE_SIZE>>9) 36#define STRIPE_SECTORS (STRIPE_SIZE>>9)
37#define IO_THRESHOLD 1 37#define IO_THRESHOLD 1
38#define HASH_PAGES 1 38#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
39#define HASH_PAGES_ORDER 0
40#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
41#define HASH_MASK (NR_HASH - 1) 39#define HASH_MASK (NR_HASH - 1)
42 40
43#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) 41#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
44 42
45/* bio's attached to a stripe+device for I/O are linked together in bi_sector 43/* bio's attached to a stripe+device for I/O are linked together in bi_sector
46 * order without overlap. There may be several bio's per stripe+device, and 44 * order without overlap. There may be several bio's per stripe+device, and
@@ -113,29 +111,21 @@ static void release_stripe(struct stripe_head *sh)
113 spin_unlock_irqrestore(&conf->device_lock, flags); 111 spin_unlock_irqrestore(&conf->device_lock, flags);
114} 112}
115 113
116static void remove_hash(struct stripe_head *sh) 114static inline void remove_hash(struct stripe_head *sh)
117{ 115{
118 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); 116 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
119 117
120 if (sh->hash_pprev) { 118 hlist_del_init(&sh->hash);
121 if (sh->hash_next)
122 sh->hash_next->hash_pprev = sh->hash_pprev;
123 *sh->hash_pprev = sh->hash_next;
124 sh->hash_pprev = NULL;
125 }
126} 119}
127 120
128static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 121static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
129{ 122{
130 struct stripe_head **shp = &stripe_hash(conf, sh->sector); 123 struct hlist_head *hp = stripe_hash(conf, sh->sector);
131 124
132 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); 125 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
133 126
134 CHECK_DEVLOCK(); 127 CHECK_DEVLOCK();
135 if ((sh->hash_next = *shp) != NULL) 128 hlist_add_head(&sh->hash, hp);
136 (*shp)->hash_pprev = &sh->hash_next;
137 *shp = sh;
138 sh->hash_pprev = shp;
139} 129}
140 130
141 131
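This converts the stripe hash from an open-coded next/pprev chain to the kernel's generic hlist, shrinking each bucket to a single pointer: with 4 KiB pages and 8-byte buckets, NR_HASH becomes 512 and HASH_MASK 0x1ff. A minimal stand-alone rendition of the operations used here (the real types live in <linux/list.h>; these are simplified re-implementations):

struct hnode { struct hnode *next, **pprev; };
struct hhead { struct hnode *first; };

static void hnode_add_head(struct hhead *h, struct hnode *n)
{
        n->next = h->first;
        if (h->first)
                h->first->pprev = &n->next;
        h->first = n;
        n->pprev = &h->first;           /* works for an empty bucket too */
}

static void hnode_del(struct hnode *n)
{
        *n->pprev = n->next;            /* unlink without knowing the head */
        if (n->next)
                n->next->pprev = n->pprev;
        n->next = NULL;
        n->pprev = NULL;
}

/* Lookup is a plain walk of one bucket:
 *      for (n = h->first; n; n = n->next) ...
 */

The pprev back-pointer (a pointer to the previous link's next field) is what keeps deletion O(1) without a doubly linked head, which is exactly why a bucket can shrink to one word.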
@@ -167,7 +157,7 @@ static void shrink_buffers(struct stripe_head *sh, int num)
167 if (!p) 157 if (!p)
168 continue; 158 continue;
169 sh->dev[i].page = NULL; 159 sh->dev[i].page = NULL;
170 page_cache_release(p); 160 put_page(p);
171 } 161 }
172} 162}
173 163
@@ -228,10 +218,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i
228static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector) 218static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
229{ 219{
230 struct stripe_head *sh; 220 struct stripe_head *sh;
221 struct hlist_node *hn;
231 222
232 CHECK_DEVLOCK(); 223 CHECK_DEVLOCK();
233 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); 224 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
234 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) 225 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
235 if (sh->sector == sector) 226 if (sh->sector == sector)
236 return sh; 227 return sh;
237 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); 228 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -417,7 +408,7 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
417 set_bit(R5_UPTODATE, &sh->dev[i].flags); 408 set_bit(R5_UPTODATE, &sh->dev[i].flags);
418#endif 409#endif
419 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 410 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
420 printk("R5: read error corrected!!\n"); 411 printk(KERN_INFO "raid5: read error corrected!!\n");
421 clear_bit(R5_ReadError, &sh->dev[i].flags); 412 clear_bit(R5_ReadError, &sh->dev[i].flags);
422 clear_bit(R5_ReWrite, &sh->dev[i].flags); 413 clear_bit(R5_ReWrite, &sh->dev[i].flags);
423 } 414 }
@@ -428,13 +419,14 @@ static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done,
428 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 419 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
429 atomic_inc(&conf->disks[i].rdev->read_errors); 420 atomic_inc(&conf->disks[i].rdev->read_errors);
430 if (conf->mddev->degraded) 421 if (conf->mddev->degraded)
431 printk("R5: read error not correctable.\n"); 422 printk(KERN_WARNING "raid5: read error not correctable.\n");
432 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 423 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
433 /* Oh, no!!! */ 424 /* Oh, no!!! */
434 printk("R5: read error NOT corrected!!\n"); 425 printk(KERN_WARNING "raid5: read error NOT corrected!!\n");
435 else if (atomic_read(&conf->disks[i].rdev->read_errors) 426 else if (atomic_read(&conf->disks[i].rdev->read_errors)
436 > conf->max_nr_stripes) 427 > conf->max_nr_stripes)
437 printk("raid5: Too many read errors, failing device.\n"); 428 printk(KERN_WARNING
429 "raid5: Too many read errors, failing device.\n");
438 else 430 else
439 retry = 1; 431 retry = 1;
440 if (retry) 432 if (retry)
@@ -604,7 +596,7 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
604 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; 596 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
605 break; 597 break;
606 default: 598 default:
607 printk("raid5: unsupported algorithm %d\n", 599 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
608 conf->algorithm); 600 conf->algorithm);
609 } 601 }
610 602
@@ -645,7 +637,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
645 i -= (sh->pd_idx + 1); 637 i -= (sh->pd_idx + 1);
646 break; 638 break;
647 default: 639 default:
648 printk("raid5: unsupported algorithm %d\n", 640 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
649 conf->algorithm); 641 conf->algorithm);
650 } 642 }
651 643
@@ -654,7 +646,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
654 646
655 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); 647 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
656 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { 648 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
657 printk("compute_blocknr: map not correct\n"); 649 printk(KERN_ERR "compute_blocknr: map not correct\n");
658 return 0; 650 return 0;
659 } 651 }
660 return r_sector; 652 return r_sector;
@@ -737,7 +729,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
737 if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) 729 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
738 ptr[count++] = p; 730 ptr[count++] = p;
739 else 731 else
740 printk("compute_block() %d, stripe %llu, %d" 732 printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
741 " not present\n", dd_idx, 733 " not present\n", dd_idx,
742 (unsigned long long)sh->sector, i); 734 (unsigned long long)sh->sector, i);
743 735
@@ -960,11 +952,11 @@ static void handle_stripe(struct stripe_head *sh)
960 syncing = test_bit(STRIPE_SYNCING, &sh->state); 952 syncing = test_bit(STRIPE_SYNCING, &sh->state);
961 /* Now to look around and see what can be done */ 953 /* Now to look around and see what can be done */
962 954
955 rcu_read_lock();
963 for (i=disks; i--; ) { 956 for (i=disks; i--; ) {
964 mdk_rdev_t *rdev; 957 mdk_rdev_t *rdev;
965 dev = &sh->dev[i]; 958 dev = &sh->dev[i];
966 clear_bit(R5_Insync, &dev->flags); 959 clear_bit(R5_Insync, &dev->flags);
967 clear_bit(R5_Syncio, &dev->flags);
968 960
969 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 961 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
970 i, dev->flags, dev->toread, dev->towrite, dev->written); 962 i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -1003,9 +995,9 @@ static void handle_stripe(struct stripe_head *sh)
1003 non_overwrite++; 995 non_overwrite++;
1004 } 996 }
1005 if (dev->written) written++; 997 if (dev->written) written++;
1006 rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ 998 rdev = rcu_dereference(conf->disks[i].rdev);
1007 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 999 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1008 /* The ReadError flag wil just be confusing now */ 1000 /* The ReadError flag will just be confusing now */
1009 clear_bit(R5_ReadError, &dev->flags); 1001 clear_bit(R5_ReadError, &dev->flags);
1010 clear_bit(R5_ReWrite, &dev->flags); 1002 clear_bit(R5_ReWrite, &dev->flags);
1011 } 1003 }
@@ -1016,6 +1008,7 @@ static void handle_stripe(struct stripe_head *sh)
1016 } else 1008 } else
1017 set_bit(R5_Insync, &dev->flags); 1009 set_bit(R5_Insync, &dev->flags);
1018 } 1010 }
1011 rcu_read_unlock();
1019 PRINTK("locked=%d uptodate=%d to_read=%d" 1012 PRINTK("locked=%d uptodate=%d to_read=%d"
1020 " to_write=%d failed=%d failed_num=%d\n", 1013 " to_write=%d failed=%d failed_num=%d\n",
1021 locked, uptodate, to_read, to_write, failed, failed_num); 1014 locked, uptodate, to_read, to_write, failed, failed_num);
@@ -1027,10 +1020,13 @@ static void handle_stripe(struct stripe_head *sh)
1027 int bitmap_end = 0; 1020 int bitmap_end = 0;
1028 1021
1029 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1022 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1030 mdk_rdev_t *rdev = conf->disks[i].rdev; 1023 mdk_rdev_t *rdev;
1024 rcu_read_lock();
1025 rdev = rcu_dereference(conf->disks[i].rdev);
1031 if (rdev && test_bit(In_sync, &rdev->flags)) 1026 if (rdev && test_bit(In_sync, &rdev->flags))
1032 /* multiple read failures in one stripe */ 1027 /* multiple read failures in one stripe */
1033 md_error(conf->mddev, rdev); 1028 md_error(conf->mddev, rdev);
1029 rcu_read_unlock();
1034 } 1030 }
1035 1031
1036 spin_lock_irq(&conf->device_lock); 1032 spin_lock_irq(&conf->device_lock);
@@ -1179,9 +1175,6 @@ static void handle_stripe(struct stripe_head *sh)
1179 locked++; 1175 locked++;
1180 PRINTK("Reading block %d (sync=%d)\n", 1176 PRINTK("Reading block %d (sync=%d)\n",
1181 i, syncing); 1177 i, syncing);
1182 if (syncing)
1183 md_sync_acct(conf->disks[i].rdev->bdev,
1184 STRIPE_SECTORS);
1185 } 1178 }
1186 } 1179 }
1187 } 1180 }
@@ -1288,7 +1281,7 @@ static void handle_stripe(struct stripe_head *sh)
1288 * is available 1281 * is available
1289 */ 1282 */
1290 if (syncing && locked == 0 && 1283 if (syncing && locked == 0 &&
1291 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) { 1284 !test_bit(STRIPE_INSYNC, &sh->state)) {
1292 set_bit(STRIPE_HANDLE, &sh->state); 1285 set_bit(STRIPE_HANDLE, &sh->state);
1293 if (failed == 0) { 1286 if (failed == 0) {
1294 char *pagea; 1287 char *pagea;
@@ -1306,27 +1299,25 @@ static void handle_stripe(struct stripe_head *sh)
1306 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 1299 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1307 /* don't try to repair!! */ 1300 /* don't try to repair!! */
1308 set_bit(STRIPE_INSYNC, &sh->state); 1301 set_bit(STRIPE_INSYNC, &sh->state);
1302 else {
1303 compute_block(sh, sh->pd_idx);
1304 uptodate++;
1305 }
1309 } 1306 }
1310 } 1307 }
1311 if (!test_bit(STRIPE_INSYNC, &sh->state)) { 1308 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1309 /* either failed parity check, or recovery is happening */
1312 if (failed==0) 1310 if (failed==0)
1313 failed_num = sh->pd_idx; 1311 failed_num = sh->pd_idx;
1314 /* should be able to compute the missing block and write it to spare */
1315 if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
1316 if (uptodate+1 != disks)
1317 BUG();
1318 compute_block(sh, failed_num);
1319 uptodate++;
1320 }
1321 if (uptodate != disks)
1322 BUG();
1323 dev = &sh->dev[failed_num]; 1312 dev = &sh->dev[failed_num];
1313 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
1314 BUG_ON(uptodate != disks);
1315
1324 set_bit(R5_LOCKED, &dev->flags); 1316 set_bit(R5_LOCKED, &dev->flags);
1325 set_bit(R5_Wantwrite, &dev->flags); 1317 set_bit(R5_Wantwrite, &dev->flags);
1326 clear_bit(STRIPE_DEGRADED, &sh->state); 1318 clear_bit(STRIPE_DEGRADED, &sh->state);
1327 locked++; 1319 locked++;
1328 set_bit(STRIPE_INSYNC, &sh->state); 1320 set_bit(STRIPE_INSYNC, &sh->state);
1329 set_bit(R5_Syncio, &dev->flags);
1330 } 1321 }
1331 } 1322 }
1332 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 1323 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
@@ -1392,7 +1383,7 @@ static void handle_stripe(struct stripe_head *sh)
1392 rcu_read_unlock(); 1383 rcu_read_unlock();
1393 1384
1394 if (rdev) { 1385 if (rdev) {
1395 if (test_bit(R5_Syncio, &sh->dev[i].flags)) 1386 if (syncing)
1396 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1387 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1397 1388
1398 bi->bi_bdev = rdev->bdev; 1389 bi->bi_bdev = rdev->bdev;
@@ -1409,6 +1400,9 @@ static void handle_stripe(struct stripe_head *sh)
1409 bi->bi_io_vec[0].bv_offset = 0; 1400 bi->bi_io_vec[0].bv_offset = 0;
1410 bi->bi_size = STRIPE_SIZE; 1401 bi->bi_size = STRIPE_SIZE;
1411 bi->bi_next = NULL; 1402 bi->bi_next = NULL;
1403 if (rw == WRITE &&
1404 test_bit(R5_ReWrite, &sh->dev[i].flags))
1405 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1412 generic_make_request(bi); 1406 generic_make_request(bi);
1413 } else { 1407 } else {
1414 if (rw == 1) 1408 if (rw == 1)
@@ -1822,21 +1816,21 @@ static int run(mddev_t *mddev)
1822 struct list_head *tmp; 1816 struct list_head *tmp;
1823 1817
1824 if (mddev->level != 5 && mddev->level != 4) { 1818 if (mddev->level != 5 && mddev->level != 4) {
1825 printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level); 1819 printk(KERN_ERR "raid5: %s: raid level not set to 4/5 (%d)\n",
1820 mdname(mddev), mddev->level);
1826 return -EIO; 1821 return -EIO;
1827 } 1822 }
1828 1823
1829 mddev->private = kmalloc (sizeof (raid5_conf_t) 1824 mddev->private = kzalloc(sizeof (raid5_conf_t)
1830 + mddev->raid_disks * sizeof(struct disk_info), 1825 + mddev->raid_disks * sizeof(struct disk_info),
1831 GFP_KERNEL); 1826 GFP_KERNEL);
1832 if ((conf = mddev->private) == NULL) 1827 if ((conf = mddev->private) == NULL)
1833 goto abort; 1828 goto abort;
1834 memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) ); 1829
1835 conf->mddev = mddev; 1830 conf->mddev = mddev;
1836 1831
1837 if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) 1832 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
1838 goto abort; 1833 goto abort;
1839 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1840 1834
1841 spin_lock_init(&conf->device_lock); 1835 spin_lock_init(&conf->device_lock);
1842 init_waitqueue_head(&conf->wait_for_stripe); 1836 init_waitqueue_head(&conf->wait_for_stripe);
@@ -1903,10 +1897,17 @@ static int run(mddev_t *mddev)
1903 1897
1904 if (mddev->degraded == 1 && 1898 if (mddev->degraded == 1 &&
1905 mddev->recovery_cp != MaxSector) { 1899 mddev->recovery_cp != MaxSector) {
1906 printk(KERN_ERR 1900 if (mddev->ok_start_degraded)
1907 "raid5: cannot start dirty degraded array for %s\n", 1901 printk(KERN_WARNING
1908 mdname(mddev)); 1902 "raid5: starting dirty degraded array: %s"
 1909 goto abort; 1903 " - data corruption possible.\n",
1904 mdname(mddev));
1905 else {
1906 printk(KERN_ERR
1907 "raid5: cannot start dirty degraded array for %s\n",
1908 mdname(mddev));
1909 goto abort;
1910 }
1910 } 1911 }
1911 1912
1912 { 1913 {
@@ -1948,7 +1949,7 @@ static int run(mddev_t *mddev)
1948 */ 1949 */
1949 { 1950 {
1950 int stripe = (mddev->raid_disks-1) * mddev->chunk_size 1951 int stripe = (mddev->raid_disks-1) * mddev->chunk_size
1951 / PAGE_CACHE_SIZE; 1952 / PAGE_SIZE;
1952 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 1953 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
1953 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 1954 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
1954 } 1955 }
@@ -1956,9 +1957,6 @@ static int run(mddev_t *mddev)
1956 /* Ok, everything is just fine now */ 1957 /* Ok, everything is just fine now */
1957 sysfs_create_group(&mddev->kobj, &raid5_attrs_group); 1958 sysfs_create_group(&mddev->kobj, &raid5_attrs_group);
1958 1959
1959 if (mddev->bitmap)
1960 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1961
1962 mddev->queue->unplug_fn = raid5_unplug_device; 1960 mddev->queue->unplug_fn = raid5_unplug_device;
1963 mddev->queue->issue_flush_fn = raid5_issue_flush; 1961 mddev->queue->issue_flush_fn = raid5_issue_flush;
1964 1962
@@ -1967,9 +1965,7 @@ static int run(mddev_t *mddev)
1967abort: 1965abort:
1968 if (conf) { 1966 if (conf) {
1969 print_raid5_conf(conf); 1967 print_raid5_conf(conf);
1970 if (conf->stripe_hashtbl) 1968 kfree(conf->stripe_hashtbl);
1971 free_pages((unsigned long) conf->stripe_hashtbl,
1972 HASH_PAGES_ORDER);
1973 kfree(conf); 1969 kfree(conf);
1974 } 1970 }
1975 mddev->private = NULL; 1971 mddev->private = NULL;
@@ -1986,7 +1982,7 @@ static int stop(mddev_t *mddev)
1986 md_unregister_thread(mddev->thread); 1982 md_unregister_thread(mddev->thread);
1987 mddev->thread = NULL; 1983 mddev->thread = NULL;
1988 shrink_stripes(conf); 1984 shrink_stripes(conf);
1989 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); 1985 kfree(conf->stripe_hashtbl);
1990 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 1986 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1991 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 1987 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
1992 kfree(conf); 1988 kfree(conf);
@@ -2014,12 +2010,12 @@ static void print_sh (struct stripe_head *sh)
2014static void printall (raid5_conf_t *conf) 2010static void printall (raid5_conf_t *conf)
2015{ 2011{
2016 struct stripe_head *sh; 2012 struct stripe_head *sh;
2013 struct hlist_node *hn;
2017 int i; 2014 int i;
2018 2015
2019 spin_lock_irq(&conf->device_lock); 2016 spin_lock_irq(&conf->device_lock);
2020 for (i = 0; i < NR_HASH; i++) { 2017 for (i = 0; i < NR_HASH; i++) {
2021 sh = conf->stripe_hashtbl[i]; 2018 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2022 for (; sh; sh = sh->hash_next) {
2023 if (sh->raid_conf != conf) 2019 if (sh->raid_conf != conf)
2024 continue; 2020 continue;
2025 print_sh(sh); 2021 print_sh(sh);
@@ -2192,17 +2188,12 @@ static void raid5_quiesce(mddev_t *mddev, int state)
2192 spin_unlock_irq(&conf->device_lock); 2188 spin_unlock_irq(&conf->device_lock);
2193 break; 2189 break;
2194 } 2190 }
2195 if (mddev->thread) {
2196 if (mddev->bitmap)
2197 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2198 else
2199 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2200 md_wakeup_thread(mddev->thread);
2201 }
2202} 2191}
2203static mdk_personality_t raid5_personality= 2192
2193static struct mdk_personality raid5_personality =
2204{ 2194{
2205 .name = "raid5", 2195 .name = "raid5",
2196 .level = 5,
2206 .owner = THIS_MODULE, 2197 .owner = THIS_MODULE,
2207 .make_request = make_request, 2198 .make_request = make_request,
2208 .run = run, 2199 .run = run,
@@ -2217,17 +2208,42 @@ static mdk_personality_t raid5_personality=
2217 .quiesce = raid5_quiesce, 2208 .quiesce = raid5_quiesce,
2218}; 2209};
2219 2210
2220static int __init raid5_init (void) 2211static struct mdk_personality raid4_personality =
2221{ 2212{
2222 return register_md_personality (RAID5, &raid5_personality); 2213 .name = "raid4",
2214 .level = 4,
2215 .owner = THIS_MODULE,
2216 .make_request = make_request,
2217 .run = run,
2218 .stop = stop,
2219 .status = status,
2220 .error_handler = error,
2221 .hot_add_disk = raid5_add_disk,
2222 .hot_remove_disk= raid5_remove_disk,
2223 .spare_active = raid5_spare_active,
2224 .sync_request = sync_request,
2225 .resize = raid5_resize,
2226 .quiesce = raid5_quiesce,
2227};
2228
2229static int __init raid5_init(void)
2230{
2231 register_md_personality(&raid5_personality);
2232 register_md_personality(&raid4_personality);
2233 return 0;
2223} 2234}
2224 2235
2225static void raid5_exit (void) 2236static void raid5_exit(void)
2226{ 2237{
2227 unregister_md_personality (RAID5); 2238 unregister_md_personality(&raid5_personality);
2239 unregister_md_personality(&raid4_personality);
2228} 2240}
2229 2241
2230module_init(raid5_init); 2242module_init(raid5_init);
2231module_exit(raid5_exit); 2243module_exit(raid5_exit);
2232MODULE_LICENSE("GPL"); 2244MODULE_LICENSE("GPL");
2233MODULE_ALIAS("md-personality-4"); /* RAID5 */ 2245MODULE_ALIAS("md-personality-4"); /* RAID5 */
2246MODULE_ALIAS("md-raid5");
2247MODULE_ALIAS("md-raid4");
2248MODULE_ALIAS("md-level-5");
2249MODULE_ALIAS("md-level-4");
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 0000d162d198..8c823d686a60 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -40,12 +40,10 @@
40#define STRIPE_SHIFT (PAGE_SHIFT - 9) 40#define STRIPE_SHIFT (PAGE_SHIFT - 9)
41#define STRIPE_SECTORS (STRIPE_SIZE>>9) 41#define STRIPE_SECTORS (STRIPE_SIZE>>9)
42#define IO_THRESHOLD 1 42#define IO_THRESHOLD 1
43#define HASH_PAGES 1 43#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
44#define HASH_PAGES_ORDER 0
45#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
46#define HASH_MASK (NR_HASH - 1) 44#define HASH_MASK (NR_HASH - 1)
47 45
48#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]) 46#define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
49 47
50/* bio's attached to a stripe+device for I/O are linked together in bi_sector 48/* bio's attached to a stripe+device for I/O are linked together in bi_sector
51 * order without overlap. There may be several bio's per stripe+device, and 49 * order without overlap. There may be several bio's per stripe+device, and
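
With the table shrunk from HASH_PAGES pages of stripe_head pointers to a single page of hlist_heads, NR_HASH stays a power of two, so HASH_MASK remains a valid bit mask. For example, with 4 KiB pages and 8-byte pointers (64-bit), NR_HASH = 4096 / 8 = 512 and HASH_MASK = 0x1ff; a stripe at sector S lands in bucket (S >> STRIPE_SHIFT) & 0x1ff.

    /* Sketch of the bucket math, assuming a 4 KiB page and 64-bit pointers. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long page_size = 4096, head_size = 8; /* sizeof(struct hlist_head) */
        unsigned long nr_hash = page_size / head_size; /* 512 */
        unsigned long mask = nr_hash - 1;              /* 0x1ff */
        unsigned long stripe_shift = 3;                /* PAGE_SHIFT(12) - 9 */
        unsigned long long sector = 123456;

        printf("bucket = %llu\n", (sector >> stripe_shift) & mask);
        return 0;
    }
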
@@ -132,29 +130,21 @@ static void release_stripe(struct stripe_head *sh)
132 spin_unlock_irqrestore(&conf->device_lock, flags); 130 spin_unlock_irqrestore(&conf->device_lock, flags);
133} 131}
134 132
135static void remove_hash(struct stripe_head *sh) 133static inline void remove_hash(struct stripe_head *sh)
136{ 134{
137 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector); 135 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
138 136
139 if (sh->hash_pprev) { 137 hlist_del_init(&sh->hash);
140 if (sh->hash_next)
141 sh->hash_next->hash_pprev = sh->hash_pprev;
142 *sh->hash_pprev = sh->hash_next;
143 sh->hash_pprev = NULL;
144 }
145} 138}
146 139
147static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh) 140static inline void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
148{ 141{
149 struct stripe_head **shp = &stripe_hash(conf, sh->sector); 142 struct hlist_head *hp = stripe_hash(conf, sh->sector);
150 143
151 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector); 144 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
152 145
153 CHECK_DEVLOCK(); 146 CHECK_DEVLOCK();
154 if ((sh->hash_next = *shp) != NULL) 147 hlist_add_head(&sh->hash, hp);
155 (*shp)->hash_pprev = &sh->hash_next;
156 *shp = sh;
157 sh->hash_pprev = shp;
158} 148}
159 149
160 150
@@ -186,7 +176,7 @@ static void shrink_buffers(struct stripe_head *sh, int num)
186 if (!p) 176 if (!p)
187 continue; 177 continue;
188 sh->dev[i].page = NULL; 178 sh->dev[i].page = NULL;
189 page_cache_release(p); 179 put_page(p);
190 } 180 }
191} 181}
192 182
@@ -247,10 +237,11 @@ static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_i
247static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector) 237static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
248{ 238{
249 struct stripe_head *sh; 239 struct stripe_head *sh;
240 struct hlist_node *hn;
250 241
251 CHECK_DEVLOCK(); 242 CHECK_DEVLOCK();
252 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector); 243 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
253 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) 244 hlist_for_each_entry (sh, hn, stripe_hash(conf, sector), hash)
254 if (sh->sector == sector) 245 if (sh->sector == sector)
255 return sh; 246 return sh;
256 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector); 247 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
@@ -367,8 +358,8 @@ static void shrink_stripes(raid6_conf_t *conf)
367 conf->slab_cache = NULL; 358 conf->slab_cache = NULL;
368} 359}
369 360
370static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done, 361static int raid6_end_read_request(struct bio * bi, unsigned int bytes_done,
371 int error) 362 int error)
372{ 363{
373 struct stripe_head *sh = bi->bi_private; 364 struct stripe_head *sh = bi->bi_private;
374 raid6_conf_t *conf = sh->raid_conf; 365 raid6_conf_t *conf = sh->raid_conf;
@@ -420,9 +411,35 @@ static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done,
420#else 411#else
421 set_bit(R5_UPTODATE, &sh->dev[i].flags); 412 set_bit(R5_UPTODATE, &sh->dev[i].flags);
422#endif 413#endif
414 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
415 printk(KERN_INFO "raid6: read error corrected!!\n");
416 clear_bit(R5_ReadError, &sh->dev[i].flags);
417 clear_bit(R5_ReWrite, &sh->dev[i].flags);
418 }
419 if (atomic_read(&conf->disks[i].rdev->read_errors))
420 atomic_set(&conf->disks[i].rdev->read_errors, 0);
423 } else { 421 } else {
424 md_error(conf->mddev, conf->disks[i].rdev); 422 int retry = 0;
425 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 423 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
424 atomic_inc(&conf->disks[i].rdev->read_errors);
425 if (conf->mddev->degraded)
426 printk(KERN_WARNING "raid6: read error not correctable.\n");
427 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
428 /* Oh, no!!! */
429 printk(KERN_WARNING "raid6: read error NOT corrected!!\n");
430 else if (atomic_read(&conf->disks[i].rdev->read_errors)
431 > conf->max_nr_stripes)
432 printk(KERN_WARNING
433 "raid6: Too many read errors, failing device.\n");
434 else
435 retry = 1;
436 if (retry)
437 set_bit(R5_ReadError, &sh->dev[i].flags);
438 else {
439 clear_bit(R5_ReadError, &sh->dev[i].flags);
440 clear_bit(R5_ReWrite, &sh->dev[i].flags);
441 md_error(conf->mddev, conf->disks[i].rdev);
442 }
426 } 443 }
427 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 444 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
428#if 0 445#if 0
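
The error path above replaces an immediate md_error() with a bounded retry policy: the read is retried (via R5_ReadError) unless the array is already degraded, a rewrite was already attempted (R5_ReWrite), or the device has accumulated more than max_nr_stripes read errors. Restated as a standalone predicate; this is a sketch of the same decision, not new behaviour:

    static int should_retry_read(int degraded, int rewrite_tried,
                                 int read_errors, int max_nr_stripes)
    {
        if (degraded)                     return 0; /* degraded array: driver gives up  */
        if (rewrite_tried)                return 0; /* the rewrite didn't stick          */
        if (read_errors > max_nr_stripes) return 0; /* device looks unreliable, fail it  */
        return 1;                                   /* recompute from peers and retry    */
    }
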
@@ -805,7 +822,7 @@ static void compute_parity(struct stripe_head *sh, int method)
805} 822}
806 823
807/* Compute one missing block */ 824/* Compute one missing block */
808static void compute_block_1(struct stripe_head *sh, int dd_idx) 825static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
809{ 826{
810 raid6_conf_t *conf = sh->raid_conf; 827 raid6_conf_t *conf = sh->raid_conf;
811 int i, count, disks = conf->raid_disks; 828 int i, count, disks = conf->raid_disks;
@@ -821,7 +838,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
821 compute_parity(sh, UPDATE_PARITY); 838 compute_parity(sh, UPDATE_PARITY);
822 } else { 839 } else {
823 ptr[0] = page_address(sh->dev[dd_idx].page); 840 ptr[0] = page_address(sh->dev[dd_idx].page);
824 memset(ptr[0], 0, STRIPE_SIZE); 841 if (!nozero) memset(ptr[0], 0, STRIPE_SIZE);
825 count = 1; 842 count = 1;
826 for (i = disks ; i--; ) { 843 for (i = disks ; i--; ) {
827 if (i == dd_idx || i == qd_idx) 844 if (i == dd_idx || i == qd_idx)
@@ -838,7 +855,8 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx)
838 } 855 }
839 if (count != 1) 856 if (count != 1)
840 xor_block(count, STRIPE_SIZE, ptr); 857 xor_block(count, STRIPE_SIZE, ptr);
841 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); 858 if (!nozero) set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
859 else clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
842 } 860 }
843} 861}
844 862
@@ -871,7 +889,7 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
871 return; 889 return;
872 } else { 890 } else {
873 /* We're missing D+Q; recompute D from P */ 891 /* We're missing D+Q; recompute D from P */
874 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1); 892 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
875 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */ 893 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
876 return; 894 return;
877 } 895 }
@@ -982,6 +1000,12 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
982} 1000}
983 1001
984 1002
1003static int page_is_zero(struct page *p)
1004{
1005 char *a = page_address(p);
1006 return ((*(u32*)a) == 0 &&
1007 memcmp(a, a+4, STRIPE_SIZE-4)==0);
1008}
985/* 1009/*
986 * handle_stripe - do things to a stripe. 1010 * handle_stripe - do things to a stripe.
987 * 1011 *
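
page_is_zero() pairs with the new compute_block_1(..., nozero=1) call used in the sync path below: skipping the memset means the freshly computed parity is XORed on top of the stored P block, so the page ends up all-zero exactly when the stored parity was correct. The all-zero test itself is cheap: check the first word, then compare the page against itself shifted by four bytes. A userspace sketch of the trick (it destroys the stored parity on mismatch, just as the driver does before recomputing it):

    #include <string.h>

    static int parity_ok(unsigned char *p, unsigned char **data, int n, size_t len)
    {
        size_t i;
        int d;

        for (d = 0; d < n; d++)
            for (i = 0; i < len; i++)
                p[i] ^= data[d][i];  /* "nozero": XOR onto the stored P block */

        /* page_is_zero() equivalent: all-zero iff stored P was correct */
        return *(unsigned int *)p == 0 && memcmp(p, p + 4, len - 4) == 0;
    }
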
@@ -1000,7 +1024,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
1000 * 1024 *
1001 */ 1025 */
1002 1026
1003static void handle_stripe(struct stripe_head *sh) 1027static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
1004{ 1028{
1005 raid6_conf_t *conf = sh->raid_conf; 1029 raid6_conf_t *conf = sh->raid_conf;
1006 int disks = conf->raid_disks; 1030 int disks = conf->raid_disks;
@@ -1027,11 +1051,11 @@ static void handle_stripe(struct stripe_head *sh)
1027 syncing = test_bit(STRIPE_SYNCING, &sh->state); 1051 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1028 /* Now to look around and see what can be done */ 1052 /* Now to look around and see what can be done */
1029 1053
1054 rcu_read_lock();
1030 for (i=disks; i--; ) { 1055 for (i=disks; i--; ) {
1031 mdk_rdev_t *rdev; 1056 mdk_rdev_t *rdev;
1032 dev = &sh->dev[i]; 1057 dev = &sh->dev[i];
1033 clear_bit(R5_Insync, &dev->flags); 1058 clear_bit(R5_Insync, &dev->flags);
1034 clear_bit(R5_Syncio, &dev->flags);
1035 1059
1036 PRINTK("check %d: state 0x%lx read %p write %p written %p\n", 1060 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1037 i, dev->flags, dev->toread, dev->towrite, dev->written); 1061 i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -1070,14 +1094,21 @@ static void handle_stripe(struct stripe_head *sh)
1070 non_overwrite++; 1094 non_overwrite++;
1071 } 1095 }
1072 if (dev->written) written++; 1096 if (dev->written) written++;
1073 rdev = conf->disks[i].rdev; /* FIXME, should I be looking rdev */ 1097 rdev = rcu_dereference(conf->disks[i].rdev);
1074 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 1098 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
1099 /* The ReadError flag will just be confusing now */
1100 clear_bit(R5_ReadError, &dev->flags);
1101 clear_bit(R5_ReWrite, &dev->flags);
1102 }
1103 if (!rdev || !test_bit(In_sync, &rdev->flags)
1104 || test_bit(R5_ReadError, &dev->flags)) {
1075 if ( failed < 2 ) 1105 if ( failed < 2 )
1076 failed_num[failed] = i; 1106 failed_num[failed] = i;
1077 failed++; 1107 failed++;
1078 } else 1108 } else
1079 set_bit(R5_Insync, &dev->flags); 1109 set_bit(R5_Insync, &dev->flags);
1080 } 1110 }
1111 rcu_read_unlock();
1081 PRINTK("locked=%d uptodate=%d to_read=%d" 1112 PRINTK("locked=%d uptodate=%d to_read=%d"
1082 " to_write=%d failed=%d failed_num=%d,%d\n", 1113 " to_write=%d failed=%d failed_num=%d,%d\n",
1083 locked, uptodate, to_read, to_write, failed, 1114 locked, uptodate, to_read, to_write, failed,
@@ -1088,6 +1119,17 @@ static void handle_stripe(struct stripe_head *sh)
1088 if (failed > 2 && to_read+to_write+written) { 1119 if (failed > 2 && to_read+to_write+written) {
1089 for (i=disks; i--; ) { 1120 for (i=disks; i--; ) {
1090 int bitmap_end = 0; 1121 int bitmap_end = 0;
1122
1123 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1124 mdk_rdev_t *rdev;
1125 rcu_read_lock();
1126 rdev = rcu_dereference(conf->disks[i].rdev);
1127 if (rdev && test_bit(In_sync, &rdev->flags))
1128 /* multiple read failures in one stripe */
1129 md_error(conf->mddev, rdev);
1130 rcu_read_unlock();
1131 }
1132
1091 spin_lock_irq(&conf->device_lock); 1133 spin_lock_irq(&conf->device_lock);
1092 /* fail all writes first */ 1134 /* fail all writes first */
1093 bi = sh->dev[i].towrite; 1135 bi = sh->dev[i].towrite;
@@ -1123,7 +1165,8 @@ static void handle_stripe(struct stripe_head *sh)
1123 } 1165 }
1124 1166
1125 /* fail any reads if this device is non-operational */ 1167 /* fail any reads if this device is non-operational */
1126 if (!test_bit(R5_Insync, &sh->dev[i].flags)) { 1168 if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
1169 test_bit(R5_ReadError, &sh->dev[i].flags)) {
1127 bi = sh->dev[i].toread; 1170 bi = sh->dev[i].toread;
1128 sh->dev[i].toread = NULL; 1171 sh->dev[i].toread = NULL;
1129 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 1172 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
@@ -1228,7 +1271,7 @@ static void handle_stripe(struct stripe_head *sh)
1228 if (uptodate == disks-1) { 1271 if (uptodate == disks-1) {
1229 PRINTK("Computing stripe %llu block %d\n", 1272 PRINTK("Computing stripe %llu block %d\n",
1230 (unsigned long long)sh->sector, i); 1273 (unsigned long long)sh->sector, i);
1231 compute_block_1(sh, i); 1274 compute_block_1(sh, i, 0);
1232 uptodate++; 1275 uptodate++;
1233 } else if ( uptodate == disks-2 && failed >= 2 ) { 1276 } else if ( uptodate == disks-2 && failed >= 2 ) {
1234 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */ 1277 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
@@ -1259,9 +1302,6 @@ static void handle_stripe(struct stripe_head *sh)
1259 locked++; 1302 locked++;
1260 PRINTK("Reading block %d (sync=%d)\n", 1303 PRINTK("Reading block %d (sync=%d)\n",
1261 i, syncing); 1304 i, syncing);
1262 if (syncing)
1263 md_sync_acct(conf->disks[i].rdev->bdev,
1264 STRIPE_SECTORS);
1265 } 1305 }
1266 } 1306 }
1267 } 1307 }
@@ -1323,7 +1363,7 @@ static void handle_stripe(struct stripe_head *sh)
1323 /* We have failed blocks and need to compute them */ 1363 /* We have failed blocks and need to compute them */
1324 switch ( failed ) { 1364 switch ( failed ) {
1325 case 0: BUG(); 1365 case 0: BUG();
1326 case 1: compute_block_1(sh, failed_num[0]); break; 1366 case 1: compute_block_1(sh, failed_num[0], 0); break;
1327 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break; 1367 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
1328 default: BUG(); /* This request should have been failed? */ 1368 default: BUG(); /* This request should have been failed? */
1329 } 1369 }
@@ -1338,12 +1378,10 @@ static void handle_stripe(struct stripe_head *sh)
1338 (unsigned long long)sh->sector, i); 1378 (unsigned long long)sh->sector, i);
1339 locked++; 1379 locked++;
1340 set_bit(R5_Wantwrite, &sh->dev[i].flags); 1380 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1341#if 0 /**** FIX: I don't understand the logic here... ****/
1342 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1343 || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */
1344 set_bit(STRIPE_INSYNC, &sh->state);
1345#endif
1346 } 1381 }
1382 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
1383 set_bit(STRIPE_INSYNC, &sh->state);
1384
1347 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 1385 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1348 atomic_dec(&conf->preread_active_stripes); 1386 atomic_dec(&conf->preread_active_stripes);
1349 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 1387 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
@@ -1356,84 +1394,119 @@ static void handle_stripe(struct stripe_head *sh)
1356 * Any reads will already have been scheduled, so we just see if enough data 1394 * Any reads will already have been scheduled, so we just see if enough data
1357 * is available 1395 * is available
1358 */ 1396 */
1359 if (syncing && locked == 0 && 1397 if (syncing && locked == 0 && !test_bit(STRIPE_INSYNC, &sh->state)) {
1360 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) { 1398 int update_p = 0, update_q = 0;
1361 set_bit(STRIPE_HANDLE, &sh->state); 1399 struct r5dev *dev;
1362#if 0 /* RAID-6: Don't support CHECK PARITY yet */
1363 if (failed == 0) {
1364 char *pagea;
1365 if (uptodate != disks)
1366 BUG();
1367 compute_parity(sh, CHECK_PARITY);
1368 uptodate--;
1369 pagea = page_address(sh->dev[pd_idx].page);
1370 if ((*(u32*)pagea) == 0 &&
1371 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1372 /* parity is correct (on disc, not in buffer any more) */
1373 set_bit(STRIPE_INSYNC, &sh->state);
1374 }
1375 }
1376#endif
1377 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1378 int failed_needupdate[2];
1379 struct r5dev *adev, *bdev;
1380
1381 if ( failed < 1 )
1382 failed_num[0] = pd_idx;
1383 if ( failed < 2 )
1384 failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;
1385 1400
1386 failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags); 1401 set_bit(STRIPE_HANDLE, &sh->state);
1387 failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);
1388 1402
1389 PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n", 1403 BUG_ON(failed>2);
1390 failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]); 1404 BUG_ON(uptodate < disks);
1405 /* Want to check and possibly repair P and Q.
1406 * However there could be one 'failed' device, in which
1407 * case we can only check one of them, possibly using the
1408 * other to generate missing data
1409 */
1391 1410
1392#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */ 1411 /* If !tmp_page, we cannot do the calculations,
1393 /* should be able to compute the missing block(s) and write to spare */ 1412 * but as we have set STRIPE_HANDLE, we will soon be called
1394 if ( failed_needupdate[0] ^ failed_needupdate[1] ) { 1413 * by handle_stripe with a tmp_page - just wait until then.
1395 if (uptodate+1 != disks) 1414 */
1396 BUG(); 1415 if (tmp_page) {
1397 compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]); 1416 if (failed == q_failed) {
1398 uptodate++; 1417 /* The only possible failed device holds 'Q', so it makes
1399 } else if ( failed_needupdate[0] & failed_needupdate[1] ) { 1418 * sense to check P (If anything else were failed, we would
1400 if (uptodate+2 != disks) 1419 * have used P to recreate it).
1401 BUG(); 1420 */
1402 compute_block_2(sh, failed_num[0], failed_num[1]); 1421 compute_block_1(sh, pd_idx, 1);
1403 uptodate += 2; 1422 if (!page_is_zero(sh->dev[pd_idx].page)) {
1423 compute_block_1(sh,pd_idx,0);
1424 update_p = 1;
1425 }
1426 }
1427 if (!q_failed && failed < 2) {
1428 /* q is not failed, and we didn't use it to generate
1429 * anything, so it makes sense to check it
1430 */
1431 memcpy(page_address(tmp_page),
1432 page_address(sh->dev[qd_idx].page),
1433 STRIPE_SIZE);
1434 compute_parity(sh, UPDATE_PARITY);
1435 if (memcmp(page_address(tmp_page),
1436 page_address(sh->dev[qd_idx].page),
1437 STRIPE_SIZE)!= 0) {
1438 clear_bit(STRIPE_INSYNC, &sh->state);
1439 update_q = 1;
1440 }
1441 }
1442 if (update_p || update_q) {
1443 conf->mddev->resync_mismatches += STRIPE_SECTORS;
1444 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
1445 /* don't try to repair!! */
1446 update_p = update_q = 0;
1404 } 1447 }
1405#else
1406 compute_block_2(sh, failed_num[0], failed_num[1]);
1407 uptodate += failed_needupdate[0] + failed_needupdate[1];
1408#endif
1409 1448
1410 if (uptodate != disks) 1449 /* now write out any block on a failed drive,
1411 BUG(); 1450 * or P or Q if they need it
1451 */
1412 1452
1413 PRINTK("Marking for sync stripe %llu blocks %d,%d\n", 1453 if (failed == 2) {
1414 (unsigned long long)sh->sector, failed_num[0], failed_num[1]); 1454 dev = &sh->dev[failed_num[1]];
1455 locked++;
1456 set_bit(R5_LOCKED, &dev->flags);
1457 set_bit(R5_Wantwrite, &dev->flags);
1458 }
1459 if (failed >= 1) {
1460 dev = &sh->dev[failed_num[0]];
1461 locked++;
1462 set_bit(R5_LOCKED, &dev->flags);
1463 set_bit(R5_Wantwrite, &dev->flags);
1464 }
1415 1465
1416 /**** FIX: Should we really do both of these unconditionally? ****/ 1466 if (update_p) {
1417 adev = &sh->dev[failed_num[0]]; 1467 dev = &sh->dev[pd_idx];
1418 locked += !test_bit(R5_LOCKED, &adev->flags); 1468 locked ++;
1419 set_bit(R5_LOCKED, &adev->flags); 1469 set_bit(R5_LOCKED, &dev->flags);
1420 set_bit(R5_Wantwrite, &adev->flags); 1470 set_bit(R5_Wantwrite, &dev->flags);
1421 bdev = &sh->dev[failed_num[1]]; 1471 }
1422 locked += !test_bit(R5_LOCKED, &bdev->flags); 1472 if (update_q) {
1423 set_bit(R5_LOCKED, &bdev->flags); 1473 dev = &sh->dev[qd_idx];
1474 locked++;
1475 set_bit(R5_LOCKED, &dev->flags);
1476 set_bit(R5_Wantwrite, &dev->flags);
1477 }
1424 clear_bit(STRIPE_DEGRADED, &sh->state); 1478 clear_bit(STRIPE_DEGRADED, &sh->state);
1425 set_bit(R5_Wantwrite, &bdev->flags);
1426 1479
1427 set_bit(STRIPE_INSYNC, &sh->state); 1480 set_bit(STRIPE_INSYNC, &sh->state);
1428 set_bit(R5_Syncio, &adev->flags);
1429 set_bit(R5_Syncio, &bdev->flags);
1430 } 1481 }
1431 } 1482 }
1483
1432 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 1484 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1433 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 1485 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1434 clear_bit(STRIPE_SYNCING, &sh->state); 1486 clear_bit(STRIPE_SYNCING, &sh->state);
1435 } 1487 }
1436 1488
1489 /* If the failed drives are just a ReadError, then we might need
1490 * to progress the repair/check process
1491 */
1492 if (failed <= 2 && ! conf->mddev->ro)
1493 for (i=0; i<failed;i++) {
1494 dev = &sh->dev[failed_num[i]];
1495 if (test_bit(R5_ReadError, &dev->flags)
1496 && !test_bit(R5_LOCKED, &dev->flags)
1497 && test_bit(R5_UPTODATE, &dev->flags)
1498 ) {
1499 if (!test_bit(R5_ReWrite, &dev->flags)) {
1500 set_bit(R5_Wantwrite, &dev->flags);
1501 set_bit(R5_ReWrite, &dev->flags);
1502 set_bit(R5_LOCKED, &dev->flags);
1503 } else {
1504 /* let's read it back */
1505 set_bit(R5_Wantread, &dev->flags);
1506 set_bit(R5_LOCKED, &dev->flags);
1507 }
1508 }
1509 }
1437 spin_unlock(&sh->lock); 1510 spin_unlock(&sh->lock);
1438 1511
1439 while ((bi=return_bi)) { 1512 while ((bi=return_bi)) {
@@ -1472,7 +1545,7 @@ static void handle_stripe(struct stripe_head *sh)
1472 rcu_read_unlock(); 1545 rcu_read_unlock();
1473 1546
1474 if (rdev) { 1547 if (rdev) {
1475 if (test_bit(R5_Syncio, &sh->dev[i].flags)) 1548 if (syncing)
1476 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 1549 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1477 1550
1478 bi->bi_bdev = rdev->bdev; 1551 bi->bi_bdev = rdev->bdev;
@@ -1489,6 +1562,9 @@ static void handle_stripe(struct stripe_head *sh)
1489 bi->bi_io_vec[0].bv_offset = 0; 1562 bi->bi_io_vec[0].bv_offset = 0;
1490 bi->bi_size = STRIPE_SIZE; 1563 bi->bi_size = STRIPE_SIZE;
1491 bi->bi_next = NULL; 1564 bi->bi_next = NULL;
1565 if (rw == WRITE &&
1566 test_bit(R5_ReWrite, &sh->dev[i].flags))
1567 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1492 generic_make_request(bi); 1568 generic_make_request(bi);
1493 } else { 1569 } else {
1494 if (rw == 1) 1570 if (rw == 1)
@@ -1664,7 +1740,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
1664 } 1740 }
1665 finish_wait(&conf->wait_for_overlap, &w); 1741 finish_wait(&conf->wait_for_overlap, &w);
1666 raid6_plug_device(conf); 1742 raid6_plug_device(conf);
1667 handle_stripe(sh); 1743 handle_stripe(sh, NULL);
1668 release_stripe(sh); 1744 release_stripe(sh);
1669 } else { 1745 } else {
1670 /* cannot get stripe for read-ahead, just give-up */ 1746 /* cannot get stripe for read-ahead, just give-up */
@@ -1728,6 +1804,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1728 return rv; 1804 return rv;
1729 } 1805 }
1730 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1806 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1807 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
1731 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 1808 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
1732 /* we can skip this block, and probably more */ 1809 /* we can skip this block, and probably more */
1733 sync_blocks /= STRIPE_SECTORS; 1810 sync_blocks /= STRIPE_SECTORS;
@@ -1765,7 +1842,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
1765 clear_bit(STRIPE_INSYNC, &sh->state); 1842 clear_bit(STRIPE_INSYNC, &sh->state);
1766 spin_unlock(&sh->lock); 1843 spin_unlock(&sh->lock);
1767 1844
1768 handle_stripe(sh); 1845 handle_stripe(sh, NULL);
1769 release_stripe(sh); 1846 release_stripe(sh);
1770 1847
1771 return STRIPE_SECTORS; 1848 return STRIPE_SECTORS;
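
The extra MD_RECOVERY_REQUESTED test keeps a user-requested check/repair from skipping blocks just because the write-intent bitmap considers them in sync; only regular resyncs may take the shortcut. As a predicate, restating the condition above rather than adding logic:

    static int may_skip(int bitmap_says_clean, int user_requested,
                        int fullsync, unsigned sync_blocks, unsigned stripe_sectors)
    {
        return bitmap_says_clean && !user_requested &&
               !fullsync && sync_blocks >= stripe_sectors;
    }
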
@@ -1821,7 +1898,7 @@ static void raid6d (mddev_t *mddev)
1821 spin_unlock_irq(&conf->device_lock); 1898 spin_unlock_irq(&conf->device_lock);
1822 1899
1823 handled++; 1900 handled++;
1824 handle_stripe(sh); 1901 handle_stripe(sh, conf->spare_page);
1825 release_stripe(sh); 1902 release_stripe(sh);
1826 1903
1827 spin_lock_irq(&conf->device_lock); 1904 spin_lock_irq(&conf->device_lock);
@@ -1848,17 +1925,19 @@ static int run(mddev_t *mddev)
1848 return -EIO; 1925 return -EIO;
1849 } 1926 }
1850 1927
1851 mddev->private = kmalloc (sizeof (raid6_conf_t) 1928 mddev->private = kzalloc(sizeof (raid6_conf_t)
1852 + mddev->raid_disks * sizeof(struct disk_info), 1929 + mddev->raid_disks * sizeof(struct disk_info),
1853 GFP_KERNEL); 1930 GFP_KERNEL);
1854 if ((conf = mddev->private) == NULL) 1931 if ((conf = mddev->private) == NULL)
1855 goto abort; 1932 goto abort;
1856 memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) );
1857 conf->mddev = mddev; 1933 conf->mddev = mddev;
1858 1934
1859 if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) 1935 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
1936 goto abort;
1937
1938 conf->spare_page = alloc_page(GFP_KERNEL);
1939 if (!conf->spare_page)
1860 goto abort; 1940 goto abort;
1861 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1862 1941
1863 spin_lock_init(&conf->device_lock); 1942 spin_lock_init(&conf->device_lock);
1864 init_waitqueue_head(&conf->wait_for_stripe); 1943 init_waitqueue_head(&conf->wait_for_stripe);
@@ -1929,13 +2008,18 @@ static int run(mddev_t *mddev)
1929 goto abort; 2008 goto abort;
1930 } 2009 }
1931 2010
1932#if 0 /* FIX: For now */
1933 if (mddev->degraded > 0 && 2011 if (mddev->degraded > 0 &&
1934 mddev->recovery_cp != MaxSector) { 2012 mddev->recovery_cp != MaxSector) {
1935 printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev)); 2013 if (mddev->ok_start_degraded)
1936 goto abort; 2014 printk(KERN_WARNING "raid6: starting dirty degraded array:%s"
2015 "- data corruption possible.\n",
2016 mdname(mddev));
2017 else {
2018 printk(KERN_ERR "raid6: cannot start dirty degraded array"
2019 " for %s\n", mdname(mddev));
2020 goto abort;
2021 }
1937 } 2022 }
1938#endif
1939 2023
1940 { 2024 {
1941 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6"); 2025 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
@@ -1977,7 +2061,7 @@ static int run(mddev_t *mddev)
1977 */ 2061 */
1978 { 2062 {
1979 int stripe = (mddev->raid_disks-2) * mddev->chunk_size 2063 int stripe = (mddev->raid_disks-2) * mddev->chunk_size
1980 / PAGE_CACHE_SIZE; 2064 / PAGE_SIZE;
1981 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 2065 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
1982 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 2066 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
1983 } 2067 }
@@ -1985,18 +2069,14 @@ static int run(mddev_t *mddev)
1985 /* Ok, everything is just fine now */ 2069 /* Ok, everything is just fine now */
1986 mddev->array_size = mddev->size * (mddev->raid_disks - 2); 2070 mddev->array_size = mddev->size * (mddev->raid_disks - 2);
1987 2071
1988 if (mddev->bitmap)
1989 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1990
1991 mddev->queue->unplug_fn = raid6_unplug_device; 2072 mddev->queue->unplug_fn = raid6_unplug_device;
1992 mddev->queue->issue_flush_fn = raid6_issue_flush; 2073 mddev->queue->issue_flush_fn = raid6_issue_flush;
1993 return 0; 2074 return 0;
1994abort: 2075abort:
1995 if (conf) { 2076 if (conf) {
1996 print_raid6_conf(conf); 2077 print_raid6_conf(conf);
1997 if (conf->stripe_hashtbl) 2078 safe_put_page(conf->spare_page);
1998 free_pages((unsigned long) conf->stripe_hashtbl, 2079 kfree(conf->stripe_hashtbl);
1999 HASH_PAGES_ORDER);
2000 kfree(conf); 2080 kfree(conf);
2001 } 2081 }
2002 mddev->private = NULL; 2082 mddev->private = NULL;
@@ -2013,7 +2093,7 @@ static int stop (mddev_t *mddev)
2013 md_unregister_thread(mddev->thread); 2093 md_unregister_thread(mddev->thread);
2014 mddev->thread = NULL; 2094 mddev->thread = NULL;
2015 shrink_stripes(conf); 2095 shrink_stripes(conf);
2016 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); 2096 kfree(conf->stripe_hashtbl);
2017 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 2097 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
2018 kfree(conf); 2098 kfree(conf);
2019 mddev->private = NULL; 2099 mddev->private = NULL;
@@ -2040,12 +2120,13 @@ static void print_sh (struct seq_file *seq, struct stripe_head *sh)
2040static void printall (struct seq_file *seq, raid6_conf_t *conf) 2120static void printall (struct seq_file *seq, raid6_conf_t *conf)
2041{ 2121{
2042 struct stripe_head *sh; 2122 struct stripe_head *sh;
2123 struct hlist_node *hn;
2043 int i; 2124 int i;
2044 2125
2045 spin_lock_irq(&conf->device_lock); 2126 spin_lock_irq(&conf->device_lock);
2046 for (i = 0; i < NR_HASH; i++) { 2127 for (i = 0; i < NR_HASH; i++) {
2047 sh = conf->stripe_hashtbl[i]; 2128 sh = conf->stripe_hashtbl[i];
2048 for (; sh; sh = sh->hash_next) { 2129 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
2049 if (sh->raid_conf != conf) 2130 if (sh->raid_conf != conf)
2050 continue; 2131 continue;
2051 print_sh(seq, sh); 2132 print_sh(seq, sh);
@@ -2223,17 +2304,12 @@ static void raid6_quiesce(mddev_t *mddev, int state)
2223 spin_unlock_irq(&conf->device_lock); 2304 spin_unlock_irq(&conf->device_lock);
2224 break; 2305 break;
2225 } 2306 }
2226 if (mddev->thread) {
2227 if (mddev->bitmap)
2228 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2229 else
2230 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2231 md_wakeup_thread(mddev->thread);
2232 }
2233} 2307}
2234static mdk_personality_t raid6_personality= 2308
2309static struct mdk_personality raid6_personality =
2235{ 2310{
2236 .name = "raid6", 2311 .name = "raid6",
2312 .level = 6,
2237 .owner = THIS_MODULE, 2313 .owner = THIS_MODULE,
2238 .make_request = make_request, 2314 .make_request = make_request,
2239 .run = run, 2315 .run = run,
@@ -2248,7 +2324,7 @@ static mdk_personality_t raid6_personality=
2248 .quiesce = raid6_quiesce, 2324 .quiesce = raid6_quiesce,
2249}; 2325};
2250 2326
2251static int __init raid6_init (void) 2327static int __init raid6_init(void)
2252{ 2328{
2253 int e; 2329 int e;
2254 2330
@@ -2256,15 +2332,17 @@ static int __init raid6_init (void)
2256 if ( e ) 2332 if ( e )
2257 return e; 2333 return e;
2258 2334
2259 return register_md_personality (RAID6, &raid6_personality); 2335 return register_md_personality(&raid6_personality);
2260} 2336}
2261 2337
2262static void raid6_exit (void) 2338static void raid6_exit (void)
2263{ 2339{
2264 unregister_md_personality (RAID6); 2340 unregister_md_personality(&raid6_personality);
2265} 2341}
2266 2342
2267module_init(raid6_init); 2343module_init(raid6_init);
2268module_exit(raid6_exit); 2344module_exit(raid6_exit);
2269MODULE_LICENSE("GPL"); 2345MODULE_LICENSE("GPL");
2270MODULE_ALIAS("md-personality-8"); /* RAID6 */ 2346MODULE_ALIAS("md-personality-8"); /* RAID6 */
2347MODULE_ALIAS("md-raid6");
2348MODULE_ALIAS("md-level-6");
diff --git a/drivers/media/video/cpia_pp.c b/drivers/media/video/cpia_pp.c
index ddf184f95d80..6861d408f1b3 100644
--- a/drivers/media/video/cpia_pp.c
+++ b/drivers/media/video/cpia_pp.c
@@ -170,16 +170,9 @@ static size_t cpia_read_nibble (struct parport *port,
170 /* Does the error line indicate end of data? */ 170 /* Does the error line indicate end of data? */
171 if (((i /*& 1*/) == 0) && 171 if (((i /*& 1*/) == 0) &&
172 (parport_read_status(port) & PARPORT_STATUS_ERROR)) { 172 (parport_read_status(port) & PARPORT_STATUS_ERROR)) {
173 port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA; 173 DBG("%s: No more nibble data (%d bytes)\n",
174 DBG("%s: No more nibble data (%d bytes)\n", 174 port->name, i/2);
175 port->name, i/2); 175 goto end_of_data;
176
177 /* Go to reverse idle phase. */
178 parport_frob_control (port,
179 PARPORT_CONTROL_AUTOFD,
180 PARPORT_CONTROL_AUTOFD);
181 port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
182 break;
183 } 176 }
184 177
185 /* Event 7: Set nAutoFd low. */ 178 /* Event 7: Set nAutoFd low. */
@@ -227,18 +220,21 @@ static size_t cpia_read_nibble (struct parport *port,
227 byte = nibble; 220 byte = nibble;
228 } 221 }
229 222
230 i /= 2; /* i is now in bytes */
231
232 if (i == len) { 223 if (i == len) {
233 /* Read the last nibble without checking data avail. */ 224 /* Read the last nibble without checking data avail. */
234 port = port->physport; 225 if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
235 if (parport_read_status (port) & PARPORT_STATUS_ERROR) 226 end_of_data:
236 port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA; 227 /* Go to reverse idle phase. */
228 parport_frob_control (port,
229 PARPORT_CONTROL_AUTOFD,
230 PARPORT_CONTROL_AUTOFD);
231 port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
232 }
237 else 233 else
238 port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL; 234 port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
239 } 235 }
240 236
241 return i; 237 return i/2;
242} 238}
243 239
244/* CPiA nonstandard "Nibble Stream" mode (2 nibbles per cycle, instead of 1) 240/* CPiA nonstandard "Nibble Stream" mode (2 nibbles per cycle, instead of 1)
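
The restructured function now keeps i in nibbles for its whole life and converts to bytes exactly once, at the return (i/2), instead of halving i mid-loop; the shared "end of data" handling moves behind the end_of_data label so both exits set up the reverse-idle phase the same way. For reference, a sketch of how two 4-bit transfers form one byte in this mode (low nibble first is an assumption drawn from the surrounding driver code):

    static size_t nibbles_to_bytes(const unsigned char *nib, size_t nnib,
                                   unsigned char *out)
    {
        size_t i;

        for (i = 0; i + 1 < nnib; i += 2)
            out[i / 2] = (nib[i] & 0x0f) | ((nib[i + 1] & 0x0f) << 4);
        return i / 2;                /* bytes, mirroring 'return i/2' above */
    }
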
diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig
index 43a942a29c2e..fef677103880 100644
--- a/drivers/message/i2o/Kconfig
+++ b/drivers/message/i2o/Kconfig
@@ -24,6 +24,18 @@ config I2O
24 24
25 If unsure, say N. 25 If unsure, say N.
26 26
27config I2O_LCT_NOTIFY_ON_CHANGES
28 bool "Enable LCT notification"
29 depends on I2O
30 default y
31 ---help---
 32 Only say N here if you have an I2O controller from SUN. The SUN
33 firmware doesn't support LCT notification on changes. If this option
 34 is enabled on such a controller, the driver will hang up in an endless
35 loop. On all other controllers say Y.
36
37 If unsure, say Y.
38
27config I2O_EXT_ADAPTEC 39config I2O_EXT_ADAPTEC
28 bool "Enable Adaptec extensions" 40 bool "Enable Adaptec extensions"
29 depends on I2O 41 depends on I2O
diff --git a/drivers/message/i2o/bus-osm.c b/drivers/message/i2o/bus-osm.c
index 151b228e1cb3..ac06f10c54ec 100644
--- a/drivers/message/i2o/bus-osm.c
+++ b/drivers/message/i2o/bus-osm.c
@@ -17,7 +17,7 @@
17#include <linux/i2o.h> 17#include <linux/i2o.h>
18 18
19#define OSM_NAME "bus-osm" 19#define OSM_NAME "bus-osm"
20#define OSM_VERSION "$Rev$" 20#define OSM_VERSION "1.317"
21#define OSM_DESCRIPTION "I2O Bus Adapter OSM" 21#define OSM_DESCRIPTION "I2O Bus Adapter OSM"
22 22
23static struct i2o_driver i2o_bus_driver; 23static struct i2o_driver i2o_bus_driver;
@@ -39,18 +39,18 @@ static struct i2o_class_id i2o_bus_class_id[] = {
39 */ 39 */
40static int i2o_bus_scan(struct i2o_device *dev) 40static int i2o_bus_scan(struct i2o_device *dev)
41{ 41{
42 struct i2o_message __iomem *msg; 42 struct i2o_message *msg;
43 u32 m;
44 43
45 m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET); 44 msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
46 if (m == I2O_QUEUE_EMPTY) 45 if (IS_ERR(msg))
47 return -ETIMEDOUT; 46 return -ETIMEDOUT;
48 47
49 writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 48 msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
50 writel(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.tid, 49 msg->u.head[1] =
51 &msg->u.head[1]); 50 cpu_to_le32(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.
51 tid);
52 52
53 return i2o_msg_post_wait(dev->iop, m, 60); 53 return i2o_msg_post_wait(dev->iop, msg, 60);
54}; 54};
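
This hunk shows the shape of the whole i2o conversion: messages are no longer MMIO frames addressed by a u32 handle and filled with writel(), but ordinary memory obtained from i2o_msg_get_wait() (which now reports failure as an ERR_PTR) and filled with cpu_to_le32() stores. The pattern, condensed into one hedged helper using the same calls as this patch:

    static int send_simple_cmd(struct i2o_controller *c, u32 cmd, u32 tid)
    {
        struct i2o_message *msg;

        msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
        if (IS_ERR(msg))
            return PTR_ERR(msg);

        msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
        msg->u.head[1] = cpu_to_le32(cmd << 24 | HOST_TID << 12 | tid);

        return i2o_msg_post_wait(c, msg, 60);   /* 60 second timeout */
    }
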
55 55
56/** 56/**
@@ -59,8 +59,9 @@ static int i2o_bus_scan(struct i2o_device *dev)
59 * 59 *
60 * Returns count. 60 * Returns count.
61 */ 61 */
62static ssize_t i2o_bus_store_scan(struct device *d, struct device_attribute *attr, const char *buf, 62static ssize_t i2o_bus_store_scan(struct device *d,
63 size_t count) 63 struct device_attribute *attr,
64 const char *buf, size_t count)
64{ 65{
65 struct i2o_device *i2o_dev = to_i2o_device(d); 66 struct i2o_device *i2o_dev = to_i2o_device(d);
66 int rc; 67 int rc;
diff --git a/drivers/message/i2o/config-osm.c b/drivers/message/i2o/config-osm.c
index 10432f665201..3bba7aa82e58 100644
--- a/drivers/message/i2o/config-osm.c
+++ b/drivers/message/i2o/config-osm.c
@@ -22,7 +22,7 @@
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23 23
24#define OSM_NAME "config-osm" 24#define OSM_NAME "config-osm"
25#define OSM_VERSION "1.248" 25#define OSM_VERSION "1.323"
26#define OSM_DESCRIPTION "I2O Configuration OSM" 26#define OSM_DESCRIPTION "I2O Configuration OSM"
27 27
28/* access mode user rw */ 28/* access mode user rw */
diff --git a/drivers/message/i2o/core.h b/drivers/message/i2o/core.h
index 9eefedb16211..90628562851e 100644
--- a/drivers/message/i2o/core.h
+++ b/drivers/message/i2o/core.h
@@ -14,8 +14,6 @@
14 */ 14 */
15 15
16/* Exec-OSM */ 16/* Exec-OSM */
17extern struct bus_type i2o_bus_type;
18
19extern struct i2o_driver i2o_exec_driver; 17extern struct i2o_driver i2o_exec_driver;
20extern int i2o_exec_lct_get(struct i2o_controller *); 18extern int i2o_exec_lct_get(struct i2o_controller *);
21 19
@@ -23,6 +21,8 @@ extern int __init i2o_exec_init(void);
23extern void __exit i2o_exec_exit(void); 21extern void __exit i2o_exec_exit(void);
24 22
25/* driver */ 23/* driver */
24extern struct bus_type i2o_bus_type;
25
26extern int i2o_driver_dispatch(struct i2o_controller *, u32); 26extern int i2o_driver_dispatch(struct i2o_controller *, u32);
27 27
28extern int __init i2o_driver_init(void); 28extern int __init i2o_driver_init(void);
@@ -33,19 +33,27 @@ extern int __init i2o_pci_init(void);
33extern void __exit i2o_pci_exit(void); 33extern void __exit i2o_pci_exit(void);
34 34
35/* device */ 35/* device */
36extern struct device_attribute i2o_device_attrs[];
37
36extern void i2o_device_remove(struct i2o_device *); 38extern void i2o_device_remove(struct i2o_device *);
37extern int i2o_device_parse_lct(struct i2o_controller *); 39extern int i2o_device_parse_lct(struct i2o_controller *);
38 40
39/* IOP */ 41/* IOP */
40extern struct i2o_controller *i2o_iop_alloc(void); 42extern struct i2o_controller *i2o_iop_alloc(void);
41extern void i2o_iop_free(struct i2o_controller *); 43
44/**
45 * i2o_iop_free - Free the i2o_controller struct
46 * @c: I2O controller to free
47 */
48static inline void i2o_iop_free(struct i2o_controller *c)
49{
50 i2o_pool_free(&c->in_msg);
51 kfree(c);
52}
42 53
43extern int i2o_iop_add(struct i2o_controller *); 54extern int i2o_iop_add(struct i2o_controller *);
44extern void i2o_iop_remove(struct i2o_controller *); 55extern void i2o_iop_remove(struct i2o_controller *);
45 56
46/* config */
47extern int i2o_parm_issue(struct i2o_device *, int, void *, int, void *, int);
48
49/* control registers relative to c->base */ 57/* control registers relative to c->base */
50#define I2O_IRQ_STATUS 0x30 58#define I2O_IRQ_STATUS 0x30
51#define I2O_IRQ_MASK 0x34 59#define I2O_IRQ_MASK 0x34
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 8eb50cdb8ae1..ee183053fa23 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -35,18 +35,18 @@
35static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd, 35static inline int i2o_device_issue_claim(struct i2o_device *dev, u32 cmd,
36 u32 type) 36 u32 type)
37{ 37{
38 struct i2o_message __iomem *msg; 38 struct i2o_message *msg;
39 u32 m;
40 39
41 m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET); 40 msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
42 if (m == I2O_QUEUE_EMPTY) 41 if (IS_ERR(msg))
43 return -ETIMEDOUT; 42 return PTR_ERR(msg);
44 43
45 writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 44 msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
46 writel(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid, &msg->u.head[1]); 45 msg->u.head[1] =
47 writel(type, &msg->body[0]); 46 cpu_to_le32(cmd << 24 | HOST_TID << 12 | dev->lct_data.tid);
47 msg->body[0] = cpu_to_le32(type);
48 48
49 return i2o_msg_post_wait(dev->iop, m, 60); 49 return i2o_msg_post_wait(dev->iop, msg, 60);
50} 50}
51 51
52/** 52/**
@@ -123,7 +123,6 @@ int i2o_device_claim_release(struct i2o_device *dev)
123 return rc; 123 return rc;
124} 124}
125 125
126
127/** 126/**
128 * i2o_device_release - release the memory for a I2O device 127 * i2o_device_release - release the memory for a I2O device
129 * @dev: I2O device which should be released 128 * @dev: I2O device which should be released
@@ -140,10 +139,10 @@ static void i2o_device_release(struct device *dev)
140 kfree(i2o_dev); 139 kfree(i2o_dev);
141} 140}
142 141
143
144/** 142/**
145 * i2o_device_class_show_class_id - Displays class id of I2O device 143 * i2o_device_show_class_id - Displays class id of I2O device
146 * @cd: class device of which the class id should be displayed 144 * @dev: device of which the class id should be displayed
145 * @attr: pointer to device attribute
147 * @buf: buffer into which the class id should be printed 146 * @buf: buffer into which the class id should be printed
148 * 147 *
149 * Returns the number of bytes which are printed into the buffer. 148 * Returns the number of bytes which are printed into the buffer.
@@ -159,15 +158,15 @@ static ssize_t i2o_device_show_class_id(struct device *dev,
159} 158}
160 159
161/** 160/**
162 * i2o_device_class_show_tid - Displays TID of I2O device 161 * i2o_device_show_tid - Displays TID of I2O device
163 * @cd: class device of which the TID should be displayed 162 * @dev: device of which the TID should be displayed
164 * @buf: buffer into which the class id should be printed 163 * @attr: pointer to device attribute
164 * @buf: buffer into which the TID should be printed
165 * 165 *
166 * Returns the number of bytes which are printed into the buffer. 166 * Returns the number of bytes which are printed into the buffer.
167 */ 167 */
168static ssize_t i2o_device_show_tid(struct device *dev, 168static ssize_t i2o_device_show_tid(struct device *dev,
169 struct device_attribute *attr, 169 struct device_attribute *attr, char *buf)
170 char *buf)
171{ 170{
172 struct i2o_device *i2o_dev = to_i2o_device(dev); 171 struct i2o_device *i2o_dev = to_i2o_device(dev);
173 172
@@ -175,6 +174,7 @@ static ssize_t i2o_device_show_tid(struct device *dev,
175 return strlen(buf) + 1; 174 return strlen(buf) + 1;
176} 175}
177 176
177/* I2O device attributes */
178struct device_attribute i2o_device_attrs[] = { 178struct device_attribute i2o_device_attrs[] = {
179 __ATTR(class_id, S_IRUGO, i2o_device_show_class_id, NULL), 179 __ATTR(class_id, S_IRUGO, i2o_device_show_class_id, NULL),
180 __ATTR(tid, S_IRUGO, i2o_device_show_tid, NULL), 180 __ATTR(tid, S_IRUGO, i2o_device_show_tid, NULL),
@@ -193,12 +193,10 @@ static struct i2o_device *i2o_device_alloc(void)
193{ 193{
194 struct i2o_device *dev; 194 struct i2o_device *dev;
195 195
196 dev = kmalloc(sizeof(*dev), GFP_KERNEL); 196 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
197 if (!dev) 197 if (!dev)
198 return ERR_PTR(-ENOMEM); 198 return ERR_PTR(-ENOMEM);
199 199
200 memset(dev, 0, sizeof(*dev));
201
202 INIT_LIST_HEAD(&dev->list); 200 INIT_LIST_HEAD(&dev->list);
203 init_MUTEX(&dev->lock); 201 init_MUTEX(&dev->lock);
204 202
@@ -209,66 +207,6 @@ static struct i2o_device *i2o_device_alloc(void)
209} 207}
210 208
211/** 209/**
212 * i2o_setup_sysfs_links - Adds attributes to the I2O device
213 * @cd: I2O class device which is added to the I2O device class
214 *
215 * This function get called when a I2O device is added to the class. It
216 * creates the attributes for each device and creates user/parent symlink
217 * if necessary.
218 *
219 * Returns 0 on success or negative error code on failure.
220 */
221static void i2o_setup_sysfs_links(struct i2o_device *i2o_dev)
222{
223 struct i2o_controller *c = i2o_dev->iop;
224 struct i2o_device *tmp;
225
226 /* create user entries for this device */
227 tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
228 if (tmp && tmp != i2o_dev)
229 sysfs_create_link(&i2o_dev->device.kobj,
230 &tmp->device.kobj, "user");
231
232 /* create user entries refering to this device */
233 list_for_each_entry(tmp, &c->devices, list)
234 if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid &&
235 tmp != i2o_dev)
236 sysfs_create_link(&tmp->device.kobj,
237 &i2o_dev->device.kobj, "user");
238
239 /* create parent entries for this device */
240 tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
241 if (tmp && tmp != i2o_dev)
242 sysfs_create_link(&i2o_dev->device.kobj,
243 &tmp->device.kobj, "parent");
244
245 /* create parent entries refering to this device */
246 list_for_each_entry(tmp, &c->devices, list)
247 if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid &&
248 tmp != i2o_dev)
249 sysfs_create_link(&tmp->device.kobj,
250 &i2o_dev->device.kobj, "parent");
251}
252
253static void i2o_remove_sysfs_links(struct i2o_device *i2o_dev)
254{
255 struct i2o_controller *c = i2o_dev->iop;
256 struct i2o_device *tmp;
257
258 sysfs_remove_link(&i2o_dev->device.kobj, "parent");
259 sysfs_remove_link(&i2o_dev->device.kobj, "user");
260
261 list_for_each_entry(tmp, &c->devices, list) {
262 if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
263 sysfs_remove_link(&tmp->device.kobj, "parent");
264 if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
265 sysfs_remove_link(&tmp->device.kobj, "user");
266 }
267}
268
269
270
271/**
272 * i2o_device_add - allocate a new I2O device and add it to the IOP 210 * i2o_device_add - allocate a new I2O device and add it to the IOP
273 * @iop: I2O controller where the device is on 211 * @iop: I2O controller where the device is on
274 * @entry: LCT entry of the I2O device 212 * @entry: LCT entry of the I2O device
@@ -282,33 +220,57 @@ static void i2o_remove_sysfs_links(struct i2o_device *i2o_dev)
282static struct i2o_device *i2o_device_add(struct i2o_controller *c, 220static struct i2o_device *i2o_device_add(struct i2o_controller *c,
283 i2o_lct_entry * entry) 221 i2o_lct_entry * entry)
284{ 222{
285 struct i2o_device *dev; 223 struct i2o_device *i2o_dev, *tmp;
286 224
287 dev = i2o_device_alloc(); 225 i2o_dev = i2o_device_alloc();
288 if (IS_ERR(dev)) { 226 if (IS_ERR(i2o_dev)) {
289 printk(KERN_ERR "i2o: unable to allocate i2o device\n"); 227 printk(KERN_ERR "i2o: unable to allocate i2o device\n");
290 return dev; 228 return i2o_dev;
291 } 229 }
292 230
293 dev->lct_data = *entry; 231 i2o_dev->lct_data = *entry;
294 dev->iop = c;
295 232
296 snprintf(dev->device.bus_id, BUS_ID_SIZE, "%d:%03x", c->unit, 233 snprintf(i2o_dev->device.bus_id, BUS_ID_SIZE, "%d:%03x", c->unit,
297 dev->lct_data.tid); 234 i2o_dev->lct_data.tid);
298 235
299 dev->device.parent = &c->device; 236 i2o_dev->iop = c;
237 i2o_dev->device.parent = &c->device;
300 238
301 device_register(&dev->device); 239 device_register(&i2o_dev->device);
302 240
303 list_add_tail(&dev->list, &c->devices); 241 list_add_tail(&i2o_dev->list, &c->devices);
304 242
305 i2o_setup_sysfs_links(dev); 243 /* create user entries for this device */
244 tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
245 if (tmp && (tmp != i2o_dev))
246 sysfs_create_link(&i2o_dev->device.kobj, &tmp->device.kobj,
247 "user");
306 248
307 i2o_driver_notify_device_add_all(dev); 249 /* create user entries referring to this device */
250 list_for_each_entry(tmp, &c->devices, list)
251 if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
252 && (tmp != i2o_dev))
253 sysfs_create_link(&tmp->device.kobj,
254 &i2o_dev->device.kobj, "user");
308 255
309 pr_debug("i2o: device %s added\n", dev->device.bus_id); 256 /* create parent entries for this device */
257 tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
258 if (tmp && (tmp != i2o_dev))
259 sysfs_create_link(&i2o_dev->device.kobj, &tmp->device.kobj,
260 "parent");
310 261
311 return dev; 262 /* create parent entries referring to this device */
263 list_for_each_entry(tmp, &c->devices, list)
264 if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
265 && (tmp != i2o_dev))
266 sysfs_create_link(&tmp->device.kobj,
267 &i2o_dev->device.kobj, "parent");
268
269 i2o_driver_notify_device_add_all(i2o_dev);
270
271 pr_debug("i2o: device %s added\n", i2o_dev->device.bus_id);
272
273 return i2o_dev;
312} 274}
313 275
314/** 276/**
@@ -321,9 +283,22 @@ static struct i2o_device *i2o_device_add(struct i2o_controller *c,
321 */ 283 */
322void i2o_device_remove(struct i2o_device *i2o_dev) 284void i2o_device_remove(struct i2o_device *i2o_dev)
323{ 285{
286 struct i2o_device *tmp;
287 struct i2o_controller *c = i2o_dev->iop;
288
324 i2o_driver_notify_device_remove_all(i2o_dev); 289 i2o_driver_notify_device_remove_all(i2o_dev);
325 i2o_remove_sysfs_links(i2o_dev); 290
291 sysfs_remove_link(&i2o_dev->device.kobj, "parent");
292 sysfs_remove_link(&i2o_dev->device.kobj, "user");
293
294 list_for_each_entry(tmp, &c->devices, list) {
295 if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
296 sysfs_remove_link(&tmp->device.kobj, "parent");
297 if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
298 sysfs_remove_link(&tmp->device.kobj, "user");
299 }
326 list_del(&i2o_dev->list); 300 list_del(&i2o_dev->list);
301
327 device_unregister(&i2o_dev->device); 302 device_unregister(&i2o_dev->device);
328} 303}
329 304
@@ -341,56 +316,83 @@ int i2o_device_parse_lct(struct i2o_controller *c)
341{ 316{
342 struct i2o_device *dev, *tmp; 317 struct i2o_device *dev, *tmp;
343 i2o_lct *lct; 318 i2o_lct *lct;
344 int i; 319 u32 *dlct = c->dlct.virt;
345 int max; 320 int max = 0, i = 0;
321 u16 table_size;
322 u32 buf;
346 323
347 down(&c->lct_lock); 324 down(&c->lct_lock);
348 325
349 kfree(c->lct); 326 kfree(c->lct);
350 327
351 lct = c->dlct.virt; 328 buf = le32_to_cpu(*dlct++);
329 table_size = buf & 0xffff;
352 330
353 c->lct = kmalloc(lct->table_size * 4, GFP_KERNEL); 331 lct = c->lct = kmalloc(table_size * 4, GFP_KERNEL);
354 if (!c->lct) { 332 if (!lct) {
355 up(&c->lct_lock); 333 up(&c->lct_lock);
356 return -ENOMEM; 334 return -ENOMEM;
357 } 335 }
358 336
359 if (lct->table_size * 4 > c->dlct.len) { 337 lct->lct_ver = buf >> 28;
360 memcpy(c->lct, c->dlct.virt, c->dlct.len); 338 lct->boot_tid = buf >> 16 & 0xfff;
361 up(&c->lct_lock); 339 lct->table_size = table_size;
362 return -EAGAIN; 340 lct->change_ind = le32_to_cpu(*dlct++);
363 } 341 lct->iop_flags = le32_to_cpu(*dlct++);
364 342
365 memcpy(c->lct, c->dlct.virt, lct->table_size * 4); 343 table_size -= 3;
366
367 lct = c->lct;
368
369 max = (lct->table_size - 3) / 9;
370 344
371 pr_debug("%s: LCT has %d entries (LCT size: %d)\n", c->name, max, 345 pr_debug("%s: LCT has %d entries (LCT size: %d)\n", c->name, max,
372 lct->table_size); 346 lct->table_size);
373 347
374 /* remove devices, which are not in the LCT anymore */ 348 while (table_size > 0) {
375 list_for_each_entry_safe(dev, tmp, &c->devices, list) { 349 i2o_lct_entry *entry = &lct->lct_entry[max];
376 int found = 0; 350 int found = 0;
377 351
378 for (i = 0; i < max; i++) { 352 buf = le32_to_cpu(*dlct++);
379 if (lct->lct_entry[i].tid == dev->lct_data.tid) { 353 entry->entry_size = buf & 0xffff;
354 entry->tid = buf >> 16 & 0xfff;
355
356 entry->change_ind = le32_to_cpu(*dlct++);
357 entry->device_flags = le32_to_cpu(*dlct++);
358
359 buf = le32_to_cpu(*dlct++);
360 entry->class_id = buf & 0xfff;
361 entry->version = buf >> 12 & 0xf;
362 entry->vendor_id = buf >> 16;
363
364 entry->sub_class = le32_to_cpu(*dlct++);
365
366 buf = le32_to_cpu(*dlct++);
367 entry->user_tid = buf & 0xfff;
368 entry->parent_tid = buf >> 12 & 0xfff;
369 entry->bios_info = buf >> 24;
370
371 memcpy(&entry->identity_tag, dlct, 8);
372 dlct += 2;
373
374 entry->event_capabilities = le32_to_cpu(*dlct++);
375
376 /* add new devices, which are new in the LCT */
377 list_for_each_entry_safe(dev, tmp, &c->devices, list) {
378 if (entry->tid == dev->lct_data.tid) {
380 found = 1; 379 found = 1;
381 break; 380 break;
382 } 381 }
383 } 382 }
384 383
385 if (!found) 384 if (!found)
386 i2o_device_remove(dev); 385 i2o_device_add(c, entry);
386
387 table_size -= 9;
388 max++;
387 } 389 }
388 390
389 /* add new devices, which are new in the LCT */ 391 /* remove devices, which are not in the LCT anymore */
390 for (i = 0; i < max; i++) { 392 list_for_each_entry_safe(dev, tmp, &c->devices, list) {
391 int found = 0; 393 int found = 0;
392 394
393 list_for_each_entry_safe(dev, tmp, &c->devices, list) { 395 for (i = 0; i < max; i++) {
394 if (lct->lct_entry[i].tid == dev->lct_data.tid) { 396 if (lct->lct_entry[i].tid == dev->lct_data.tid) {
395 found = 1; 397 found = 1;
396 break; 398 break;
@@ -398,14 +400,14 @@ int i2o_device_parse_lct(struct i2o_controller *c)
398 } 400 }
399 401
400 if (!found) 402 if (!found)
401 i2o_device_add(c, &lct->lct_entry[i]); 403 i2o_device_remove(dev);
402 } 404 }
405
403 up(&c->lct_lock); 406 up(&c->lct_lock);
404 407
405 return 0; 408 return 0;
406} 409}
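
The parser no longer trusts a struct overlay on the DMA buffer; each little-endian word is fetched with le32_to_cpu() and unpacked by hand, which also lets it size the allocation before copying. The first header word, for example, packs three fields (layout as used in the code above):

    /* Sketch of the header-word decode; field layout taken from the patch. */
    static void decode_lct_word0(u32 le_word,
                                 unsigned *ver, unsigned *boot_tid, unsigned *size)
    {
        u32 buf = le32_to_cpu(le_word);

        *ver      = buf >> 28;           /* LCT version, bits 31..28      */
        *boot_tid = (buf >> 16) & 0xfff; /* boot device TID, bits 27..16  */
        *size     = buf & 0xffff;        /* table size in words, 15..0    */
    }
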
407 410
408
409/* 411/*
410 * Run time support routines 412 * Run time support routines
411 */ 413 */
@@ -419,13 +421,9 @@ int i2o_device_parse_lct(struct i2o_controller *c)
419 * ResultCount, ErrorInfoSize, BlockStatus and BlockSize. 421 * ResultCount, ErrorInfoSize, BlockStatus and BlockSize.
420 */ 422 */
421int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist, 423int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
422 int oplen, void *reslist, int reslen) 424 int oplen, void *reslist, int reslen)
423{ 425{
424 struct i2o_message __iomem *msg; 426 struct i2o_message *msg;
425 u32 m;
426 u32 *res32 = (u32 *) reslist;
427 u32 *restmp = (u32 *) reslist;
428 int len = 0;
429 int i = 0; 427 int i = 0;
430 int rc; 428 int rc;
431 struct i2o_dma res; 429 struct i2o_dma res;
@@ -437,26 +435,27 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
437 if (i2o_dma_alloc(dev, &res, reslen, GFP_KERNEL)) 435 if (i2o_dma_alloc(dev, &res, reslen, GFP_KERNEL))
438 return -ENOMEM; 436 return -ENOMEM;
439 437
440 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 438 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
441 if (m == I2O_QUEUE_EMPTY) { 439 if (IS_ERR(msg)) {
442 i2o_dma_free(dev, &res); 440 i2o_dma_free(dev, &res);
443 return -ETIMEDOUT; 441 return PTR_ERR(msg);
444 } 442 }
445 443
446 i = 0; 444 i = 0;
447 writel(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid, 445 msg->u.head[1] =
448 &msg->u.head[1]); 446 cpu_to_le32(cmd << 24 | HOST_TID << 12 | i2o_dev->lct_data.tid);
449 writel(0, &msg->body[i++]); 447 msg->body[i++] = cpu_to_le32(0x00000000);
450 writel(0x4C000000 | oplen, &msg->body[i++]); /* OperationList */ 448 msg->body[i++] = cpu_to_le32(0x4C000000 | oplen); /* OperationList */
451 memcpy_toio(&msg->body[i], oplist, oplen); 449 memcpy(&msg->body[i], oplist, oplen);
452 i += (oplen / 4 + (oplen % 4 ? 1 : 0)); 450 i += (oplen / 4 + (oplen % 4 ? 1 : 0));
453 writel(0xD0000000 | res.len, &msg->body[i++]); /* ResultList */ 451 msg->body[i++] = cpu_to_le32(0xD0000000 | res.len); /* ResultList */
454 writel(res.phys, &msg->body[i++]); 452 msg->body[i++] = cpu_to_le32(res.phys);
455 453
456 writel(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) | 454 msg->u.head[0] =
457 SGL_OFFSET_5, &msg->u.head[0]); 455 cpu_to_le32(I2O_MESSAGE_SIZE(i + sizeof(struct i2o_message) / 4) |
456 SGL_OFFSET_5);
458 457
459 rc = i2o_msg_post_wait_mem(c, m, 10, &res); 458 rc = i2o_msg_post_wait_mem(c, msg, 10, &res);
460 459
461 /* This only looks like a memory leak - don't "fix" it. */ 460 /* This only looks like a memory leak - don't "fix" it. */
462 if (rc == -ETIMEDOUT) 461 if (rc == -ETIMEDOUT)
@@ -465,36 +464,7 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
465 memcpy(reslist, res.virt, res.len); 464 memcpy(reslist, res.virt, res.len);
466 i2o_dma_free(dev, &res); 465 i2o_dma_free(dev, &res);
467 466
468 /* Query failed */ 467 return rc;
469 if (rc)
470 return rc;
471 /*
472 * Calculate the number of bytes in the Result LIST:
473 * we need to loop through each Result BLOCK and accumulate its length
474 */
475 restmp = res32 + 1;
476 len = 1;
477 for (i = 0; i < (res32[0] & 0X0000FFFF); i++) {
478 if (restmp[0] & 0x00FF0000) { /* BlockStatus != SUCCESS */
479 printk(KERN_WARNING
480 "%s - Error:\n ErrorInfoSize = 0x%02x, "
481 "BlockStatus = 0x%02x, BlockSize = 0x%04x\n",
482 (cmd ==
483 I2O_CMD_UTIL_PARAMS_SET) ? "PARAMS_SET" :
484 "PARAMS_GET", res32[1] >> 24,
485 (res32[1] >> 16) & 0xFF, res32[1] & 0xFFFF);
486
487 /*
488 * If this is the only request, then we return an error
489 */
490 if ((res32[0] & 0x0000FFFF) == 1) {
491 return -((res32[1] >> 16) & 0xFF); /* -BlockStatus */
492 }
493 }
494 len += restmp[0] & 0x0000FFFF; /* Length of res BLOCK */
495 restmp += restmp[0] & 0x0000FFFF; /* Skip to next BLOCK */
496 }
497 return (len << 2); /* bytes used by result list */
498} 468}
499 469
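Besides dropping the manual result-list walk, the hunk above switches message-frame allocation from the magic I2O_QUEUE_EMPTY handle to the kernel's ERR_PTR() convention, so the real errno propagates to callers. A minimal sketch of the calling pattern, assuming the post-conversion <linux/i2o.h> helpers and a hypothetical demo_ function:

    #include <linux/err.h>
    #include <linux/i2o.h>

    static int demo_post_nop(struct i2o_controller *c)
    {
            struct i2o_message *msg;

            msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
            if (IS_ERR(msg))
                    return PTR_ERR(msg);    /* typically -ETIMEDOUT */

            msg->u.head[0] = cpu_to_le32(THREE_WORD_MSG_SIZE | SGL_OFFSET_0);
            msg->u.head[1] = cpu_to_le32(I2O_CMD_UTIL_NOP << 24 |
                                         HOST_TID << 12 | ADAPTER_TID);
            i2o_msg_post(c, msg);
            return 0;
    }

The convention only works if every caller tests IS_ERR() before touching the frame. Note also the "memory leak" comment above: on -ETIMEDOUT the DMA buffer is deliberately not freed, because the controller may still write into it after the host has given up.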
500/* 470/*
@@ -503,28 +473,25 @@ int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
503int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field, 473int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field,
504 void *buf, int buflen) 474 void *buf, int buflen)
505{ 475{
506 u16 opblk[] = { 1, 0, I2O_PARAMS_FIELD_GET, group, 1, field }; 476 u32 opblk[] = { cpu_to_le32(0x00000001),
477 cpu_to_le32((u16) group << 16 | I2O_PARAMS_FIELD_GET),
478 cpu_to_le32((s16) field << 16 | 0x00000001)
479 };
507 u8 *resblk; /* 8 bytes for header */ 480 u8 *resblk; /* 8 bytes for header */
508 int size; 481 int rc;
509
510 if (field == -1) /* whole group */
511 opblk[4] = -1;
512 482
513 resblk = kmalloc(buflen + 8, GFP_KERNEL); 483 resblk = kmalloc(buflen + 8, GFP_KERNEL);
514 if (!resblk) 484 if (!resblk)
515 return -ENOMEM; 485 return -ENOMEM;
516 486
517 size = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk, 487 rc = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
518 sizeof(opblk), resblk, buflen + 8); 488 sizeof(opblk), resblk, buflen + 8);
519 489
520 memcpy(buf, resblk + 8, buflen); /* cut off header */ 490 memcpy(buf, resblk + 8, buflen); /* cut off header */
521 491
522 kfree(resblk); 492 kfree(resblk);
523 493
524 if (size > buflen) 494 return rc;
525 return buflen;
526
527 return size;
528} 495}
529 496
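After this rewrite i2o_parm_field_get() returns 0 on success instead of a byte count, and the fetched bytes stay little-endian. A short usage sketch consistent with the callers patched later in this diff (the demo_ wrapper is illustrative only):

    #include <linux/i2o.h>

    static int demo_read_vendor_id(struct i2o_device *dev, u16 *out)
    {
            __le16 id;
            int rc;

            rc = i2o_parm_field_get(dev, 0x0000, 0, &id, 2);
            if (rc)
                    return rc;              /* non-zero now means failure */

            *out = le16_to_cpu(id);         /* data arrives little-endian */
            return 0;
    }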
530/* 497/*
@@ -534,12 +501,12 @@ int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field,
534 * else return specific fields 501 * else return specific fields
535 * ibuf contains fieldindexes 502 * ibuf contains fieldindexes
536 * 503 *
537 * if oper == I2O_PARAMS_LIST_GET, get from specific rows 504 * if oper == I2O_PARAMS_LIST_GET, get from specific rows
538 * if fieldcount == -1 return all fields 505 * if fieldcount == -1 return all fields
539 * ibuf contains rowcount, keyvalues 506 * ibuf contains rowcount, keyvalues
540 * else return specific fields 507 * else return specific fields
541 * fieldcount is # of fieldindexes 508 * fieldcount is # of fieldindexes
542 * ibuf contains fieldindexes, rowcount, keyvalues 509 * ibuf contains fieldindexes, rowcount, keyvalues
543 * 510 *
544 * You could also use the function i2o_issue_params() directly. 511 * You could also use the function i2o_issue_params() directly.
545 */ 512 */
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 0fb9c4e2ad4c..64130227574f 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -61,12 +61,10 @@ static int i2o_bus_match(struct device *dev, struct device_driver *drv)
61}; 61};
62 62
63/* I2O bus type */ 63/* I2O bus type */
64extern struct device_attribute i2o_device_attrs[];
65
66struct bus_type i2o_bus_type = { 64struct bus_type i2o_bus_type = {
67 .name = "i2o", 65 .name = "i2o",
68 .match = i2o_bus_match, 66 .match = i2o_bus_match,
69 .dev_attrs = i2o_device_attrs, 67 .dev_attrs = i2o_device_attrs
70}; 68};
71 69
72/** 70/**
@@ -219,14 +217,14 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
219 /* cut off header from message size (in 32-bit words) */ 217 /* cut off header from message size (in 32-bit words) */
220 size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5; 218 size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5;
221 219
222 evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC | __GFP_ZERO); 220 evt = kzalloc(size * 4 + sizeof(*evt), GFP_ATOMIC);
223 if (!evt) 221 if (!evt)
224 return -ENOMEM; 222 return -ENOMEM;
225 223
226 evt->size = size; 224 evt->size = size;
227 evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt); 225 evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt);
228 evt->event_indicator = le32_to_cpu(msg->body[0]); 226 evt->event_indicator = le32_to_cpu(msg->body[0]);
229 memcpy(&evt->tcntxt, &msg->u.s.tcntxt, size * 4); 227 memcpy(&evt->data, &msg->body[1], size * 4);
230 228
231 list_for_each_entry_safe(dev, tmp, &c->devices, list) 229 list_for_each_entry_safe(dev, tmp, &c->devices, list)
232 if (dev->lct_data.tid == tid) { 230 if (dev->lct_data.tid == tid) {
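The event hunk above also fixes a real bug: the payload used to be memcpy'd over evt->tcntxt, clobbering the context assigned one line earlier; it now lands in evt->data. A simplified stand-in for the event layout implied by the code above (not the kernel's full struct i2o_event, which carries more bookkeeping):

    #include <linux/types.h>

    struct demo_event {
            u32 size;               /* payload length in 32-bit words */
            u32 tcntxt;             /* transaction context - must survive */
            u32 event_indicator;
            u32 data[];             /* memcpy target: size * 4 bytes */
    };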
@@ -349,12 +347,10 @@ int __init i2o_driver_init(void)
349 osm_info("max drivers = %d\n", i2o_max_drivers); 347 osm_info("max drivers = %d\n", i2o_max_drivers);
350 348
351 i2o_drivers = 349 i2o_drivers =
352 kmalloc(i2o_max_drivers * sizeof(*i2o_drivers), GFP_KERNEL); 350 kzalloc(i2o_max_drivers * sizeof(*i2o_drivers), GFP_KERNEL);
353 if (!i2o_drivers) 351 if (!i2o_drivers)
354 return -ENOMEM; 352 return -ENOMEM;
355 353
356 memset(i2o_drivers, 0, i2o_max_drivers * sizeof(*i2o_drivers));
357
358 rc = bus_register(&i2o_bus_type); 354 rc = bus_register(&i2o_bus_type);
359 355
360 if (rc < 0) 356 if (rc < 0)
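This and several later hunks collapse kmalloc()+memset() pairs into kzalloc(). The two forms are behaviorally equivalent; a sketch of what kzalloc() amounts to (illustrative only, not the kernel's actual implementation):

    #include <linux/slab.h>
    #include <linux/string.h>

    static inline void *demo_kzalloc(size_t size, gfp_t flags)
    {
            void *p = kmalloc(size, flags);

            if (p)
                    memset(p, 0, size);     /* zero-fill on success */
            return p;
    }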
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 9c339a2505b0..9bb9859f6dfe 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -33,7 +33,7 @@
33#include <linux/workqueue.h> 33#include <linux/workqueue.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
36#include <linux/sched.h> /* wait_event_interruptible_timeout() needs this */ 36#include <linux/sched.h> /* wait_event_interruptible_timeout() needs this */
37#include <asm/param.h> /* HZ */ 37#include <asm/param.h> /* HZ */
38#include "core.h" 38#include "core.h"
39 39
@@ -75,11 +75,9 @@ static struct i2o_exec_wait *i2o_exec_wait_alloc(void)
75{ 75{
76 struct i2o_exec_wait *wait; 76 struct i2o_exec_wait *wait;
77 77
78 wait = kmalloc(sizeof(*wait), GFP_KERNEL); 78 wait = kzalloc(sizeof(*wait), GFP_KERNEL);
79 if (!wait) 79 if (!wait)
80 return ERR_PTR(-ENOMEM); 80 return NULL;
81
82 memset(wait, 0, sizeof(*wait));
83 81
84 INIT_LIST_HEAD(&wait->list); 82 INIT_LIST_HEAD(&wait->list);
85 83
@@ -114,13 +112,12 @@ static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
114 * Returns 0 on success, negative error code on timeout or positive error 112 * Returns 0 on success, negative error code on timeout or positive error
115 * code from reply. 113 * code from reply.
116 */ 114 */
117int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long 115int i2o_msg_post_wait_mem(struct i2o_controller *c, struct i2o_message *msg,
118 timeout, struct i2o_dma *dma) 116 unsigned long timeout, struct i2o_dma *dma)
119{ 117{
120 DECLARE_WAIT_QUEUE_HEAD(wq); 118 DECLARE_WAIT_QUEUE_HEAD(wq);
121 struct i2o_exec_wait *wait; 119 struct i2o_exec_wait *wait;
122 static u32 tcntxt = 0x80000000; 120 static u32 tcntxt = 0x80000000;
123 struct i2o_message __iomem *msg = i2o_msg_in_to_virt(c, m);
124 int rc = 0; 121 int rc = 0;
125 122
126 wait = i2o_exec_wait_alloc(); 123 wait = i2o_exec_wait_alloc();
@@ -138,15 +135,15 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
138 * We will only use transaction contexts >= 0x80000000 for POST WAIT, 135 * We will only use transaction contexts >= 0x80000000 for POST WAIT,
139 * so we could find a POST WAIT reply easier in the reply handler. 136 * so we could find a POST WAIT reply easier in the reply handler.
140 */ 137 */
141 writel(i2o_exec_driver.context, &msg->u.s.icntxt); 138 msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
142 wait->tcntxt = tcntxt++; 139 wait->tcntxt = tcntxt++;
143 writel(wait->tcntxt, &msg->u.s.tcntxt); 140 msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt);
144 141
145 /* 142 /*
146 * Post the message to the controller. At some point later it will 143 * Post the message to the controller. At some point later it will
147 * return. If we time out before it returns then complete will be zero. 144 * return. If we time out before it returns then complete will be zero.
148 */ 145 */
149 i2o_msg_post(c, m); 146 i2o_msg_post(c, msg);
150 147
151 if (!wait->complete) { 148 if (!wait->complete) {
152 wait->wq = &wq; 149 wait->wq = &wq;
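The comment above is the key to the POST WAIT machinery: transaction contexts at or above 0x80000000 are reserved for synchronous post-wait requests, so the reply handler can separate them from ordinary asynchronous contexts with a single compare. A one-line sketch (the demo_ name is hypothetical):

    #include <linux/types.h>

    static inline int demo_is_post_wait_context(u32 tcntxt)
    {
            return tcntxt >= 0x80000000;    /* upper half = post-wait */
    }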
@@ -266,13 +263,14 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
266 * 263 *
267 * Returns number of bytes printed into buffer. 264 * Returns number of bytes printed into buffer.
268 */ 265 */
269static ssize_t i2o_exec_show_vendor_id(struct device *d, struct device_attribute *attr, char *buf) 266static ssize_t i2o_exec_show_vendor_id(struct device *d,
267 struct device_attribute *attr, char *buf)
270{ 268{
271 struct i2o_device *dev = to_i2o_device(d); 269 struct i2o_device *dev = to_i2o_device(d);
272 u16 id; 270 u16 id;
273 271
274 if (i2o_parm_field_get(dev, 0x0000, 0, &id, 2)) { 272 if (!i2o_parm_field_get(dev, 0x0000, 0, &id, 2)) {
275 sprintf(buf, "0x%04x", id); 273 sprintf(buf, "0x%04x", le16_to_cpu(id));
276 return strlen(buf) + 1; 274 return strlen(buf) + 1;
277 } 275 }
278 276
@@ -286,13 +284,15 @@ static ssize_t i2o_exec_show_vendor_id(struct device *d, struct device_attribute
286 * 284 *
287 * Returns number of bytes printed into buffer. 285 * Returns number of bytes printed into buffer.
288 */ 286 */
289static ssize_t i2o_exec_show_product_id(struct device *d, struct device_attribute *attr, char *buf) 287static ssize_t i2o_exec_show_product_id(struct device *d,
288 struct device_attribute *attr,
289 char *buf)
290{ 290{
291 struct i2o_device *dev = to_i2o_device(d); 291 struct i2o_device *dev = to_i2o_device(d);
292 u16 id; 292 u16 id;
293 293
294 if (i2o_parm_field_get(dev, 0x0000, 1, &id, 2)) { 294 if (!i2o_parm_field_get(dev, 0x0000, 1, &id, 2)) {
295 sprintf(buf, "0x%04x", id); 295 sprintf(buf, "0x%04x", le16_to_cpu(id));
296 return strlen(buf) + 1; 296 return strlen(buf) + 1;
297 } 297 }
298 298
@@ -362,7 +362,9 @@ static void i2o_exec_lct_modified(struct i2o_controller *c)
362 if (i2o_device_parse_lct(c) != -EAGAIN) 362 if (i2o_device_parse_lct(c) != -EAGAIN)
363 change_ind = c->lct->change_ind + 1; 363 change_ind = c->lct->change_ind + 1;
364 364
365#ifdef CONFIG_I2O_LCT_NOTIFY_ON_CHANGES
365 i2o_exec_lct_notify(c, change_ind); 366 i2o_exec_lct_notify(c, change_ind);
367#endif
366}; 368};
367 369
368/** 370/**
@@ -385,23 +387,22 @@ static int i2o_exec_reply(struct i2o_controller *c, u32 m,
385 u32 context; 387 u32 context;
386 388
387 if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) { 389 if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) {
390 struct i2o_message __iomem *pmsg;
391 u32 pm;
392
388 /* 393 /*
389 * If Fail bit is set we must take the transaction context of 394 * If Fail bit is set we must take the transaction context of
390 * the preserved message to find the right request again. 395 * the preserved message to find the right request again.
391 */ 396 */
392 struct i2o_message __iomem *pmsg;
393 u32 pm;
394 397
395 pm = le32_to_cpu(msg->body[3]); 398 pm = le32_to_cpu(msg->body[3]);
396
397 pmsg = i2o_msg_in_to_virt(c, pm); 399 pmsg = i2o_msg_in_to_virt(c, pm);
400 context = readl(&pmsg->u.s.tcntxt);
398 401
399 i2o_report_status(KERN_INFO, "i2o_core", msg); 402 i2o_report_status(KERN_INFO, "i2o_core", msg);
400 403
401 context = readl(&pmsg->u.s.tcntxt);
402
403 /* Release the preserved msg */ 404 /* Release the preserved msg */
404 i2o_msg_nop(c, pm); 405 i2o_msg_nop_mfa(c, pm);
405 } else 406 } else
406 context = le32_to_cpu(msg->u.s.tcntxt); 407 context = le32_to_cpu(msg->u.s.tcntxt);
407 408
@@ -462,25 +463,26 @@ static void i2o_exec_event(struct i2o_event *evt)
462 */ 463 */
463int i2o_exec_lct_get(struct i2o_controller *c) 464int i2o_exec_lct_get(struct i2o_controller *c)
464{ 465{
465 struct i2o_message __iomem *msg; 466 struct i2o_message *msg;
466 u32 m;
467 int i = 0; 467 int i = 0;
468 int rc = -EAGAIN; 468 int rc = -EAGAIN;
469 469
470 for (i = 1; i <= I2O_LCT_GET_TRIES; i++) { 470 for (i = 1; i <= I2O_LCT_GET_TRIES; i++) {
471 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 471 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
472 if (m == I2O_QUEUE_EMPTY) 472 if (IS_ERR(msg))
473 return -ETIMEDOUT; 473 return PTR_ERR(msg);
474 474
475 writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]); 475 msg->u.head[0] =
476 writel(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | ADAPTER_TID, 476 cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
477 &msg->u.head[1]); 477 msg->u.head[1] =
478 writel(0xffffffff, &msg->body[0]); 478 cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
479 writel(0x00000000, &msg->body[1]); 479 ADAPTER_TID);
480 writel(0xd0000000 | c->dlct.len, &msg->body[2]); 480 msg->body[0] = cpu_to_le32(0xffffffff);
481 writel(c->dlct.phys, &msg->body[3]); 481 msg->body[1] = cpu_to_le32(0x00000000);
482 482 msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
483 rc = i2o_msg_post_wait(c, m, I2O_TIMEOUT_LCT_GET); 483 msg->body[3] = cpu_to_le32(c->dlct.phys);
484
485 rc = i2o_msg_post_wait(c, msg, I2O_TIMEOUT_LCT_GET);
484 if (rc < 0) 486 if (rc < 0)
485 break; 487 break;
486 488
@@ -506,29 +508,29 @@ static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind)
506{ 508{
507 i2o_status_block *sb = c->status_block.virt; 509 i2o_status_block *sb = c->status_block.virt;
508 struct device *dev; 510 struct device *dev;
509 struct i2o_message __iomem *msg; 511 struct i2o_message *msg;
510 u32 m;
511 512
512 dev = &c->pdev->dev; 513 dev = &c->pdev->dev;
513 514
514 if (i2o_dma_realloc(dev, &c->dlct, sb->expected_lct_size, GFP_KERNEL)) 515 if (i2o_dma_realloc
516 (dev, &c->dlct, le32_to_cpu(sb->expected_lct_size), GFP_KERNEL))
515 return -ENOMEM; 517 return -ENOMEM;
516 518
517 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 519 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
518 if (m == I2O_QUEUE_EMPTY) 520 if (IS_ERR(msg))
519 return -ETIMEDOUT; 521 return PTR_ERR(msg);
520 522
521 writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]); 523 msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
522 writel(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 | ADAPTER_TID, 524 msg->u.head[1] = cpu_to_le32(I2O_CMD_LCT_NOTIFY << 24 | HOST_TID << 12 |
523 &msg->u.head[1]); 525 ADAPTER_TID);
524 writel(i2o_exec_driver.context, &msg->u.s.icntxt); 526 msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
525 writel(0, &msg->u.s.tcntxt); /* FIXME */ 527 msg->u.s.tcntxt = cpu_to_le32(0x00000000);
526 writel(0xffffffff, &msg->body[0]); 528 msg->body[0] = cpu_to_le32(0xffffffff);
527 writel(change_ind, &msg->body[1]); 529 msg->body[1] = cpu_to_le32(change_ind);
528 writel(0xd0000000 | c->dlct.len, &msg->body[2]); 530 msg->body[2] = cpu_to_le32(0xd0000000 | c->dlct.len);
529 writel(c->dlct.phys, &msg->body[3]); 531 msg->body[3] = cpu_to_le32(c->dlct.phys);
530 532
531 i2o_msg_post(c, m); 533 i2o_msg_post(c, msg);
532 534
533 return 0; 535 return 0;
534}; 536};
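The conversion pattern that dominates the remaining files: message frames no longer live in __iomem space, so writel() and memcpy_toio() become ordinary stores and memcpy(), with byte order made explicit through cpu_to_le32(). A before/after sketch using a hypothetical demo_msg stand-in for struct i2o_message:

    #include <linux/types.h>
    #include <asm/byteorder.h>

    struct demo_msg {
            __le32 head[4];                 /* little-endian on the wire */
            __le32 body[8];
    };

    static void demo_fill(struct demo_msg *msg, u32 size_flags, u32 cmd_tid)
    {
            /* old: writel(size_flags, &msg->head[0]);  - MMIO frame */
            /* new: plain store into cacheable memory:               */
            msg->head[0] = cpu_to_le32(size_flags);
            msg->head[1] = cpu_to_le32(cmd_tid);
    }

Keeping the frame in cacheable memory is also what lets plain memcpy() replace memcpy_toio() for operation lists and SCSI CDBs in the hunks below.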
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index f283b5bafdd3..5b1febed3133 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -59,10 +59,12 @@
59#include <linux/blkdev.h> 59#include <linux/blkdev.h>
60#include <linux/hdreg.h> 60#include <linux/hdreg.h>
61 61
62#include <scsi/scsi.h>
63
62#include "i2o_block.h" 64#include "i2o_block.h"
63 65
64#define OSM_NAME "block-osm" 66#define OSM_NAME "block-osm"
65#define OSM_VERSION "1.287" 67#define OSM_VERSION "1.325"
66#define OSM_DESCRIPTION "I2O Block Device OSM" 68#define OSM_DESCRIPTION "I2O Block Device OSM"
67 69
68static struct i2o_driver i2o_block_driver; 70static struct i2o_driver i2o_block_driver;
@@ -130,20 +132,20 @@ static int i2o_block_remove(struct device *dev)
130 */ 132 */
131static int i2o_block_device_flush(struct i2o_device *dev) 133static int i2o_block_device_flush(struct i2o_device *dev)
132{ 134{
133 struct i2o_message __iomem *msg; 135 struct i2o_message *msg;
134 u32 m;
135 136
136 m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET); 137 msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
137 if (m == I2O_QUEUE_EMPTY) 138 if (IS_ERR(msg))
138 return -ETIMEDOUT; 139 return PTR_ERR(msg);
139 140
140 writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 141 msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
141 writel(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->lct_data.tid, 142 msg->u.head[1] =
142 &msg->u.head[1]); 143 cpu_to_le32(I2O_CMD_BLOCK_CFLUSH << 24 | HOST_TID << 12 | dev->
143 writel(60 << 16, &msg->body[0]); 144 lct_data.tid);
145 msg->body[0] = cpu_to_le32(60 << 16);
144 osm_debug("Flushing...\n"); 146 osm_debug("Flushing...\n");
145 147
146 return i2o_msg_post_wait(dev->iop, m, 60); 148 return i2o_msg_post_wait(dev->iop, msg, 60);
147}; 149};
148 150
149/** 151/**
@@ -181,21 +183,21 @@ static int i2o_block_issue_flush(request_queue_t * queue, struct gendisk *disk,
181 */ 183 */
182static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id) 184static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
183{ 185{
184 struct i2o_message __iomem *msg; 186 struct i2o_message *msg;
185 u32 m; 187
186 188 msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
187 m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET); 189 if (IS_ERR(msg))
188 if (m == I2O_QUEUE_EMPTY) 190 return PTR_ERR(msg);
189 return -ETIMEDOUT; 191
190 192 msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
191 writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 193 msg->u.head[1] =
192 writel(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->lct_data.tid, 194 cpu_to_le32(I2O_CMD_BLOCK_MMOUNT << 24 | HOST_TID << 12 | dev->
193 &msg->u.head[1]); 195 lct_data.tid);
194 writel(-1, &msg->body[0]); 196 msg->body[0] = cpu_to_le32(-1);
195 writel(0, &msg->body[1]); 197 msg->body[1] = cpu_to_le32(0x00000000);
196 osm_debug("Mounting...\n"); 198 osm_debug("Mounting...\n");
197 199
198 return i2o_msg_post_wait(dev->iop, m, 2); 200 return i2o_msg_post_wait(dev->iop, msg, 2);
199}; 201};
200 202
201/** 203/**
@@ -210,20 +212,20 @@ static int i2o_block_device_mount(struct i2o_device *dev, u32 media_id)
210 */ 212 */
211static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id) 213static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
212{ 214{
213 struct i2o_message __iomem *msg; 215 struct i2o_message *msg;
214 u32 m;
215 216
216 m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET); 217 msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
217 if (m == I2O_QUEUE_EMPTY) 218 if (IS_ERR(msg))
218 return -ETIMEDOUT; 219 return PTR_ERR(msg);
219 220
220 writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 221 msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
221 writel(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->lct_data.tid, 222 msg->u.head[1] =
222 &msg->u.head[1]); 223 cpu_to_le32(I2O_CMD_BLOCK_MLOCK << 24 | HOST_TID << 12 | dev->
223 writel(-1, &msg->body[0]); 224 lct_data.tid);
225 msg->body[0] = cpu_to_le32(-1);
224 osm_debug("Locking...\n"); 226 osm_debug("Locking...\n");
225 227
226 return i2o_msg_post_wait(dev->iop, m, 2); 228 return i2o_msg_post_wait(dev->iop, msg, 2);
227}; 229};
228 230
229/** 231/**
@@ -238,20 +240,20 @@ static int i2o_block_device_lock(struct i2o_device *dev, u32 media_id)
238 */ 240 */
239static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id) 241static int i2o_block_device_unlock(struct i2o_device *dev, u32 media_id)
240{ 242{
241 struct i2o_message __iomem *msg; 243 struct i2o_message *msg;
242 u32 m;
243 244
244 m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET); 245 msg = i2o_msg_get_wait(dev->iop, I2O_TIMEOUT_MESSAGE_GET);
245 if (m == I2O_QUEUE_EMPTY) 246 if (IS_ERR(msg))
246 return -ETIMEDOUT; 247 return PTR_ERR(msg);
247 248
248 writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 249 msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
249 writel(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->lct_data.tid, 250 msg->u.head[1] =
250 &msg->u.head[1]); 251 cpu_to_le32(I2O_CMD_BLOCK_MUNLOCK << 24 | HOST_TID << 12 | dev->
251 writel(media_id, &msg->body[0]); 252 lct_data.tid);
253 msg->body[0] = cpu_to_le32(media_id);
252 osm_debug("Unlocking...\n"); 254 osm_debug("Unlocking...\n");
253 255
254 return i2o_msg_post_wait(dev->iop, m, 2); 256 return i2o_msg_post_wait(dev->iop, msg, 2);
255}; 257};
256 258
257/** 259/**
@@ -267,21 +269,21 @@ static int i2o_block_device_power(struct i2o_block_device *dev, u8 op)
267{ 269{
268 struct i2o_device *i2o_dev = dev->i2o_dev; 270 struct i2o_device *i2o_dev = dev->i2o_dev;
269 struct i2o_controller *c = i2o_dev->iop; 271 struct i2o_controller *c = i2o_dev->iop;
270 struct i2o_message __iomem *msg; 272 struct i2o_message *msg;
271 u32 m;
272 int rc; 273 int rc;
273 274
274 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 275 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
275 if (m == I2O_QUEUE_EMPTY) 276 if (IS_ERR(msg))
276 return -ETIMEDOUT; 277 return PTR_ERR(msg);
277 278
278 writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 279 msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
279 writel(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->lct_data. 280 msg->u.head[1] =
280 tid, &msg->u.head[1]); 281 cpu_to_le32(I2O_CMD_BLOCK_POWER << 24 | HOST_TID << 12 | i2o_dev->
281 writel(op << 24, &msg->body[0]); 282 lct_data.tid);
283 msg->body[0] = cpu_to_le32(op << 24);
282 osm_debug("Power...\n"); 284 osm_debug("Power...\n");
283 285
284 rc = i2o_msg_post_wait(c, m, 60); 286 rc = i2o_msg_post_wait(c, msg, 60);
285 if (!rc) 287 if (!rc)
286 dev->power = op; 288 dev->power = op;
287 289
@@ -331,7 +333,7 @@ static inline void i2o_block_request_free(struct i2o_block_request *ireq)
331 */ 333 */
332static inline int i2o_block_sglist_alloc(struct i2o_controller *c, 334static inline int i2o_block_sglist_alloc(struct i2o_controller *c,
333 struct i2o_block_request *ireq, 335 struct i2o_block_request *ireq,
334 u32 __iomem ** mptr) 336 u32 ** mptr)
335{ 337{
336 int nents; 338 int nents;
337 enum dma_data_direction direction; 339 enum dma_data_direction direction;
@@ -466,7 +468,7 @@ static void i2o_block_end_request(struct request *req, int uptodate,
466 468
467 spin_lock_irqsave(q->queue_lock, flags); 469 spin_lock_irqsave(q->queue_lock, flags);
468 470
469 end_that_request_last(req); 471 end_that_request_last(req, uptodate);
470 472
471 if (likely(dev)) { 473 if (likely(dev)) {
472 dev->open_queue_depth--; 474 dev->open_queue_depth--;
@@ -745,10 +747,9 @@ static int i2o_block_transfer(struct request *req)
745 struct i2o_block_device *dev = req->rq_disk->private_data; 747 struct i2o_block_device *dev = req->rq_disk->private_data;
746 struct i2o_controller *c; 748 struct i2o_controller *c;
747 int tid = dev->i2o_dev->lct_data.tid; 749 int tid = dev->i2o_dev->lct_data.tid;
748 struct i2o_message __iomem *msg; 750 struct i2o_message *msg;
749 u32 __iomem *mptr; 751 u32 *mptr;
750 struct i2o_block_request *ireq = req->special; 752 struct i2o_block_request *ireq = req->special;
751 u32 m;
752 u32 tcntxt; 753 u32 tcntxt;
753 u32 sgl_offset = SGL_OFFSET_8; 754 u32 sgl_offset = SGL_OFFSET_8;
754 u32 ctl_flags = 0x00000000; 755 u32 ctl_flags = 0x00000000;
@@ -763,9 +764,9 @@ static int i2o_block_transfer(struct request *req)
763 764
764 c = dev->i2o_dev->iop; 765 c = dev->i2o_dev->iop;
765 766
766 m = i2o_msg_get(c, &msg); 767 msg = i2o_msg_get(c);
767 if (m == I2O_QUEUE_EMPTY) { 768 if (IS_ERR(msg)) {
768 rc = -EBUSY; 769 rc = PTR_ERR(msg);
769 goto exit; 770 goto exit;
770 } 771 }
771 772
@@ -775,8 +776,8 @@ static int i2o_block_transfer(struct request *req)
775 goto nop_msg; 776 goto nop_msg;
776 } 777 }
777 778
778 writel(i2o_block_driver.context, &msg->u.s.icntxt); 779 msg->u.s.icntxt = cpu_to_le32(i2o_block_driver.context);
779 writel(tcntxt, &msg->u.s.tcntxt); 780 msg->u.s.tcntxt = cpu_to_le32(tcntxt);
780 781
781 mptr = &msg->body[0]; 782 mptr = &msg->body[0];
782 783
@@ -834,11 +835,11 @@ static int i2o_block_transfer(struct request *req)
834 835
835 sgl_offset = SGL_OFFSET_12; 836 sgl_offset = SGL_OFFSET_12;
836 837
837 writel(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid, 838 msg->u.head[1] =
838 &msg->u.head[1]); 839 cpu_to_le32(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid);
839 840
840 writel(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC, mptr++); 841 *mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
841 writel(tid, mptr++); 842 *mptr++ = cpu_to_le32(tid);
842 843
843 /* 844 /*
844 * ENABLE_DISCONNECT 845 * ENABLE_DISCONNECT
@@ -846,29 +847,31 @@ static int i2o_block_transfer(struct request *req)
846 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME 847 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
847 */ 848 */
848 if (rq_data_dir(req) == READ) { 849 if (rq_data_dir(req) == READ) {
849 cmd[0] = 0x28; 850 cmd[0] = READ_10;
850 scsi_flags = 0x60a0000a; 851 scsi_flags = 0x60a0000a;
851 } else { 852 } else {
852 cmd[0] = 0x2A; 853 cmd[0] = WRITE_10;
853 scsi_flags = 0xa0a0000a; 854 scsi_flags = 0xa0a0000a;
854 } 855 }
855 856
856 writel(scsi_flags, mptr++); 857 *mptr++ = cpu_to_le32(scsi_flags);
857 858
858 *((u32 *) & cmd[2]) = cpu_to_be32(req->sector * hwsec); 859 *((u32 *) & cmd[2]) = cpu_to_be32(req->sector * hwsec);
859 *((u16 *) & cmd[7]) = cpu_to_be16(req->nr_sectors * hwsec); 860 *((u16 *) & cmd[7]) = cpu_to_be16(req->nr_sectors * hwsec);
860 861
861 memcpy_toio(mptr, cmd, 10); 862 memcpy(mptr, cmd, 10);
862 mptr += 4; 863 mptr += 4;
863 writel(req->nr_sectors << KERNEL_SECTOR_SHIFT, mptr++); 864 *mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
864 } else 865 } else
865#endif 866#endif
866 { 867 {
867 writel(cmd | HOST_TID << 12 | tid, &msg->u.head[1]); 868 msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
868 writel(ctl_flags, mptr++); 869 *mptr++ = cpu_to_le32(ctl_flags);
869 writel(req->nr_sectors << KERNEL_SECTOR_SHIFT, mptr++); 870 *mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
870 writel((u32) (req->sector << KERNEL_SECTOR_SHIFT), mptr++); 871 *mptr++ =
871 writel(req->sector >> (32 - KERNEL_SECTOR_SHIFT), mptr++); 872 cpu_to_le32((u32) (req->sector << KERNEL_SECTOR_SHIFT));
873 *mptr++ =
874 cpu_to_le32(req->sector >> (32 - KERNEL_SECTOR_SHIFT));
872 } 875 }
873 876
874 if (!i2o_block_sglist_alloc(c, ireq, &mptr)) { 877 if (!i2o_block_sglist_alloc(c, ireq, &mptr)) {
@@ -876,13 +879,13 @@ static int i2o_block_transfer(struct request *req)
876 goto context_remove; 879 goto context_remove;
877 } 880 }
878 881
879 writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | 882 msg->u.head[0] =
880 sgl_offset, &msg->u.head[0]); 883 cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
881 884
882 list_add_tail(&ireq->queue, &dev->open_queue); 885 list_add_tail(&ireq->queue, &dev->open_queue);
883 dev->open_queue_depth++; 886 dev->open_queue_depth++;
884 887
885 i2o_msg_post(c, m); 888 i2o_msg_post(c, msg);
886 889
887 return 0; 890 return 0;
888 891
@@ -890,7 +893,7 @@ static int i2o_block_transfer(struct request *req)
890 i2o_cntxt_list_remove(c, req); 893 i2o_cntxt_list_remove(c, req);
891 894
892 nop_msg: 895 nop_msg:
893 i2o_msg_nop(c, m); 896 i2o_msg_nop(c, msg);
894 897
895 exit: 898 exit:
896 return rc; 899 return rc;
@@ -978,13 +981,12 @@ static struct i2o_block_device *i2o_block_device_alloc(void)
978 struct request_queue *queue; 981 struct request_queue *queue;
979 int rc; 982 int rc;
980 983
981 dev = kmalloc(sizeof(*dev), GFP_KERNEL); 984 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
982 if (!dev) { 985 if (!dev) {
983 osm_err("Insufficient memory to allocate I2O Block disk.\n"); 986 osm_err("Insufficient memory to allocate I2O Block disk.\n");
984 rc = -ENOMEM; 987 rc = -ENOMEM;
985 goto exit; 988 goto exit;
986 } 989 }
987 memset(dev, 0, sizeof(*dev));
988 990
989 INIT_LIST_HEAD(&dev->open_queue); 991 INIT_LIST_HEAD(&dev->open_queue);
990 spin_lock_init(&dev->lock); 992 spin_lock_init(&dev->lock);
@@ -1049,8 +1051,8 @@ static int i2o_block_probe(struct device *dev)
1049 int rc; 1051 int rc;
1050 u64 size; 1052 u64 size;
1051 u32 blocksize; 1053 u32 blocksize;
1052 u32 flags, status;
1053 u16 body_size = 4; 1054 u16 body_size = 4;
1055 u16 power;
1054 unsigned short max_sectors; 1056 unsigned short max_sectors;
1055 1057
1056#ifdef CONFIG_I2O_EXT_ADAPTEC 1058#ifdef CONFIG_I2O_EXT_ADAPTEC
@@ -1108,22 +1110,20 @@ static int i2o_block_probe(struct device *dev)
1108 * Ask for the current media data. If that isn't supported 1110 * Ask for the current media data. If that isn't supported
1109 * then we ask for the device capacity data 1111 * then we ask for the device capacity data
1110 */ 1112 */
1111 if (i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) || 1113 if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
1112 i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) { 1114 !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
1113 blk_queue_hardsect_size(queue, blocksize); 1115 blk_queue_hardsect_size(queue, le32_to_cpu(blocksize));
1114 } else 1116 } else
1115 osm_warn("unable to get blocksize of %s\n", gd->disk_name); 1117 osm_warn("unable to get blocksize of %s\n", gd->disk_name);
1116 1118
1117 if (i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) || 1119 if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) ||
1118 i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) { 1120 !i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) {
1119 set_capacity(gd, size >> KERNEL_SECTOR_SHIFT); 1121 set_capacity(gd, le64_to_cpu(size) >> KERNEL_SECTOR_SHIFT);
1120 } else 1122 } else
1121 osm_warn("could not get size of %s\n", gd->disk_name); 1123 osm_warn("could not get size of %s\n", gd->disk_name);
1122 1124
1123 if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &i2o_blk_dev->power, 2)) 1125 if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &power, 2))
1124 i2o_blk_dev->power = 0; 1126 i2o_blk_dev->power = power;
1125 i2o_parm_field_get(i2o_dev, 0x0000, 5, &flags, 4);
1126 i2o_parm_field_get(i2o_dev, 0x0000, 6, &status, 4);
1127 1127
1128 i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0xffffffff); 1128 i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0xffffffff);
1129 1129
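The probe hunks above apply the two conventions established earlier in this diff: a zero return from i2o_parm_field_get() now signals success, and every fetched parameter is byte-swapped before use. A condensed sketch of the capacity path under those assumptions (DEMO_SECTOR_SHIFT stands in for KERNEL_SECTOR_SHIFT from i2o_block.h):

    #include <linux/genhd.h>
    #include <linux/i2o.h>

    #define DEMO_SECTOR_SHIFT 9     /* 512-byte kernel sectors */

    static void demo_set_capacity(struct i2o_device *i2o_dev,
                                  struct gendisk *gd)
    {
            __le64 size;

            /* try current media data first, fall back to device capacity */
            if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) ||
                !i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8))
                    set_capacity(gd, le64_to_cpu(size) >> DEMO_SECTOR_SHIFT);
    }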
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 3c3a7abebb1b..89daf67b764d 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -36,12 +36,12 @@
36 36
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39#include "core.h"
40
41#define SG_TABLESIZE 30 39#define SG_TABLESIZE 30
42 40
43static int i2o_cfg_ioctl(struct inode *inode, struct file *fp, unsigned int cmd, 41extern int i2o_parm_issue(struct i2o_device *, int, void *, int, void *, int);
44 unsigned long arg); 42
43static int i2o_cfg_ioctl(struct inode *, struct file *, unsigned int,
44 unsigned long);
45 45
46static spinlock_t i2o_config_lock; 46static spinlock_t i2o_config_lock;
47 47
@@ -230,8 +230,7 @@ static int i2o_cfg_swdl(unsigned long arg)
230 struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; 230 struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
231 unsigned char maxfrag = 0, curfrag = 1; 231 unsigned char maxfrag = 0, curfrag = 1;
232 struct i2o_dma buffer; 232 struct i2o_dma buffer;
233 struct i2o_message __iomem *msg; 233 struct i2o_message *msg;
234 u32 m;
235 unsigned int status = 0, swlen = 0, fragsize = 8192; 234 unsigned int status = 0, swlen = 0, fragsize = 8192;
236 struct i2o_controller *c; 235 struct i2o_controller *c;
237 236
@@ -257,31 +256,34 @@ static int i2o_cfg_swdl(unsigned long arg)
257 if (!c) 256 if (!c)
258 return -ENXIO; 257 return -ENXIO;
259 258
260 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 259 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
261 if (m == I2O_QUEUE_EMPTY) 260 if (IS_ERR(msg))
262 return -EBUSY; 261 return PTR_ERR(msg);
263 262
264 if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize, GFP_KERNEL)) { 263 if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize, GFP_KERNEL)) {
265 i2o_msg_nop(c, m); 264 i2o_msg_nop(c, msg);
266 return -ENOMEM; 265 return -ENOMEM;
267 } 266 }
268 267
269 __copy_from_user(buffer.virt, kxfer.buf, fragsize); 268 __copy_from_user(buffer.virt, kxfer.buf, fragsize);
270 269
271 writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_7, &msg->u.head[0]); 270 msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
272 writel(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 | ADAPTER_TID, 271 msg->u.head[1] =
273 &msg->u.head[1]); 272 cpu_to_le32(I2O_CMD_SW_DOWNLOAD << 24 | HOST_TID << 12 |
274 writel(i2o_config_driver.context, &msg->u.head[2]); 273 ADAPTER_TID);
275 writel(0, &msg->u.head[3]); 274 msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
276 writel((((u32) kxfer.flags) << 24) | (((u32) kxfer.sw_type) << 16) | 275 msg->u.head[3] = cpu_to_le32(0);
277 (((u32) maxfrag) << 8) | (((u32) curfrag)), &msg->body[0]); 276 msg->body[0] =
278 writel(swlen, &msg->body[1]); 277 cpu_to_le32((((u32) kxfer.flags) << 24) | (((u32) kxfer.
279 writel(kxfer.sw_id, &msg->body[2]); 278 sw_type) << 16) |
280 writel(0xD0000000 | fragsize, &msg->body[3]); 279 (((u32) maxfrag) << 8) | (((u32) curfrag)));
281 writel(buffer.phys, &msg->body[4]); 280 msg->body[1] = cpu_to_le32(swlen);
281 msg->body[2] = cpu_to_le32(kxfer.sw_id);
282 msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
283 msg->body[4] = cpu_to_le32(buffer.phys);
282 284
283 osm_debug("swdl frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize); 285 osm_debug("swdl frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
284 status = i2o_msg_post_wait_mem(c, m, 60, &buffer); 286 status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
285 287
286 if (status != -ETIMEDOUT) 288 if (status != -ETIMEDOUT)
287 i2o_dma_free(&c->pdev->dev, &buffer); 289 i2o_dma_free(&c->pdev->dev, &buffer);
@@ -302,8 +304,7 @@ static int i2o_cfg_swul(unsigned long arg)
302 struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; 304 struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
303 unsigned char maxfrag = 0, curfrag = 1; 305 unsigned char maxfrag = 0, curfrag = 1;
304 struct i2o_dma buffer; 306 struct i2o_dma buffer;
305 struct i2o_message __iomem *msg; 307 struct i2o_message *msg;
306 u32 m;
307 unsigned int status = 0, swlen = 0, fragsize = 8192; 308 unsigned int status = 0, swlen = 0, fragsize = 8192;
308 struct i2o_controller *c; 309 struct i2o_controller *c;
309 int ret = 0; 310 int ret = 0;
@@ -330,30 +331,30 @@ static int i2o_cfg_swul(unsigned long arg)
330 if (!c) 331 if (!c)
331 return -ENXIO; 332 return -ENXIO;
332 333
333 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 334 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
334 if (m == I2O_QUEUE_EMPTY) 335 if (IS_ERR(msg))
335 return -EBUSY; 336 return PTR_ERR(msg);
336 337
337 if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize, GFP_KERNEL)) { 338 if (i2o_dma_alloc(&c->pdev->dev, &buffer, fragsize, GFP_KERNEL)) {
338 i2o_msg_nop(c, m); 339 i2o_msg_nop(c, msg);
339 return -ENOMEM; 340 return -ENOMEM;
340 } 341 }
341 342
342 writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_7, &msg->u.head[0]); 343 msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_7);
343 writel(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID, 344 msg->u.head[1] =
344 &msg->u.head[1]); 345 cpu_to_le32(I2O_CMD_SW_UPLOAD << 24 | HOST_TID << 12 | ADAPTER_TID);
345 writel(i2o_config_driver.context, &msg->u.head[2]); 346 msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
346 writel(0, &msg->u.head[3]); 347 msg->u.head[3] = cpu_to_le32(0);
347 writel((u32) kxfer.flags << 24 | (u32) kxfer. 348 msg->body[0] =
348 sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag, 349 cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.
349 &msg->body[0]); 350 sw_type << 16 | (u32) maxfrag << 8 | (u32) curfrag);
350 writel(swlen, &msg->body[1]); 351 msg->body[1] = cpu_to_le32(swlen);
351 writel(kxfer.sw_id, &msg->body[2]); 352 msg->body[2] = cpu_to_le32(kxfer.sw_id);
352 writel(0xD0000000 | fragsize, &msg->body[3]); 353 msg->body[3] = cpu_to_le32(0xD0000000 | fragsize);
353 writel(buffer.phys, &msg->body[4]); 354 msg->body[4] = cpu_to_le32(buffer.phys);
354 355
355 osm_debug("swul frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize); 356 osm_debug("swul frag %d/%d (size %d)\n", curfrag, maxfrag, fragsize);
356 status = i2o_msg_post_wait_mem(c, m, 60, &buffer); 357 status = i2o_msg_post_wait_mem(c, msg, 60, &buffer);
357 358
358 if (status != I2O_POST_WAIT_OK) { 359 if (status != I2O_POST_WAIT_OK) {
359 if (status != -ETIMEDOUT) 360 if (status != -ETIMEDOUT)
@@ -380,8 +381,7 @@ static int i2o_cfg_swdel(unsigned long arg)
380 struct i2o_controller *c; 381 struct i2o_controller *c;
381 struct i2o_sw_xfer kxfer; 382 struct i2o_sw_xfer kxfer;
382 struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg; 383 struct i2o_sw_xfer __user *pxfer = (struct i2o_sw_xfer __user *)arg;
383 struct i2o_message __iomem *msg; 384 struct i2o_message *msg;
384 u32 m;
385 unsigned int swlen; 385 unsigned int swlen;
386 int token; 386 int token;
387 387
@@ -395,21 +395,21 @@ static int i2o_cfg_swdel(unsigned long arg)
395 if (!c) 395 if (!c)
396 return -ENXIO; 396 return -ENXIO;
397 397
398 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 398 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
399 if (m == I2O_QUEUE_EMPTY) 399 if (IS_ERR(msg))
400 return -EBUSY; 400 return PTR_ERR(msg);
401 401
402 writel(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 402 msg->u.head[0] = cpu_to_le32(SEVEN_WORD_MSG_SIZE | SGL_OFFSET_0);
403 writel(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID, 403 msg->u.head[1] =
404 &msg->u.head[1]); 404 cpu_to_le32(I2O_CMD_SW_REMOVE << 24 | HOST_TID << 12 | ADAPTER_TID);
405 writel(i2o_config_driver.context, &msg->u.head[2]); 405 msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
406 writel(0, &msg->u.head[3]); 406 msg->u.head[3] = cpu_to_le32(0);
407 writel((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16, 407 msg->body[0] =
408 &msg->body[0]); 408 cpu_to_le32((u32) kxfer.flags << 24 | (u32) kxfer.sw_type << 16);
409 writel(swlen, &msg->body[1]); 409 msg->body[1] = cpu_to_le32(swlen);
410 writel(kxfer.sw_id, &msg->body[2]); 410 msg->body[2] = cpu_to_le32(kxfer.sw_id);
411 411
412 token = i2o_msg_post_wait(c, m, 10); 412 token = i2o_msg_post_wait(c, msg, 10);
413 413
414 if (token != I2O_POST_WAIT_OK) { 414 if (token != I2O_POST_WAIT_OK) {
415 osm_info("swdel failed, DetailedStatus = %d\n", token); 415 osm_info("swdel failed, DetailedStatus = %d\n", token);
@@ -423,25 +423,24 @@ static int i2o_cfg_validate(unsigned long arg)
423{ 423{
424 int token; 424 int token;
425 int iop = (int)arg; 425 int iop = (int)arg;
426 struct i2o_message __iomem *msg; 426 struct i2o_message *msg;
427 u32 m;
428 struct i2o_controller *c; 427 struct i2o_controller *c;
429 428
430 c = i2o_find_iop(iop); 429 c = i2o_find_iop(iop);
431 if (!c) 430 if (!c)
432 return -ENXIO; 431 return -ENXIO;
433 432
434 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 433 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
435 if (m == I2O_QUEUE_EMPTY) 434 if (IS_ERR(msg))
436 return -EBUSY; 435 return PTR_ERR(msg);
437 436
438 writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 437 msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
439 writel(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop, 438 msg->u.head[1] =
440 &msg->u.head[1]); 439 cpu_to_le32(I2O_CMD_CONFIG_VALIDATE << 24 | HOST_TID << 12 | iop);
441 writel(i2o_config_driver.context, &msg->u.head[2]); 440 msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
442 writel(0, &msg->u.head[3]); 441 msg->u.head[3] = cpu_to_le32(0);
443 442
444 token = i2o_msg_post_wait(c, m, 10); 443 token = i2o_msg_post_wait(c, msg, 10);
445 444
446 if (token != I2O_POST_WAIT_OK) { 445 if (token != I2O_POST_WAIT_OK) {
447 osm_info("Can't validate configuration, ErrorStatus = %d\n", 446 osm_info("Can't validate configuration, ErrorStatus = %d\n",
@@ -454,8 +453,7 @@ static int i2o_cfg_validate(unsigned long arg)
454 453
455static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp) 454static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp)
456{ 455{
457 struct i2o_message __iomem *msg; 456 struct i2o_message *msg;
458 u32 m;
459 struct i2o_evt_id __user *pdesc = (struct i2o_evt_id __user *)arg; 457 struct i2o_evt_id __user *pdesc = (struct i2o_evt_id __user *)arg;
460 struct i2o_evt_id kdesc; 458 struct i2o_evt_id kdesc;
461 struct i2o_controller *c; 459 struct i2o_controller *c;
@@ -474,18 +472,19 @@ static int i2o_cfg_evt_reg(unsigned long arg, struct file *fp)
474 if (!d) 472 if (!d)
475 return -ENODEV; 473 return -ENODEV;
476 474
477 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 475 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
478 if (m == I2O_QUEUE_EMPTY) 476 if (IS_ERR(msg))
479 return -EBUSY; 477 return PTR_ERR(msg);
480 478
481 writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 479 msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
482 writel(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | kdesc.tid, 480 msg->u.head[1] =
483 &msg->u.head[1]); 481 cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 |
484 writel(i2o_config_driver.context, &msg->u.head[2]); 482 kdesc.tid);
485 writel(i2o_cntxt_list_add(c, fp->private_data), &msg->u.head[3]); 483 msg->u.head[2] = cpu_to_le32(i2o_config_driver.context);
486 writel(kdesc.evt_mask, &msg->body[0]); 484 msg->u.head[3] = cpu_to_le32(i2o_cntxt_list_add(c, fp->private_data));
485 msg->body[0] = cpu_to_le32(kdesc.evt_mask);
487 486
488 i2o_msg_post(c, m); 487 i2o_msg_post(c, msg);
489 488
490 return 0; 489 return 0;
491} 490}
@@ -537,7 +536,6 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
537 u32 sg_index = 0; 536 u32 sg_index = 0;
538 i2o_status_block *sb; 537 i2o_status_block *sb;
539 struct i2o_message *msg; 538 struct i2o_message *msg;
540 u32 m;
541 unsigned int iop; 539 unsigned int iop;
542 540
543 cmd = (struct i2o_cmd_passthru32 __user *)arg; 541 cmd = (struct i2o_cmd_passthru32 __user *)arg;
@@ -553,7 +551,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
553 return -ENXIO; 551 return -ENXIO;
554 } 552 }
555 553
556 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 554 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
557 555
558 sb = c->status_block.virt; 556 sb = c->status_block.virt;
559 557
@@ -585,19 +583,15 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
585 reply_size >>= 16; 583 reply_size >>= 16;
586 reply_size <<= 2; 584 reply_size <<= 2;
587 585
588 reply = kmalloc(reply_size, GFP_KERNEL); 586 reply = kzalloc(reply_size, GFP_KERNEL);
589 if (!reply) { 587 if (!reply) {
590 printk(KERN_WARNING "%s: Could not allocate reply buffer\n", 588 printk(KERN_WARNING "%s: Could not allocate reply buffer\n",
591 c->name); 589 c->name);
592 return -ENOMEM; 590 return -ENOMEM;
593 } 591 }
594 memset(reply, 0, reply_size);
595 592
596 sg_offset = (msg->u.head[0] >> 4) & 0x0f; 593 sg_offset = (msg->u.head[0] >> 4) & 0x0f;
597 594
598 writel(i2o_config_driver.context, &msg->u.s.icntxt);
599 writel(i2o_cntxt_list_add(c, reply), &msg->u.s.tcntxt);
600
601 memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE); 595 memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
602 if (sg_offset) { 596 if (sg_offset) {
603 struct sg_simple_element *sg; 597 struct sg_simple_element *sg;
@@ -631,7 +625,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
631 goto cleanup; 625 goto cleanup;
632 } 626 }
633 sg_size = sg[i].flag_count & 0xffffff; 627 sg_size = sg[i].flag_count & 0xffffff;
634 p = &(sg_list[sg_index++]); 628 p = &(sg_list[sg_index]);
635 /* Allocate memory for the transfer */ 629 /* Allocate memory for the transfer */
636 if (i2o_dma_alloc 630 if (i2o_dma_alloc
637 (&c->pdev->dev, p, sg_size, 631 (&c->pdev->dev, p, sg_size,
@@ -642,6 +636,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
642 rcode = -ENOMEM; 636 rcode = -ENOMEM;
643 goto sg_list_cleanup; 637 goto sg_list_cleanup;
644 } 638 }
639 sg_index++;
645 /* Copy in the user's SG buffer if necessary */ 640 /* Copy in the user's SG buffer if necessary */
646 if (sg[i]. 641 if (sg[i].
647 flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) { 642 flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) {
@@ -662,9 +657,11 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
662 } 657 }
663 } 658 }
664 659
665 rcode = i2o_msg_post_wait(c, m, 60); 660 rcode = i2o_msg_post_wait(c, msg, 60);
666 if (rcode) 661 if (rcode) {
662 reply[4] = ((u32) rcode) << 24;
667 goto sg_list_cleanup; 663 goto sg_list_cleanup;
664 }
668 665
669 if (sg_offset) { 666 if (sg_offset) {
670 u32 msg[I2O_OUTBOUND_MSG_FRAME_SIZE]; 667 u32 msg[I2O_OUTBOUND_MSG_FRAME_SIZE];
@@ -714,6 +711,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
714 } 711 }
715 } 712 }
716 713
714 sg_list_cleanup:
717 /* Copy back the reply to user space */ 715 /* Copy back the reply to user space */
718 if (reply_size) { 716 if (reply_size) {
719 // we wrote our own values for context - now restore the user-supplied ones 717 // we wrote our own values for context - now restore the user-supplied ones
@@ -731,7 +729,6 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
731 } 729 }
732 } 730 }
733 731
734 sg_list_cleanup:
735 for (i = 0; i < sg_index; i++) 732 for (i = 0; i < sg_index; i++)
736 i2o_dma_free(&c->pdev->dev, &sg_list[i]); 733 i2o_dma_free(&c->pdev->dev, &sg_list[i]);
737 734
@@ -780,8 +777,7 @@ static int i2o_cfg_passthru(unsigned long arg)
780 u32 i = 0; 777 u32 i = 0;
781 void *p = NULL; 778 void *p = NULL;
782 i2o_status_block *sb; 779 i2o_status_block *sb;
783 struct i2o_message __iomem *msg; 780 struct i2o_message *msg;
784 u32 m;
785 unsigned int iop; 781 unsigned int iop;
786 782
787 if (get_user(iop, &cmd->iop) || get_user(user_msg, &cmd->msg)) 783 if (get_user(iop, &cmd->iop) || get_user(user_msg, &cmd->msg))
@@ -793,7 +789,7 @@ static int i2o_cfg_passthru(unsigned long arg)
793 return -ENXIO; 789 return -ENXIO;
794 } 790 }
795 791
796 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 792 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
797 793
798 sb = c->status_block.virt; 794 sb = c->status_block.virt;
799 795
@@ -820,19 +816,15 @@ static int i2o_cfg_passthru(unsigned long arg)
820 reply_size >>= 16; 816 reply_size >>= 16;
821 reply_size <<= 2; 817 reply_size <<= 2;
822 818
823 reply = kmalloc(reply_size, GFP_KERNEL); 819 reply = kzalloc(reply_size, GFP_KERNEL);
824 if (!reply) { 820 if (!reply) {
825 printk(KERN_WARNING "%s: Could not allocate reply buffer\n", 821 printk(KERN_WARNING "%s: Could not allocate reply buffer\n",
826 c->name); 822 c->name);
827 return -ENOMEM; 823 return -ENOMEM;
828 } 824 }
829 memset(reply, 0, reply_size);
830 825
831 sg_offset = (msg->u.head[0] >> 4) & 0x0f; 826 sg_offset = (msg->u.head[0] >> 4) & 0x0f;
832 827
833 writel(i2o_config_driver.context, &msg->u.s.icntxt);
834 writel(i2o_cntxt_list_add(c, reply), &msg->u.s.tcntxt);
835
836 memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE); 828 memset(sg_list, 0, sizeof(sg_list[0]) * SG_TABLESIZE);
837 if (sg_offset) { 829 if (sg_offset) {
838 struct sg_simple_element *sg; 830 struct sg_simple_element *sg;
@@ -894,9 +886,11 @@ static int i2o_cfg_passthru(unsigned long arg)
894 } 886 }
895 } 887 }
896 888
897 rcode = i2o_msg_post_wait(c, m, 60); 889 rcode = i2o_msg_post_wait(c, msg, 60);
898 if (rcode) 890 if (rcode) {
891 reply[4] = ((u32) rcode) << 24;
899 goto sg_list_cleanup; 892 goto sg_list_cleanup;
893 }
900 894
901 if (sg_offset) { 895 if (sg_offset) {
902 u32 msg[128]; 896 u32 msg[128];
@@ -946,6 +940,7 @@ static int i2o_cfg_passthru(unsigned long arg)
946 } 940 }
947 } 941 }
948 942
943 sg_list_cleanup:
949 /* Copy back the reply to user space */ 944 /* Copy back the reply to user space */
950 if (reply_size) { 945 if (reply_size) {
951 // we wrote our own values for context - now restore the user-supplied ones 946 // we wrote our own values for context - now restore the user-supplied ones
@@ -962,7 +957,6 @@ static int i2o_cfg_passthru(unsigned long arg)
962 } 957 }
963 } 958 }
964 959
965 sg_list_cleanup:
966 for (i = 0; i < sg_index; i++) 960 for (i = 0; i < sg_index; i++)
967 kfree(sg_list[i]); 961 kfree(sg_list[i]);
968 962
diff --git a/drivers/message/i2o/i2o_lan.h b/drivers/message/i2o/i2o_lan.h
index 561d63304d7e..6502b817df58 100644
--- a/drivers/message/i2o/i2o_lan.h
+++ b/drivers/message/i2o/i2o_lan.h
@@ -103,14 +103,14 @@
103#define I2O_LAN_DSC_SUSPENDED 0x11 103#define I2O_LAN_DSC_SUSPENDED 0x11
104 104
105struct i2o_packet_info { 105struct i2o_packet_info {
106 u32 offset : 24; 106 u32 offset:24;
107 u32 flags : 8; 107 u32 flags:8;
108 u32 len : 24; 108 u32 len:24;
109 u32 status : 8; 109 u32 status:8;
110}; 110};
111 111
112struct i2o_bucket_descriptor { 112struct i2o_bucket_descriptor {
113 u32 context; /* FIXME: 64bit support */ 113 u32 context; /* FIXME: 64bit support */
114 struct i2o_packet_info packet_info[1]; 114 struct i2o_packet_info packet_info[1];
115}; 115};
116 116
@@ -127,14 +127,14 @@ struct i2o_lan_local {
127 u8 unit; 127 u8 unit;
128 struct i2o_device *i2o_dev; 128 struct i2o_device *i2o_dev;
129 129
130 struct fddi_statistics stats; /* see also struct net_device_stats */ 130 struct fddi_statistics stats; /* see also struct net_device_stats */
131 unsigned short (*type_trans)(struct sk_buff *, struct net_device *); 131 unsigned short (*type_trans) (struct sk_buff *, struct net_device *);
132 atomic_t buckets_out; /* nbr of unused buckets on DDM */ 132 atomic_t buckets_out; /* nbr of unused buckets on DDM */
133 atomic_t tx_out; /* outstanding TXes */ 133 atomic_t tx_out; /* outstanding TXes */
134 u8 tx_count; /* packets in one TX message frame */ 134 u8 tx_count; /* packets in one TX message frame */
135 u16 tx_max_out; /* DDM's Tx queue len */ 135 u16 tx_max_out; /* DDM's Tx queue len */
136 u8 sgl_max; /* max SGLs in one message frame */ 136 u8 sgl_max; /* max SGLs in one message frame */
137 u32 m; /* IOP address of the batch msg frame */ 137 u32 m; /* IOP address of the batch msg frame */
138 138
139 struct work_struct i2o_batch_send_task; 139 struct work_struct i2o_batch_send_task;
140 int send_active; 140 int send_active;
@@ -144,16 +144,16 @@ struct i2o_lan_local {
144 144
145 spinlock_t tx_lock; 145 spinlock_t tx_lock;
146 146
147 u32 max_size_mc_table; /* max number of multicast addresses */ 147 u32 max_size_mc_table; /* max number of multicast addresses */
148 148
149 /* LAN OSM configurable parameters are here: */ 149 /* LAN OSM configurable parameters are here: */
150 150
151 u16 max_buckets_out; /* max nbr of buckets to send to DDM */ 151 u16 max_buckets_out; /* max nbr of buckets to send to DDM */
152 u16 bucket_thresh; /* send more when this many used */ 152 u16 bucket_thresh; /* send more when this many used */
153 u16 rx_copybreak; 153 u16 rx_copybreak;
154 154
155 u8 tx_batch_mode; /* Set when using batch mode sends */ 155 u8 tx_batch_mode; /* Set when using batch mode sends */
156 u32 i2o_event_mask; /* To turn on interesting event flags */ 156 u32 i2o_event_mask; /* To turn on interesting event flags */
157}; 157};
158 158
159#endif /* _I2O_LAN_H */ 159#endif /* _I2O_LAN_H */
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
index d559a1758363..2a0c42b8cda5 100644
--- a/drivers/message/i2o/i2o_proc.c
+++ b/drivers/message/i2o/i2o_proc.c
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#define OSM_NAME "proc-osm" 30#define OSM_NAME "proc-osm"
31#define OSM_VERSION "1.145" 31#define OSM_VERSION "1.316"
32#define OSM_DESCRIPTION "I2O ProcFS OSM" 32#define OSM_DESCRIPTION "I2O ProcFS OSM"
33 33
34#define I2O_MAX_MODULES 4 34#define I2O_MAX_MODULES 4
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 9f1744c3933b..f9e5a23697a1 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -70,7 +70,7 @@
70#include <scsi/sg_request.h> 70#include <scsi/sg_request.h>
71 71
72#define OSM_NAME "scsi-osm" 72#define OSM_NAME "scsi-osm"
73#define OSM_VERSION "1.282" 73#define OSM_VERSION "1.316"
74#define OSM_DESCRIPTION "I2O SCSI Peripheral OSM" 74#define OSM_DESCRIPTION "I2O SCSI Peripheral OSM"
75 75
76static struct i2o_driver i2o_scsi_driver; 76static struct i2o_driver i2o_scsi_driver;
@@ -113,7 +113,7 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
113 113
114 list_for_each_entry(i2o_dev, &c->devices, list) 114 list_for_each_entry(i2o_dev, &c->devices, list)
115 if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) { 115 if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
116 if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) 116 if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
117 && (type == 0x01)) /* SCSI bus */ 117 && (type == 0x01)) /* SCSI bus */
118 max_channel++; 118 max_channel++;
119 } 119 }
@@ -146,7 +146,7 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
146 i = 0; 146 i = 0;
147 list_for_each_entry(i2o_dev, &c->devices, list) 147 list_for_each_entry(i2o_dev, &c->devices, list)
148 if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) { 148 if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
149 if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) 149 if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
150 && (type == 0x01)) /* only SCSI bus */ 150 && (type == 0x01)) /* only SCSI bus */
151 i2o_shost->channel[i++] = i2o_dev; 151 i2o_shost->channel[i++] = i2o_dev;
152 152
@@ -238,13 +238,15 @@ static int i2o_scsi_probe(struct device *dev)
238 u8 type; 238 u8 type;
239 struct i2o_device *d = i2o_shost->channel[0]; 239 struct i2o_device *d = i2o_shost->channel[0];
240 240
241 if (i2o_parm_field_get(d, 0x0000, 0, &type, 1) 241 if (!i2o_parm_field_get(d, 0x0000, 0, &type, 1)
242 && (type == 0x01)) /* SCSI bus */ 242 && (type == 0x01)) /* SCSI bus */
243 if (i2o_parm_field_get(d, 0x0200, 4, &id, 4)) { 243 if (!i2o_parm_field_get(d, 0x0200, 4, &id, 4)) {
244 channel = 0; 244 channel = 0;
245 if (i2o_dev->lct_data.class_id == 245 if (i2o_dev->lct_data.class_id ==
246 I2O_CLASS_RANDOM_BLOCK_STORAGE) 246 I2O_CLASS_RANDOM_BLOCK_STORAGE)
247 lun = i2o_shost->lun++; 247 lun =
248 cpu_to_le64(i2o_shost->
249 lun++);
248 else 250 else
249 lun = 0; 251 lun = 0;
250 } 252 }
@@ -253,10 +255,10 @@ static int i2o_scsi_probe(struct device *dev)
253 break; 255 break;
254 256
255 case I2O_CLASS_SCSI_PERIPHERAL: 257 case I2O_CLASS_SCSI_PERIPHERAL:
256 if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4) < 0) 258 if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4))
257 return -EFAULT; 259 return -EFAULT;
258 260
259 if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8) < 0) 261 if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8))
260 return -EFAULT; 262 return -EFAULT;
261 263
262 parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid); 264 parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid);
@@ -281,20 +283,22 @@ static int i2o_scsi_probe(struct device *dev)
281 return -EFAULT; 283 return -EFAULT;
282 } 284 }
283 285
284 if (id >= scsi_host->max_id) { 286 if (le32_to_cpu(id) >= scsi_host->max_id) {
285 osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)", id, 287 osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)",
286 scsi_host->max_id); 288 le32_to_cpu(id), scsi_host->max_id);
287 return -EFAULT; 289 return -EFAULT;
288 } 290 }
289 291
290 if (lun >= scsi_host->max_lun) { 292 if (le64_to_cpu(lun) >= scsi_host->max_lun) {
291 osm_warn("SCSI device id (%d) >= max_lun of I2O host (%d)", 293 osm_warn("SCSI device lun (%lu) >= max_lun of I2O host (%d)",
292 (unsigned int)lun, scsi_host->max_lun); 294 (long unsigned int)le64_to_cpu(lun),
295 scsi_host->max_lun);
293 return -EFAULT; 296 return -EFAULT;
294 } 297 }
295 298
296 scsi_dev = 299 scsi_dev =
297 __scsi_add_device(i2o_shost->scsi_host, channel, id, lun, i2o_dev); 300 __scsi_add_device(i2o_shost->scsi_host, channel, le32_to_cpu(id),
301 le64_to_cpu(lun), i2o_dev);
298 302
299 if (IS_ERR(scsi_dev)) { 303 if (IS_ERR(scsi_dev)) {
300 osm_warn("can not add SCSI device %03x\n", 304 osm_warn("can not add SCSI device %03x\n",
@@ -305,8 +309,9 @@ static int i2o_scsi_probe(struct device *dev)
305 sysfs_create_link(&i2o_dev->device.kobj, &scsi_dev->sdev_gendev.kobj, 309 sysfs_create_link(&i2o_dev->device.kobj, &scsi_dev->sdev_gendev.kobj,
306 "scsi"); 310 "scsi");
307 311
308 osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %d\n", 312 osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %ld\n",
309 i2o_dev->lct_data.tid, channel, id, (unsigned int)lun); 313 i2o_dev->lct_data.tid, channel, le32_to_cpu(id),
314 (long unsigned int)le64_to_cpu(lun));
310 315
311 return 0; 316 return 0;
312}; 317};
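The probe path above also becomes endian-aware: values fetched from I2O parameter groups arrive little-endian, so id and lun pass through le32_to_cpu()/le64_to_cpu() before any comparison with the host-order limits in struct Scsi_Host. The pattern, as an illustrative fragment:

    u32 id;         /* holds a little-endian value from the IOP */
    u64 lun;        /* likewise */

    if (le32_to_cpu(id) >= scsi_host->max_id)
            return -EFAULT;         /* id out of range for this host */

    if (le64_to_cpu(lun) >= scsi_host->max_lun)
            return -EFAULT;         /* lun out of range for this host */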
@@ -510,8 +515,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
510 struct i2o_controller *c; 515 struct i2o_controller *c;
511 struct i2o_device *i2o_dev; 516 struct i2o_device *i2o_dev;
512 int tid; 517 int tid;
513 struct i2o_message __iomem *msg; 518 struct i2o_message *msg;
514 u32 m;
515 /* 519 /*
516 * ENABLE_DISCONNECT 520 * ENABLE_DISCONNECT
517 * SIMPLE_TAG 521 * SIMPLE_TAG
@@ -519,7 +523,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
519 */ 523 */
520 u32 scsi_flags = 0x20a00000; 524 u32 scsi_flags = 0x20a00000;
521 u32 sgl_offset; 525 u32 sgl_offset;
522 u32 __iomem *mptr; 526 u32 *mptr;
523 u32 cmd = I2O_CMD_SCSI_EXEC << 24; 527 u32 cmd = I2O_CMD_SCSI_EXEC << 24;
524 int rc = 0; 528 int rc = 0;
525 529
@@ -576,8 +580,8 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
576 * throw it back to the scsi layer 580 * throw it back to the scsi layer
577 */ 581 */
578 582
579 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 583 msg = i2o_msg_get(c);
580 if (m == I2O_QUEUE_EMPTY) { 584 if (IS_ERR(msg)) {
581 rc = SCSI_MLQUEUE_HOST_BUSY; 585 rc = SCSI_MLQUEUE_HOST_BUSY;
582 goto exit; 586 goto exit;
583 } 587 }
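The allocation API changes shape here: instead of a magic u32 handle compared against I2O_QUEUE_EMPTY, i2o_msg_get() now returns either a valid message pointer or an errno encoded in the pointer itself. The standard kernel idiom for consuming such a return value, sketched:

    #include <linux/err.h>

    struct i2o_message *msg = i2o_msg_get(c);

    if (IS_ERR(msg))                /* the pointer carries an errno */
            return PTR_ERR(msg);    /* decode and propagate it */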
@@ -617,16 +621,16 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
617 if (sgl_offset == SGL_OFFSET_10) 621 if (sgl_offset == SGL_OFFSET_10)
618 sgl_offset = SGL_OFFSET_12; 622 sgl_offset = SGL_OFFSET_12;
619 cmd = I2O_CMD_PRIVATE << 24; 623 cmd = I2O_CMD_PRIVATE << 24;
620 writel(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC, mptr++); 624 *mptr++ = cpu_to_le32(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC);
621 writel(adpt_flags | tid, mptr++); 625 *mptr++ = cpu_to_le32(adpt_flags | tid);
622 } 626 }
623#endif 627#endif
624 628
625 writel(cmd | HOST_TID << 12 | tid, &msg->u.head[1]); 629 msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
626 writel(i2o_scsi_driver.context, &msg->u.s.icntxt); 630 msg->u.s.icntxt = cpu_to_le32(i2o_scsi_driver.context);
627 631
628 /* We want the SCSI control block back */ 632 /* We want the SCSI control block back */
629 writel(i2o_cntxt_list_add(c, SCpnt), &msg->u.s.tcntxt); 633 msg->u.s.tcntxt = cpu_to_le32(i2o_cntxt_list_add(c, SCpnt));
630 634
631 /* LSI_920_PCI_QUIRK 635 /* LSI_920_PCI_QUIRK
632 * 636 *
@@ -649,15 +653,15 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
649 } 653 }
650 */ 654 */
651 655
652 writel(scsi_flags | SCpnt->cmd_len, mptr++); 656 *mptr++ = cpu_to_le32(scsi_flags | SCpnt->cmd_len);
653 657
654 /* Write SCSI command into the message - always 16 byte block */ 658 /* Write SCSI command into the message - always 16 byte block */
655 memcpy_toio(mptr, SCpnt->cmnd, 16); 659 memcpy(mptr, SCpnt->cmnd, 16);
656 mptr += 4; 660 mptr += 4;
657 661
658 if (sgl_offset != SGL_OFFSET_0) { 662 if (sgl_offset != SGL_OFFSET_0) {
659 /* write size of data addressed by SGL */ 663 /* write size of data addressed by SGL */
660 writel(SCpnt->request_bufflen, mptr++); 664 *mptr++ = cpu_to_le32(SCpnt->request_bufflen);
661 665
662 /* Now fill in the SGList and command */ 666 /* Now fill in the SGList and command */
663 if (SCpnt->use_sg) { 667 if (SCpnt->use_sg) {
@@ -676,11 +680,11 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
676 } 680 }
677 681
678 /* Stick the headers on */ 682 /* Stick the headers on */
679 writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset, 683 msg->u.head[0] =
680 &msg->u.head[0]); 684 cpu_to_le32(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset);
681 685
682 /* Queue the message */ 686 /* Queue the message */
683 i2o_msg_post(c, m); 687 i2o_msg_post(c, msg);
684 688
685 osm_debug("Issued %ld\n", SCpnt->serial_number); 689 osm_debug("Issued %ld\n", SCpnt->serial_number);
686 690
@@ -688,7 +692,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
688 692
689 nomem: 693 nomem:
690 rc = -ENOMEM; 694 rc = -ENOMEM;
691 i2o_msg_nop(c, m); 695 i2o_msg_nop(c, msg);
692 696
693 exit: 697 exit:
694 return rc; 698 return rc;
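Throughout this function the writel() calls into __iomem message frames become plain stores of cpu_to_le32() values, and memcpy_toio() becomes memcpy(), because the frames now live in ordinary DMA-able memory rather than MMIO space. A hypothetical helper shows the recurring pattern (msg_push() is not part of the patch, purely an illustration):

    /* Store one host-order word into the little-endian message frame
     * and advance the fill pointer. */
    static inline u32 *msg_push(u32 *mptr, u32 val)
    {
            *mptr++ = cpu_to_le32(val);
            return mptr;
    }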
@@ -709,8 +713,7 @@ static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
709{ 713{
710 struct i2o_device *i2o_dev; 714 struct i2o_device *i2o_dev;
711 struct i2o_controller *c; 715 struct i2o_controller *c;
712 struct i2o_message __iomem *msg; 716 struct i2o_message *msg;
713 u32 m;
714 int tid; 717 int tid;
715 int status = FAILED; 718 int status = FAILED;
716 719
@@ -720,16 +723,16 @@ static int i2o_scsi_abort(struct scsi_cmnd *SCpnt)
720 c = i2o_dev->iop; 723 c = i2o_dev->iop;
721 tid = i2o_dev->lct_data.tid; 724 tid = i2o_dev->lct_data.tid;
722 725
723 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 726 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
724 if (m == I2O_QUEUE_EMPTY) 727 if (IS_ERR(msg))
725 return SCSI_MLQUEUE_HOST_BUSY; 728 return SCSI_MLQUEUE_HOST_BUSY;
726 729
727 writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 730 msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
728 writel(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid, 731 msg->u.head[1] =
729 &msg->u.head[1]); 732 cpu_to_le32(I2O_CMD_SCSI_ABORT << 24 | HOST_TID << 12 | tid);
730 writel(i2o_cntxt_list_get_ptr(c, SCpnt), &msg->body[0]); 733 msg->body[0] = cpu_to_le32(i2o_cntxt_list_get_ptr(c, SCpnt));
731 734
732 if (i2o_msg_post_wait(c, m, I2O_TIMEOUT_SCSI_SCB_ABORT)) 735 if (i2o_msg_post_wait(c, msg, I2O_TIMEOUT_SCSI_SCB_ABORT))
733 status = SUCCESS; 736 status = SUCCESS;
734 737
735 return status; 738 return status;
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 4eb53258842e..492167446936 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -32,7 +32,7 @@
32#include "core.h" 32#include "core.h"
33 33
34#define OSM_NAME "i2o" 34#define OSM_NAME "i2o"
35#define OSM_VERSION "1.288" 35#define OSM_VERSION "1.325"
36#define OSM_DESCRIPTION "I2O subsystem" 36#define OSM_DESCRIPTION "I2O subsystem"
37 37
38/* global I2O controller list */ 38/* global I2O controller list */
@@ -47,27 +47,6 @@ static struct i2o_dma i2o_systab;
47static int i2o_hrt_get(struct i2o_controller *c); 47static int i2o_hrt_get(struct i2o_controller *c);
48 48
49/** 49/**
50 * i2o_msg_nop - Returns a message which is not used
51 * @c: I2O controller from which the message was created
52 * @m: message which should be returned
53 *
54 * If you fetch a message via i2o_msg_get, and can't use it, you must
55 * return the message with this function. Otherwise the message frame
56 * is lost.
57 */
58void i2o_msg_nop(struct i2o_controller *c, u32 m)
59{
60 struct i2o_message __iomem *msg = i2o_msg_in_to_virt(c, m);
61
62 writel(THREE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
63 writel(I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
64 &msg->u.head[1]);
65 writel(0, &msg->u.head[2]);
66 writel(0, &msg->u.head[3]);
67 i2o_msg_post(c, m);
68};
69
70/**
71 * i2o_msg_get_wait - obtain an I2O message from the IOP 50 * i2o_msg_get_wait - obtain an I2O message from the IOP
72 * @c: I2O controller 51 * @c: I2O controller
73 * @msg: pointer to a I2O message pointer 52 * @msg: pointer to a I2O message pointer
@@ -81,22 +60,21 @@ void i2o_msg_nop(struct i2o_controller *c, u32 m)
81 * address from the read port (see the i2o spec). If no message is 60 * address from the read port (see the i2o spec). If no message is
82 * available returns I2O_QUEUE_EMPTY and msg is left untouched. 61 * available returns I2O_QUEUE_EMPTY and msg is left untouched.
83 */ 62 */
84u32 i2o_msg_get_wait(struct i2o_controller *c, 63struct i2o_message *i2o_msg_get_wait(struct i2o_controller *c, int wait)
85 struct i2o_message __iomem ** msg, int wait)
86{ 64{
87 unsigned long timeout = jiffies + wait * HZ; 65 unsigned long timeout = jiffies + wait * HZ;
88 u32 m; 66 struct i2o_message *msg;
89 67
90 while ((m = i2o_msg_get(c, msg)) == I2O_QUEUE_EMPTY) { 68 while (IS_ERR(msg = i2o_msg_get(c))) {
91 if (time_after(jiffies, timeout)) { 69 if (time_after(jiffies, timeout)) {
92 osm_debug("%s: Timeout waiting for message frame.\n", 70 osm_debug("%s: Timeout waiting for message frame.\n",
93 c->name); 71 c->name);
94 return I2O_QUEUE_EMPTY; 72 return ERR_PTR(-ETIMEDOUT);
95 } 73 }
96 schedule_timeout_uninterruptible(1); 74 schedule_timeout_uninterruptible(1);
97 } 75 }
98 76
99 return m; 77 return msg;
100}; 78};
101 79
102#if BITS_PER_LONG == 64 80#if BITS_PER_LONG == 64
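The out-of-line i2o_msg_nop() deleted above still has callers (see the nomem path in i2o_scsi_queuecommand()), so the series presumably re-adds it as an inline taking the message pointer. A sketch of what that inline would look like, derived from the deleted body and the new store-based frame access, not the literal replacement:

    static inline void i2o_msg_nop(struct i2o_controller *c,
                                   struct i2o_message *msg)
    {
            msg->u.head[0] = cpu_to_le32(THREE_WORD_MSG_SIZE | SGL_OFFSET_0);
            msg->u.head[1] = cpu_to_le32(I2O_CMD_UTIL_NOP << 24 |
                                         HOST_TID << 12 | ADAPTER_TID);
            msg->u.head[2] = cpu_to_le32(0);
            msg->u.head[3] = cpu_to_le32(0);
            i2o_msg_post(c, msg);
    }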
@@ -301,8 +279,7 @@ struct i2o_device *i2o_iop_find_device(struct i2o_controller *c, u16 tid)
301 */ 279 */
302static int i2o_iop_quiesce(struct i2o_controller *c) 280static int i2o_iop_quiesce(struct i2o_controller *c)
303{ 281{
304 struct i2o_message __iomem *msg; 282 struct i2o_message *msg;
305 u32 m;
306 i2o_status_block *sb = c->status_block.virt; 283 i2o_status_block *sb = c->status_block.virt;
307 int rc; 284 int rc;
308 285
@@ -313,16 +290,17 @@ static int i2o_iop_quiesce(struct i2o_controller *c)
313 (sb->iop_state != ADAPTER_STATE_OPERATIONAL)) 290 (sb->iop_state != ADAPTER_STATE_OPERATIONAL))
314 return 0; 291 return 0;
315 292
316 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 293 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
317 if (m == I2O_QUEUE_EMPTY) 294 if (IS_ERR(msg))
318 return -ETIMEDOUT; 295 return PTR_ERR(msg);
319 296
320 writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 297 msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
321 writel(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 | ADAPTER_TID, 298 msg->u.head[1] =
322 &msg->u.head[1]); 299 cpu_to_le32(I2O_CMD_SYS_QUIESCE << 24 | HOST_TID << 12 |
300 ADAPTER_TID);
323 301
324 /* Long timeout needed for quiesce if lots of devices */ 302 /* Long timeout needed for quiesce if lots of devices */
325 if ((rc = i2o_msg_post_wait(c, m, 240))) 303 if ((rc = i2o_msg_post_wait(c, msg, 240)))
326 osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc); 304 osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc);
327 else 305 else
328 osm_debug("%s: Quiesced.\n", c->name); 306 osm_debug("%s: Quiesced.\n", c->name);
@@ -342,8 +320,7 @@ static int i2o_iop_quiesce(struct i2o_controller *c)
342 */ 320 */
343static int i2o_iop_enable(struct i2o_controller *c) 321static int i2o_iop_enable(struct i2o_controller *c)
344{ 322{
345 struct i2o_message __iomem *msg; 323 struct i2o_message *msg;
346 u32 m;
347 i2o_status_block *sb = c->status_block.virt; 324 i2o_status_block *sb = c->status_block.virt;
348 int rc; 325 int rc;
349 326
@@ -353,16 +330,17 @@ static int i2o_iop_enable(struct i2o_controller *c)
353 if (sb->iop_state != ADAPTER_STATE_READY) 330 if (sb->iop_state != ADAPTER_STATE_READY)
354 return -EINVAL; 331 return -EINVAL;
355 332
356 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 333 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
357 if (m == I2O_QUEUE_EMPTY) 334 if (IS_ERR(msg))
358 return -ETIMEDOUT; 335 return PTR_ERR(msg);
359 336
360 writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 337 msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
361 writel(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 | ADAPTER_TID, 338 msg->u.head[1] =
362 &msg->u.head[1]); 339 cpu_to_le32(I2O_CMD_SYS_ENABLE << 24 | HOST_TID << 12 |
340 ADAPTER_TID);
363 341
364 /* How long of a timeout do we need? */ 342 /* How long of a timeout do we need? */
365 if ((rc = i2o_msg_post_wait(c, m, 240))) 343 if ((rc = i2o_msg_post_wait(c, msg, 240)))
366 osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc); 344 osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc);
367 else 345 else
368 osm_debug("%s: Enabled.\n", c->name); 346 osm_debug("%s: Enabled.\n", c->name);
@@ -413,22 +391,22 @@ static inline void i2o_iop_enable_all(void)
413 */ 391 */
414static int i2o_iop_clear(struct i2o_controller *c) 392static int i2o_iop_clear(struct i2o_controller *c)
415{ 393{
416 struct i2o_message __iomem *msg; 394 struct i2o_message *msg;
417 u32 m;
418 int rc; 395 int rc;
419 396
420 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 397 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
421 if (m == I2O_QUEUE_EMPTY) 398 if (IS_ERR(msg))
422 return -ETIMEDOUT; 399 return PTR_ERR(msg);
423 400
424 /* Quiesce all IOPs first */ 401 /* Quiesce all IOPs first */
425 i2o_iop_quiesce_all(); 402 i2o_iop_quiesce_all();
426 403
427 writel(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 404 msg->u.head[0] = cpu_to_le32(FOUR_WORD_MSG_SIZE | SGL_OFFSET_0);
428 writel(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 | ADAPTER_TID, 405 msg->u.head[1] =
429 &msg->u.head[1]); 406 cpu_to_le32(I2O_CMD_ADAPTER_CLEAR << 24 | HOST_TID << 12 |
407 ADAPTER_TID);
430 408
431 if ((rc = i2o_msg_post_wait(c, m, 30))) 409 if ((rc = i2o_msg_post_wait(c, msg, 30)))
432 osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc); 410 osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc);
433 else 411 else
434 osm_debug("%s: Cleared.\n", c->name); 412 osm_debug("%s: Cleared.\n", c->name);
@@ -446,13 +424,13 @@ static int i2o_iop_clear(struct i2o_controller *c)
446 * Clear and (re)initialize IOP's outbound queue and post the message 424 * Clear and (re)initialize IOP's outbound queue and post the message
447 * frames to the IOP. 425 * frames to the IOP.
448 * 426 *
449 * Returns 0 on success or a negative errno code on failure. 427 * Returns 0 on success or negative error code on failure.
450 */ 428 */
451static int i2o_iop_init_outbound_queue(struct i2o_controller *c) 429static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
452{ 430{
453 volatile u8 *status = c->status.virt;
454 u32 m; 431 u32 m;
455 struct i2o_message __iomem *msg; 432 volatile u8 *status = c->status.virt;
433 struct i2o_message *msg;
456 ulong timeout; 434 ulong timeout;
457 int i; 435 int i;
458 436
@@ -460,23 +438,24 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
460 438
461 memset(c->status.virt, 0, 4); 439 memset(c->status.virt, 0, 4);
462 440
463 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 441 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
464 if (m == I2O_QUEUE_EMPTY) 442 if (IS_ERR(msg))
465 return -ETIMEDOUT; 443 return PTR_ERR(msg);
466 444
467 writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]); 445 msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6);
468 writel(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | ADAPTER_TID, 446 msg->u.head[1] =
469 &msg->u.head[1]); 447 cpu_to_le32(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 |
470 writel(i2o_exec_driver.context, &msg->u.s.icntxt); 448 ADAPTER_TID);
471 writel(0x00000000, &msg->u.s.tcntxt); 449 msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
472 writel(PAGE_SIZE, &msg->body[0]); 450 msg->u.s.tcntxt = cpu_to_le32(0x00000000);
451 msg->body[0] = cpu_to_le32(PAGE_SIZE);
473 /* Outbound msg frame size in words and Initcode */ 452 /* Outbound msg frame size in words and Initcode */
474 writel(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80, &msg->body[1]); 453 msg->body[1] = cpu_to_le32(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80);
475 writel(0xd0000004, &msg->body[2]); 454 msg->body[2] = cpu_to_le32(0xd0000004);
476 writel(i2o_dma_low(c->status.phys), &msg->body[3]); 455 msg->body[3] = cpu_to_le32(i2o_dma_low(c->status.phys));
477 writel(i2o_dma_high(c->status.phys), &msg->body[4]); 456 msg->body[4] = cpu_to_le32(i2o_dma_high(c->status.phys));
478 457
479 i2o_msg_post(c, m); 458 i2o_msg_post(c, msg);
480 459
481 timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ; 460 timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ;
482 while (*status <= I2O_CMD_IN_PROGRESS) { 461 while (*status <= I2O_CMD_IN_PROGRESS) {
@@ -511,34 +490,34 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
511static int i2o_iop_reset(struct i2o_controller *c) 490static int i2o_iop_reset(struct i2o_controller *c)
512{ 491{
513 volatile u8 *status = c->status.virt; 492 volatile u8 *status = c->status.virt;
514 struct i2o_message __iomem *msg; 493 struct i2o_message *msg;
515 u32 m;
516 unsigned long timeout; 494 unsigned long timeout;
517 i2o_status_block *sb = c->status_block.virt; 495 i2o_status_block *sb = c->status_block.virt;
518 int rc = 0; 496 int rc = 0;
519 497
520 osm_debug("%s: Resetting controller\n", c->name); 498 osm_debug("%s: Resetting controller\n", c->name);
521 499
522 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 500 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
523 if (m == I2O_QUEUE_EMPTY) 501 if (IS_ERR(msg))
524 return -ETIMEDOUT; 502 return PTR_ERR(msg);
525 503
526 memset(c->status_block.virt, 0, 8); 504 memset(c->status_block.virt, 0, 8);
527 505
528 /* Quiesce all IOPs first */ 506 /* Quiesce all IOPs first */
529 i2o_iop_quiesce_all(); 507 i2o_iop_quiesce_all();
530 508
531 writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 509 msg->u.head[0] = cpu_to_le32(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_0);
532 writel(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 | ADAPTER_TID, 510 msg->u.head[1] =
533 &msg->u.head[1]); 511 cpu_to_le32(I2O_CMD_ADAPTER_RESET << 24 | HOST_TID << 12 |
534 writel(i2o_exec_driver.context, &msg->u.s.icntxt); 512 ADAPTER_TID);
535 writel(0, &msg->u.s.tcntxt); //FIXME: use reasonable transaction context 513 msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
536 writel(0, &msg->body[0]); 514 msg->u.s.tcntxt = cpu_to_le32(0x00000000);
537 writel(0, &msg->body[1]); 515 msg->body[0] = cpu_to_le32(0x00000000);
538 writel(i2o_dma_low(c->status.phys), &msg->body[2]); 516 msg->body[1] = cpu_to_le32(0x00000000);
539 writel(i2o_dma_high(c->status.phys), &msg->body[3]); 517 msg->body[2] = cpu_to_le32(i2o_dma_low(c->status.phys));
518 msg->body[3] = cpu_to_le32(i2o_dma_high(c->status.phys));
540 519
541 i2o_msg_post(c, m); 520 i2o_msg_post(c, msg);
542 521
543 /* Wait for a reply */ 522 /* Wait for a reply */
544 timeout = jiffies + I2O_TIMEOUT_RESET * HZ; 523 timeout = jiffies + I2O_TIMEOUT_RESET * HZ;
@@ -567,18 +546,15 @@ static int i2o_iop_reset(struct i2o_controller *c)
567 osm_debug("%s: Reset in progress, waiting for reboot...\n", 546 osm_debug("%s: Reset in progress, waiting for reboot...\n",
568 c->name); 547 c->name);
569 548
570 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_RESET); 549 while (IS_ERR(msg = i2o_msg_get_wait(c, I2O_TIMEOUT_RESET))) {
571 while (m == I2O_QUEUE_EMPTY) {
572 if (time_after(jiffies, timeout)) { 550 if (time_after(jiffies, timeout)) {
573 osm_err("%s: IOP reset timeout.\n", c->name); 551 osm_err("%s: IOP reset timeout.\n", c->name);
574 rc = -ETIMEDOUT; 552 rc = PTR_ERR(msg);
575 goto exit; 553 goto exit;
576 } 554 }
577 schedule_timeout_uninterruptible(1); 555 schedule_timeout_uninterruptible(1);
578
579 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_RESET);
580 } 556 }
581 i2o_msg_nop(c, m); 557 i2o_msg_nop(c, msg);
582 558
583 /* from here all quiesce commands are safe */ 559 /* from here all quiesce commands are safe */
584 c->no_quiesce = 0; 560 c->no_quiesce = 0;
@@ -686,8 +662,7 @@ static int i2o_iop_activate(struct i2o_controller *c)
686 */ 662 */
687static int i2o_iop_systab_set(struct i2o_controller *c) 663static int i2o_iop_systab_set(struct i2o_controller *c)
688{ 664{
689 struct i2o_message __iomem *msg; 665 struct i2o_message *msg;
690 u32 m;
691 i2o_status_block *sb = c->status_block.virt; 666 i2o_status_block *sb = c->status_block.virt;
692 struct device *dev = &c->pdev->dev; 667 struct device *dev = &c->pdev->dev;
693 struct resource *root; 668 struct resource *root;
@@ -735,41 +710,38 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
735 } 710 }
736 } 711 }
737 712
738 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 713 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
739 if (m == I2O_QUEUE_EMPTY) 714 if (IS_ERR(msg))
740 return -ETIMEDOUT; 715 return PTR_ERR(msg);
741 716
742 i2o_systab.phys = dma_map_single(dev, i2o_systab.virt, i2o_systab.len, 717 i2o_systab.phys = dma_map_single(dev, i2o_systab.virt, i2o_systab.len,
743 PCI_DMA_TODEVICE); 718 PCI_DMA_TODEVICE);
744 if (!i2o_systab.phys) { 719 if (!i2o_systab.phys) {
745 i2o_msg_nop(c, m); 720 i2o_msg_nop(c, msg);
746 return -ENOMEM; 721 return -ENOMEM;
747 } 722 }
748 723
749 writel(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6, &msg->u.head[0]); 724 msg->u.head[0] = cpu_to_le32(I2O_MESSAGE_SIZE(12) | SGL_OFFSET_6);
750 writel(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 | ADAPTER_TID, 725 msg->u.head[1] =
751 &msg->u.head[1]); 726 cpu_to_le32(I2O_CMD_SYS_TAB_SET << 24 | HOST_TID << 12 |
727 ADAPTER_TID);
752 728
753 /* 729 /*
754 * Provide three SGL-elements: 730 * Provide three SGL-elements:
755 * System table (SysTab), Private memory space declaration and 731 * System table (SysTab), Private memory space declaration and
756 * Private i/o space declaration 732 * Private i/o space declaration
757 *
758 * FIXME: is this still true?
759 * Nasty one here. We can't use dma_alloc_coherent to send the
760 * same table to everyone. We have to go remap it for them all
761 */ 733 */
762 734
763 writel(c->unit + 2, &msg->body[0]); 735 msg->body[0] = cpu_to_le32(c->unit + 2);
764 writel(0, &msg->body[1]); 736 msg->body[1] = cpu_to_le32(0x00000000);
765 writel(0x54000000 | i2o_systab.len, &msg->body[2]); 737 msg->body[2] = cpu_to_le32(0x54000000 | i2o_systab.len);
766 writel(i2o_systab.phys, &msg->body[3]); 738 msg->body[3] = cpu_to_le32(i2o_systab.phys);
767 writel(0x54000000 | sb->current_mem_size, &msg->body[4]); 739 msg->body[4] = cpu_to_le32(0x54000000 | sb->current_mem_size);
768 writel(sb->current_mem_base, &msg->body[5]); 740 msg->body[5] = cpu_to_le32(sb->current_mem_base);
769 writel(0xd4000000 | sb->current_io_size, &msg->body[6]); 741 msg->body[6] = cpu_to_le32(0xd4000000 | sb->current_io_size);
770 writel(sb->current_io_base, &msg->body[6]); 742 msg->body[6] = cpu_to_le32(sb->current_io_base);
771 743
772 rc = i2o_msg_post_wait(c, m, 120); 744 rc = i2o_msg_post_wait(c, msg, 120);
773 745
774 dma_unmap_single(dev, i2o_systab.phys, i2o_systab.len, 746 dma_unmap_single(dev, i2o_systab.phys, i2o_systab.len,
775 PCI_DMA_TODEVICE); 747 PCI_DMA_TODEVICE);
@@ -780,8 +752,6 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
780 else 752 else
781 osm_debug("%s: SysTab set.\n", c->name); 753 osm_debug("%s: SysTab set.\n", c->name);
782 754
783 i2o_status_get(c); // Entered READY state
784
785 return rc; 755 return rc;
786} 756}
787 757
@@ -791,7 +761,7 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
791 * 761 *
792 * Send the system table and enable the I2O controller. 762 * Send the system table and enable the I2O controller.
793 * 763 *
794 * Returns 0 on success or negativer error code on failure. 764 * Returns 0 on success or negative error code on failure.
795 */ 765 */
796static int i2o_iop_online(struct i2o_controller *c) 766static int i2o_iop_online(struct i2o_controller *c)
797{ 767{
@@ -830,7 +800,6 @@ void i2o_iop_remove(struct i2o_controller *c)
830 list_for_each_entry_safe(dev, tmp, &c->devices, list) 800 list_for_each_entry_safe(dev, tmp, &c->devices, list)
831 i2o_device_remove(dev); 801 i2o_device_remove(dev);
832 802
833 class_device_unregister(c->classdev);
834 device_del(&c->device); 803 device_del(&c->device);
835 804
836 /* Ask the IOP to switch to RESET state */ 805 /* Ask the IOP to switch to RESET state */
@@ -869,12 +838,11 @@ static int i2o_systab_build(void)
869 i2o_systab.len = sizeof(struct i2o_sys_tbl) + num_controllers * 838 i2o_systab.len = sizeof(struct i2o_sys_tbl) + num_controllers *
870 sizeof(struct i2o_sys_tbl_entry); 839 sizeof(struct i2o_sys_tbl_entry);
871 840
872 systab = i2o_systab.virt = kmalloc(i2o_systab.len, GFP_KERNEL); 841 systab = i2o_systab.virt = kzalloc(i2o_systab.len, GFP_KERNEL);
873 if (!systab) { 842 if (!systab) {
874 osm_err("unable to allocate memory for System Table\n"); 843 osm_err("unable to allocate memory for System Table\n");
875 return -ENOMEM; 844 return -ENOMEM;
876 } 845 }
877 memset(systab, 0, i2o_systab.len);
878 846
879 systab->version = I2OVERSION; 847 systab->version = I2OVERSION;
880 systab->change_ind = change_ind + 1; 848 systab->change_ind = change_ind + 1;
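The kmalloc() plus memset() pair above collapses into kzalloc(), which allocates and zeroes in a single call:

    /* before */
    systab = kmalloc(i2o_systab.len, GFP_KERNEL);
    if (systab)
            memset(systab, 0, i2o_systab.len);

    /* after: same semantics, one call */
    systab = kzalloc(i2o_systab.len, GFP_KERNEL);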
@@ -952,30 +920,30 @@ static int i2o_parse_hrt(struct i2o_controller *c)
952 */ 920 */
953int i2o_status_get(struct i2o_controller *c) 921int i2o_status_get(struct i2o_controller *c)
954{ 922{
955 struct i2o_message __iomem *msg; 923 struct i2o_message *msg;
956 u32 m;
957 volatile u8 *status_block; 924 volatile u8 *status_block;
958 unsigned long timeout; 925 unsigned long timeout;
959 926
960 status_block = (u8 *) c->status_block.virt; 927 status_block = (u8 *) c->status_block.virt;
961 memset(c->status_block.virt, 0, sizeof(i2o_status_block)); 928 memset(c->status_block.virt, 0, sizeof(i2o_status_block));
962 929
963 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 930 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
964 if (m == I2O_QUEUE_EMPTY) 931 if (IS_ERR(msg))
965 return -ETIMEDOUT; 932 return PTR_ERR(msg);
966 933
967 writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 934 msg->u.head[0] = cpu_to_le32(NINE_WORD_MSG_SIZE | SGL_OFFSET_0);
968 writel(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 | ADAPTER_TID, 935 msg->u.head[1] =
969 &msg->u.head[1]); 936 cpu_to_le32(I2O_CMD_STATUS_GET << 24 | HOST_TID << 12 |
970 writel(i2o_exec_driver.context, &msg->u.s.icntxt); 937 ADAPTER_TID);
971 writel(0, &msg->u.s.tcntxt); // FIXME: use resonable transaction context 938 msg->u.s.icntxt = cpu_to_le32(i2o_exec_driver.context);
972 writel(0, &msg->body[0]); 939 msg->u.s.tcntxt = cpu_to_le32(0x00000000);
973 writel(0, &msg->body[1]); 940 msg->body[0] = cpu_to_le32(0x00000000);
974 writel(i2o_dma_low(c->status_block.phys), &msg->body[2]); 941 msg->body[1] = cpu_to_le32(0x00000000);
975 writel(i2o_dma_high(c->status_block.phys), &msg->body[3]); 942 msg->body[2] = cpu_to_le32(i2o_dma_low(c->status_block.phys));
976 writel(sizeof(i2o_status_block), &msg->body[4]); /* always 88 bytes */ 943 msg->body[3] = cpu_to_le32(i2o_dma_high(c->status_block.phys));
944 msg->body[4] = cpu_to_le32(sizeof(i2o_status_block)); /* always 88 bytes */
977 945
978 i2o_msg_post(c, m); 946 i2o_msg_post(c, msg);
979 947
980 /* Wait for a reply */ 948 /* Wait for a reply */
981 timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ; 949 timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ;
@@ -1002,7 +970,7 @@ int i2o_status_get(struct i2o_controller *c)
1002 * The HRT contains information about possible hidden devices but is 970 * The HRT contains information about possible hidden devices but is
1003 * mostly useless to us. 971 * mostly useless to us.
1004 * 972 *
1005 * Returns 0 on success or negativer error code on failure. 973 * Returns 0 on success or negative error code on failure.
1006 */ 974 */
1007static int i2o_hrt_get(struct i2o_controller *c) 975static int i2o_hrt_get(struct i2o_controller *c)
1008{ 976{
@@ -1013,20 +981,20 @@ static int i2o_hrt_get(struct i2o_controller *c)
1013 struct device *dev = &c->pdev->dev; 981 struct device *dev = &c->pdev->dev;
1014 982
1015 for (i = 0; i < I2O_HRT_GET_TRIES; i++) { 983 for (i = 0; i < I2O_HRT_GET_TRIES; i++) {
1016 struct i2o_message __iomem *msg; 984 struct i2o_message *msg;
1017 u32 m;
1018 985
1019 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 986 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
1020 if (m == I2O_QUEUE_EMPTY) 987 if (IS_ERR(msg))
1021 return -ETIMEDOUT; 988 return PTR_ERR(msg);
1022 989
1023 writel(SIX_WORD_MSG_SIZE | SGL_OFFSET_4, &msg->u.head[0]); 990 msg->u.head[0] = cpu_to_le32(SIX_WORD_MSG_SIZE | SGL_OFFSET_4);
1024 writel(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 | ADAPTER_TID, 991 msg->u.head[1] =
1025 &msg->u.head[1]); 992 cpu_to_le32(I2O_CMD_HRT_GET << 24 | HOST_TID << 12 |
1026 writel(0xd0000000 | c->hrt.len, &msg->body[0]); 993 ADAPTER_TID);
1027 writel(c->hrt.phys, &msg->body[1]); 994 msg->body[0] = cpu_to_le32(0xd0000000 | c->hrt.len);
995 msg->body[1] = cpu_to_le32(c->hrt.phys);
1028 996
1029 rc = i2o_msg_post_wait_mem(c, m, 20, &c->hrt); 997 rc = i2o_msg_post_wait_mem(c, msg, 20, &c->hrt);
1030 998
1031 if (rc < 0) { 999 if (rc < 0) {
1032 osm_err("%s: Unable to get HRT (status=%#x)\n", c->name, 1000 osm_err("%s: Unable to get HRT (status=%#x)\n", c->name,
@@ -1051,15 +1019,6 @@ static int i2o_hrt_get(struct i2o_controller *c)
1051} 1019}
1052 1020
1053/** 1021/**
1054 * i2o_iop_free - Free the i2o_controller struct
1055 * @c: I2O controller to free
1056 */
1057void i2o_iop_free(struct i2o_controller *c)
1058{
1059 kfree(c);
1060};
1061
1062/**
1063 * i2o_iop_release - release the memory for a I2O controller 1022 * i2o_iop_release - release the memory for a I2O controller
1064 * @dev: I2O controller which should be released 1023 * @dev: I2O controller which should be released
1065 * 1024 *
@@ -1073,14 +1032,11 @@ static void i2o_iop_release(struct device *dev)
1073 i2o_iop_free(c); 1032 i2o_iop_free(c);
1074}; 1033};
1075 1034
1076/* I2O controller class */
1077static struct class *i2o_controller_class;
1078
1079/** 1035/**
1080 * i2o_iop_alloc - Allocate and initialize a i2o_controller struct 1036 * i2o_iop_alloc - Allocate and initialize a i2o_controller struct
1081 * 1037 *
1082 * Allocate the necessary memory for a i2o_controller struct and 1038 * Allocate the necessary memory for a i2o_controller struct and
1083 * initialize the lists. 1039 * initialize the lists and message mempool.
1084 * 1040 *
1085 * Returns a pointer to the I2O controller or a negative error code on 1041 * Returns a pointer to the I2O controller or a negative error code on
1086 * failure. 1042 * failure.
@@ -1089,20 +1045,29 @@ struct i2o_controller *i2o_iop_alloc(void)
1089{ 1045{
1090 static int unit = 0; /* 0 and 1 are NULL IOP and Local Host */ 1046 static int unit = 0; /* 0 and 1 are NULL IOP and Local Host */
1091 struct i2o_controller *c; 1047 struct i2o_controller *c;
1048 char poolname[32];
1092 1049
1093 c = kmalloc(sizeof(*c), GFP_KERNEL); 1050 c = kzalloc(sizeof(*c), GFP_KERNEL);
1094 if (!c) { 1051 if (!c) {
1095 osm_err("i2o: Insufficient memory to allocate an I2O controller." 1052 osm_err("i2o: Insufficient memory to allocate an I2O controller."
1096 "\n"); 1053 "\n");
1097 return ERR_PTR(-ENOMEM); 1054 return ERR_PTR(-ENOMEM);
1098 } 1055 }
1099 memset(c, 0, sizeof(*c)); 1056
1057 c->unit = unit++;
1058 sprintf(c->name, "iop%d", c->unit);
1059
1060 snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name);
1061 if (i2o_pool_alloc
1062 (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4,
1063 I2O_MSG_INPOOL_MIN)) {
1064 kfree(c);
1065 return ERR_PTR(-ENOMEM);
1066 };
1100 1067
1101 INIT_LIST_HEAD(&c->devices); 1068 INIT_LIST_HEAD(&c->devices);
1102 spin_lock_init(&c->lock); 1069 spin_lock_init(&c->lock);
1103 init_MUTEX(&c->lct_lock); 1070 init_MUTEX(&c->lct_lock);
1104 c->unit = unit++;
1105 sprintf(c->name, "iop%d", c->unit);
1106 1071
1107 device_initialize(&c->device); 1072 device_initialize(&c->device);
1108 1073
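The reworked i2o_iop_alloc() also illustrates the usual cleanup discipline for constructors returning ERR_PTR(): anything acquired before a failure must be released before the errno goes back. In the shape the hunk uses (i2o_pool_alloc() as introduced elsewhere in this series):

    c = kzalloc(sizeof(*c), GFP_KERNEL);
    if (!c)
            return ERR_PTR(-ENOMEM);

    if (i2o_pool_alloc(&c->in_msg, poolname,
                       I2O_INBOUND_MSG_FRAME_SIZE * 4, I2O_MSG_INPOOL_MIN)) {
            kfree(c);                       /* undo the allocation above */
            return ERR_PTR(-ENOMEM);
    }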
@@ -1137,36 +1102,29 @@ int i2o_iop_add(struct i2o_controller *c)
1137 goto iop_reset; 1102 goto iop_reset;
1138 } 1103 }
1139 1104
1140 c->classdev = class_device_create(i2o_controller_class, NULL, MKDEV(0,0),
1141 &c->device, "iop%d", c->unit);
1142 if (IS_ERR(c->classdev)) {
1143 osm_err("%s: could not add controller class\n", c->name);
1144 goto device_del;
1145 }
1146
1147 osm_info("%s: Activating I2O controller...\n", c->name); 1105 osm_info("%s: Activating I2O controller...\n", c->name);
1148 osm_info("%s: This may take a few minutes if there are many devices\n", 1106 osm_info("%s: This may take a few minutes if there are many devices\n",
1149 c->name); 1107 c->name);
1150 1108
1151 if ((rc = i2o_iop_activate(c))) { 1109 if ((rc = i2o_iop_activate(c))) {
1152 osm_err("%s: could not activate controller\n", c->name); 1110 osm_err("%s: could not activate controller\n", c->name);
1153 goto class_del; 1111 goto device_del;
1154 } 1112 }
1155 1113
1156 osm_debug("%s: building sys table...\n", c->name); 1114 osm_debug("%s: building sys table...\n", c->name);
1157 1115
1158 if ((rc = i2o_systab_build())) 1116 if ((rc = i2o_systab_build()))
1159 goto class_del; 1117 goto device_del;
1160 1118
1161 osm_debug("%s: online controller...\n", c->name); 1119 osm_debug("%s: online controller...\n", c->name);
1162 1120
1163 if ((rc = i2o_iop_online(c))) 1121 if ((rc = i2o_iop_online(c)))
1164 goto class_del; 1122 goto device_del;
1165 1123
1166 osm_debug("%s: getting LCT...\n", c->name); 1124 osm_debug("%s: getting LCT...\n", c->name);
1167 1125
1168 if ((rc = i2o_exec_lct_get(c))) 1126 if ((rc = i2o_exec_lct_get(c)))
1169 goto class_del; 1127 goto device_del;
1170 1128
1171 list_add(&c->list, &i2o_controllers); 1129 list_add(&c->list, &i2o_controllers);
1172 1130
@@ -1176,9 +1134,6 @@ int i2o_iop_add(struct i2o_controller *c)
1176 1134
1177 return 0; 1135 return 0;
1178 1136
1179 class_del:
1180 class_device_unregister(c->classdev);
1181
1182 device_del: 1137 device_del:
1183 device_del(&c->device); 1138 device_del(&c->device);
1184 1139
@@ -1199,28 +1154,27 @@ int i2o_iop_add(struct i2o_controller *c)
1199 * is waited for, or expected. If you do not want further notifications, 1154 * is waited for, or expected. If you do not want further notifications,
1200 * call i2o_event_register again with an evt_mask of 0. 1155 * call i2o_event_register again with an evt_mask of 0.
1201 * 1156 *
1202 * Returns 0 on success or -ETIMEDOUT if no message could be fetched for 1157 * Returns 0 on success or negative error code on failure.
1203 * sending the request.
1204 */ 1158 */
1205int i2o_event_register(struct i2o_device *dev, struct i2o_driver *drv, 1159int i2o_event_register(struct i2o_device *dev, struct i2o_driver *drv,
1206 int tcntxt, u32 evt_mask) 1160 int tcntxt, u32 evt_mask)
1207{ 1161{
1208 struct i2o_controller *c = dev->iop; 1162 struct i2o_controller *c = dev->iop;
1209 struct i2o_message __iomem *msg; 1163 struct i2o_message *msg;
1210 u32 m;
1211 1164
1212 m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET); 1165 msg = i2o_msg_get_wait(c, I2O_TIMEOUT_MESSAGE_GET);
1213 if (m == I2O_QUEUE_EMPTY) 1166 if (IS_ERR(msg))
1214 return -ETIMEDOUT; 1167 return PTR_ERR(msg);
1215 1168
1216 writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]); 1169 msg->u.head[0] = cpu_to_le32(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0);
1217 writel(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev->lct_data. 1170 msg->u.head[1] =
1218 tid, &msg->u.head[1]); 1171 cpu_to_le32(I2O_CMD_UTIL_EVT_REGISTER << 24 | HOST_TID << 12 | dev->
1219 writel(drv->context, &msg->u.s.icntxt); 1172 lct_data.tid);
1220 writel(tcntxt, &msg->u.s.tcntxt); 1173 msg->u.s.icntxt = cpu_to_le32(drv->context);
1221 writel(evt_mask, &msg->body[0]); 1174 msg->u.s.tcntxt = cpu_to_le32(tcntxt);
1175 msg->body[0] = cpu_to_le32(evt_mask);
1222 1176
1223 i2o_msg_post(c, m); 1177 i2o_msg_post(c, msg);
1224 1178
1225 return 0; 1179 return 0;
1226}; 1180};
@@ -1239,14 +1193,8 @@ static int __init i2o_iop_init(void)
1239 1193
1240 printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n"); 1194 printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
1241 1195
1242 i2o_controller_class = class_create(THIS_MODULE, "i2o_controller");
1243 if (IS_ERR(i2o_controller_class)) {
1244 osm_err("can't register class i2o_controller\n");
1245 goto exit;
1246 }
1247
1248 if ((rc = i2o_driver_init())) 1196 if ((rc = i2o_driver_init()))
1249 goto class_exit; 1197 goto exit;
1250 1198
1251 if ((rc = i2o_exec_init())) 1199 if ((rc = i2o_exec_init()))
1252 goto driver_exit; 1200 goto driver_exit;
@@ -1262,9 +1210,6 @@ static int __init i2o_iop_init(void)
1262 driver_exit: 1210 driver_exit:
1263 i2o_driver_exit(); 1211 i2o_driver_exit();
1264 1212
1265 class_exit:
1266 class_destroy(i2o_controller_class);
1267
1268 exit: 1213 exit:
1269 return rc; 1214 return rc;
1270} 1215}
@@ -1279,7 +1224,6 @@ static void __exit i2o_iop_exit(void)
1279 i2o_pci_exit(); 1224 i2o_pci_exit();
1280 i2o_exec_exit(); 1225 i2o_exec_exit();
1281 i2o_driver_exit(); 1226 i2o_driver_exit();
1282 class_destroy(i2o_controller_class);
1283}; 1227};
1284 1228
1285module_init(i2o_iop_init); 1229module_init(i2o_iop_init);
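With the controller class device gone, i2o_iop_add() drops one rung from its unwind ladder; what remains is the kernel's standard staged-setup pattern, sketched generically (setup_a(), setup_b() and undo_setup_a() are stand-ins, not symbols from this file):

    if ((rc = setup_a()))
            return rc;
    if ((rc = setup_b()))
            goto err_a;             /* unwind in reverse order */
    return 0;

    err_a:
            undo_setup_a();
            return rc;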
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index ee7075fa1ec3..c5b656cdea7c 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -339,7 +339,7 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
339 pci_name(pdev)); 339 pci_name(pdev));
340 340
341 c->pdev = pdev; 341 c->pdev = pdev;
342 c->device.parent = get_device(&pdev->dev); 342 c->device.parent = &pdev->dev;
343 343
344 /* Cards that fall apart if you hit them with large I/O loads... */ 344 /* Cards that fall apart if you hit them with large I/O loads... */
345 if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) { 345 if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) {
@@ -410,8 +410,6 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
410 if ((rc = i2o_iop_add(c))) 410 if ((rc = i2o_iop_add(c)))
411 goto uninstall; 411 goto uninstall;
412 412
413 get_device(&c->device);
414
415 if (i960) 413 if (i960)
416 pci_write_config_word(i960, 0x42, 0x03ff); 414 pci_write_config_word(i960, 0x42, 0x03ff);
417 415
@@ -424,7 +422,6 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
424 i2o_pci_free(c); 422 i2o_pci_free(c);
425 423
426 free_controller: 424 free_controller:
427 put_device(c->device.parent);
428 i2o_iop_free(c); 425 i2o_iop_free(c);
429 426
430 disable: 427 disable:
@@ -454,7 +451,6 @@ static void __devexit i2o_pci_remove(struct pci_dev *pdev)
454 451
455 printk(KERN_INFO "%s: Controller removed.\n", c->name); 452 printk(KERN_INFO "%s: Controller removed.\n", c->name);
456 453
457 put_device(c->device.parent);
458 put_device(&c->device); 454 put_device(&c->device);
459}; 455};
460 456
@@ -483,4 +479,5 @@ void __exit i2o_pci_exit(void)
483{ 479{
484 pci_unregister_driver(&i2o_pci_driver); 480 pci_unregister_driver(&i2o_pci_driver);
485}; 481};
482
486MODULE_DEVICE_TABLE(pci, i2o_pci_ids); 483MODULE_DEVICE_TABLE(pci, i2o_pci_ids);
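The explicit get_device()/put_device() on the PCI parent disappear because the driver core itself pins the parent for as long as the child device is registered; assuming that 2.6-era behaviour, the probe only needs to set the pointer:

    c->pdev = pdev;
    c->device.parent = &pdev->dev;  /* device_add() takes its own
                                     * reference on the parent */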
diff --git a/drivers/mmc/mmc_block.c b/drivers/mmc/mmc_block.c
index abcf19116d70..8e380c14bf65 100644
--- a/drivers/mmc/mmc_block.c
+++ b/drivers/mmc/mmc_block.c
@@ -263,7 +263,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
263 */ 263 */
264 add_disk_randomness(req->rq_disk); 264 add_disk_randomness(req->rq_disk);
265 blkdev_dequeue_request(req); 265 blkdev_dequeue_request(req);
266 end_that_request_last(req); 266 end_that_request_last(req, 1);
267 } 267 }
268 spin_unlock_irq(&md->lock); 268 spin_unlock_irq(&md->lock);
269 } while (ret); 269 } while (ret);
@@ -289,7 +289,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
289 289
290 add_disk_randomness(req->rq_disk); 290 add_disk_randomness(req->rq_disk);
291 blkdev_dequeue_request(req); 291 blkdev_dequeue_request(req);
292 end_that_request_last(req); 292 end_that_request_last(req, 0);
293 spin_unlock_irq(&md->lock); 293 spin_unlock_irq(&md->lock);
294 294
295 return 0; 295 return 0;
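end_that_request_last() grows an uptodate argument in this series: the caller now states whether the request completed successfully (nonzero) or with an error (0), as the two mmc_block call sites above do. The completion sequence, sketched:

    add_disk_randomness(req->rq_disk);
    blkdev_dequeue_request(req);
    end_that_request_last(req, 1);  /* nonzero = success, 0 = error */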
diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index c782a6329805..fa39b944bc46 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -6,7 +6,7 @@ menu "PHY device support"
6 6
7config PHYLIB 7config PHYLIB
8 tristate "PHY Device support and infrastructure" 8 tristate "PHY Device support and infrastructure"
9 depends on NET_ETHERNET && (BROKEN || !ARCH_S390) 9 depends on NET_ETHERNET && (BROKEN || !S390)
10 help 10 help
11 Ethernet controllers are usually attached to PHY 11 Ethernet controllers are usually attached to PHY
12 devices. This option provides infrastructure for 12 devices. This option provides infrastructure for
diff --git a/drivers/net/plip.c b/drivers/net/plip.c
index 1bd22cd40c75..87ee3271b17d 100644
--- a/drivers/net/plip.c
+++ b/drivers/net/plip.c
@@ -98,7 +98,6 @@ static const char version[] = "NET3 PLIP version 2.4-parport gniibe@mri.co.jp\n"
98#include <linux/in.h> 98#include <linux/in.h>
99#include <linux/errno.h> 99#include <linux/errno.h>
100#include <linux/delay.h> 100#include <linux/delay.h>
101#include <linux/lp.h>
102#include <linux/init.h> 101#include <linux/init.h>
103#include <linux/netdevice.h> 102#include <linux/netdevice.h>
104#include <linux/etherdevice.h> 103#include <linux/etherdevice.h>
@@ -106,7 +105,6 @@ static const char version[] = "NET3 PLIP version 2.4-parport gniibe@mri.co.jp\n"
106#include <linux/skbuff.h> 105#include <linux/skbuff.h>
107#include <linux/if_plip.h> 106#include <linux/if_plip.h>
108#include <linux/workqueue.h> 107#include <linux/workqueue.h>
109#include <linux/ioport.h>
110#include <linux/spinlock.h> 108#include <linux/spinlock.h>
111#include <linux/parport.h> 109#include <linux/parport.h>
112#include <linux/bitops.h> 110#include <linux/bitops.h>
diff --git a/drivers/parport/Kconfig b/drivers/parport/Kconfig
index 725a14119f2a..b8241561da45 100644
--- a/drivers/parport/Kconfig
+++ b/drivers/parport/Kconfig
@@ -77,7 +77,7 @@ config PARPORT_PC_SUPERIO
77 77
78config PARPORT_PC_PCMCIA 78config PARPORT_PC_PCMCIA
79 tristate "Support for PCMCIA management for PC-style ports" 79 tristate "Support for PCMCIA management for PC-style ports"
80 depends on PARPORT!=n && (PCMCIA!=n && PARPORT_PC=m && PARPORT_PC || PARPORT_PC=y && PCMCIA) 80 depends on PCMCIA && PARPORT_PC
81 help 81 help
82 Say Y here if you need PCMCIA support for your PC-style parallel 82 Say Y here if you need PCMCIA support for your PC-style parallel
83 ports. If unsure, say N. 83 ports. If unsure, say N.
diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c
index 075c7eb5c85d..9ee67321b630 100644
--- a/drivers/parport/daisy.c
+++ b/drivers/parport/daisy.c
@@ -144,9 +144,9 @@ again:
144 add_dev (numdevs++, port, -1); 144 add_dev (numdevs++, port, -1);
145 145
146 /* Find out the legacy device's IEEE 1284 device ID. */ 146 /* Find out the legacy device's IEEE 1284 device ID. */
147 deviceid = kmalloc (1000, GFP_KERNEL); 147 deviceid = kmalloc (1024, GFP_KERNEL);
148 if (deviceid) { 148 if (deviceid) {
149 if (parport_device_id (numdevs - 1, deviceid, 1000) > 2) 149 if (parport_device_id (numdevs - 1, deviceid, 1024) > 2)
150 detected++; 150 detected++;
151 151
152 kfree (deviceid); 152 kfree (deviceid);
@@ -252,7 +252,7 @@ struct pardevice *parport_open (int devnum, const char *name,
252 selected = port->daisy; 252 selected = port->daisy;
253 parport_release (dev); 253 parport_release (dev);
254 254
255 if (selected != port->daisy) { 255 if (selected != daisy) {
256 /* No corresponding device. */ 256 /* No corresponding device. */
257 parport_unregister_device (dev); 257 parport_unregister_device (dev);
258 return NULL; 258 return NULL;
@@ -344,9 +344,9 @@ static int cpp_daisy (struct parport *port, int cmd)
344 PARPORT_CONTROL_STROBE, 344 PARPORT_CONTROL_STROBE,
345 PARPORT_CONTROL_STROBE); 345 PARPORT_CONTROL_STROBE);
346 udelay (1); 346 udelay (1);
347 s = parport_read_status (port);
347 parport_frob_control (port, PARPORT_CONTROL_STROBE, 0); 348 parport_frob_control (port, PARPORT_CONTROL_STROBE, 0);
348 udelay (1); 349 udelay (1);
349 s = parport_read_status (port);
350 parport_write_data (port, 0xff); udelay (2); 350 parport_write_data (port, 0xff); udelay (2);
351 351
352 return s; 352 return s;
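The reordering in cpp_daisy() samples the status register while nStrobe is still asserted rather than after it is released, so the answer is read while the device is, presumably, still driving it:

    parport_frob_control (port, PARPORT_CONTROL_STROBE,
                          PARPORT_CONTROL_STROBE);
    udelay (1);
    s = parport_read_status (port);   /* sample before dropping strobe */
    parport_frob_control (port, PARPORT_CONTROL_STROBE, 0);
    udelay (1);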
@@ -395,15 +395,15 @@ int parport_daisy_select (struct parport *port, int daisy, int mode)
395 case IEEE1284_MODE_EPP: 395 case IEEE1284_MODE_EPP:
396 case IEEE1284_MODE_EPPSL: 396 case IEEE1284_MODE_EPPSL:
397 case IEEE1284_MODE_EPPSWE: 397 case IEEE1284_MODE_EPPSWE:
398 return (cpp_daisy (port, 0x20 + daisy) & 398 return !(cpp_daisy (port, 0x20 + daisy) &
399 PARPORT_STATUS_ERROR); 399 PARPORT_STATUS_ERROR);
400 400
401 // For these modes we should switch to ECP mode: 401 // For these modes we should switch to ECP mode:
402 case IEEE1284_MODE_ECP: 402 case IEEE1284_MODE_ECP:
403 case IEEE1284_MODE_ECPRLE: 403 case IEEE1284_MODE_ECPRLE:
404 case IEEE1284_MODE_ECPSWE: 404 case IEEE1284_MODE_ECPSWE:
405 return (cpp_daisy (port, 0xd0 + daisy) & 405 return !(cpp_daisy (port, 0xd0 + daisy) &
406 PARPORT_STATUS_ERROR); 406 PARPORT_STATUS_ERROR);
407 407
408 // The daisy chain specification says nothing about BECP. 408 // The daisy chain specification says nothing about BECP.
409 // Maybe it's wise to use ECP? 409 // Maybe it's wise to use ECP?
@@ -413,8 +413,8 @@ int parport_daisy_select (struct parport *port, int daisy, int mode)
413 case IEEE1284_MODE_BYTE: 413 case IEEE1284_MODE_BYTE:
414 case IEEE1284_MODE_COMPAT: 414 case IEEE1284_MODE_COMPAT:
415 default: 415 default:
416 return (cpp_daisy (port, 0xe0 + daisy) & 416 return !(cpp_daisy (port, 0xe0 + daisy) &
417 PARPORT_STATUS_ERROR); 417 PARPORT_STATUS_ERROR);
418 } 418 }
419} 419}
420 420
@@ -436,7 +436,7 @@ static int select_port (struct parport *port)
436 436
437static int assign_addrs (struct parport *port) 437static int assign_addrs (struct parport *port)
438{ 438{
439 unsigned char s, last_dev; 439 unsigned char s;
440 unsigned char daisy; 440 unsigned char daisy;
441 int thisdev = numdevs; 441 int thisdev = numdevs;
442 int detected; 442 int detected;
@@ -472,10 +472,13 @@ static int assign_addrs (struct parport *port)
472 } 472 }
473 473
474 parport_write_data (port, 0x78); udelay (2); 474 parport_write_data (port, 0x78); udelay (2);
475 last_dev = 0; /* We've just been speaking to a device, so we 475 s = parport_read_status (port);
476 know there must be at least _one_ out there. */
477 476
478 for (daisy = 0; daisy < 4; daisy++) { 477 for (daisy = 0;
478 (s & (PARPORT_STATUS_PAPEROUT|PARPORT_STATUS_SELECT))
479 == (PARPORT_STATUS_PAPEROUT|PARPORT_STATUS_SELECT)
480 && daisy < 4;
481 ++daisy) {
479 parport_write_data (port, daisy); 482 parport_write_data (port, daisy);
480 udelay (2); 483 udelay (2);
481 parport_frob_control (port, 484 parport_frob_control (port,
@@ -485,14 +488,18 @@ static int assign_addrs (struct parport *port)
485 parport_frob_control (port, PARPORT_CONTROL_STROBE, 0); 488 parport_frob_control (port, PARPORT_CONTROL_STROBE, 0);
486 udelay (1); 489 udelay (1);
487 490
488 if (last_dev) 491 add_dev (numdevs++, port, daisy);
489 /* No more devices. */
490 break;
491 492
492 last_dev = !(parport_read_status (port) 493 /* See if this device thought it was the last in the
493 & PARPORT_STATUS_BUSY); 494 * chain. */
495 if (!(s & PARPORT_STATUS_BUSY))
496 break;
494 497
495 add_dev (numdevs++, port, daisy); 498 /* We now see pass-through status: either
499 last_dev from the next device or, if
500 last_dev is not driven, status lines from
501 some non-daisy-chain device. */
502 s = parport_read_status (port);
496 } 503 }
497 504
498 parport_write_data (port, 0xff); udelay (2); 505 parport_write_data (port, 0xff); udelay (2);
@@ -501,11 +508,11 @@ static int assign_addrs (struct parport *port)
501 detected); 508 detected);
502 509
503 /* Ask the new devices to introduce themselves. */ 510 /* Ask the new devices to introduce themselves. */
504 deviceid = kmalloc (1000, GFP_KERNEL); 511 deviceid = kmalloc (1024, GFP_KERNEL);
505 if (!deviceid) return 0; 512 if (!deviceid) return 0;
506 513
507 for (daisy = 0; thisdev < numdevs; thisdev++, daisy++) 514 for (daisy = 0; thisdev < numdevs; thisdev++, daisy++)
508 parport_device_id (thisdev, deviceid, 1000); 515 parport_device_id (thisdev, deviceid, 1024);
509 516
510 kfree (deviceid); 517 kfree (deviceid);
511 return detected; 518 return detected;
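The rewritten assignment loop in assign_addrs() keys on two status bits at once: a daisy-chain device answers the address phase with both nPaperOut and nSelect high, and the loop stops as soon as that signature disappears. The test in isolation:

    unsigned char s = parport_read_status (port);
    int daisy_present =
            (s & (PARPORT_STATUS_PAPEROUT | PARPORT_STATUS_SELECT))
            == (PARPORT_STATUS_PAPEROUT | PARPORT_STATUS_SELECT);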
diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c
index ce1e2aad8b10..d6c77658231e 100644
--- a/drivers/parport/ieee1284_ops.c
+++ b/drivers/parport/ieee1284_ops.c
@@ -165,17 +165,7 @@ size_t parport_ieee1284_read_nibble (struct parport *port,
165 /* Does the error line indicate end of data? */ 165 /* Does the error line indicate end of data? */
166 if (((i & 1) == 0) && 166 if (((i & 1) == 0) &&
167 (parport_read_status(port) & PARPORT_STATUS_ERROR)) { 167 (parport_read_status(port) & PARPORT_STATUS_ERROR)) {
168 port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA; 168 goto end_of_data;
169 DPRINTK (KERN_DEBUG
170 "%s: No more nibble data (%d bytes)\n",
171 port->name, i/2);
172
173 /* Go to reverse idle phase. */
174 parport_frob_control (port,
175 PARPORT_CONTROL_AUTOFD,
176 PARPORT_CONTROL_AUTOFD);
177 port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
178 break;
179 } 169 }
180 170
181 /* Event 7: Set nAutoFd low. */ 171 /* Event 7: Set nAutoFd low. */
@@ -225,18 +215,25 @@ size_t parport_ieee1284_read_nibble (struct parport *port,
225 byte = nibble; 215 byte = nibble;
226 } 216 }
227 217
228 i /= 2; /* i is now in bytes */
229
230 if (i == len) { 218 if (i == len) {
231 /* Read the last nibble without checking data avail. */ 219 /* Read the last nibble without checking data avail. */
232 port = port->physport; 220 if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
233 if (parport_read_status (port) & PARPORT_STATUS_ERROR) 221 end_of_data:
234 port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA; 222 DPRINTK (KERN_DEBUG
223 "%s: No more nibble data (%d bytes)\n",
224 port->name, i/2);
225
226 /* Go to reverse idle phase. */
227 parport_frob_control (port,
228 PARPORT_CONTROL_AUTOFD,
229 PARPORT_CONTROL_AUTOFD);
230 port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
231 }
235 else 232 else
236 port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL; 233 port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
237 } 234 }
238 235
239 return i; 236 return i/2;
240#endif /* IEEE1284 support */ 237#endif /* IEEE1284 support */
241} 238}
242 239
@@ -256,17 +253,7 @@ size_t parport_ieee1284_read_byte (struct parport *port,
256 253
257 /* Data available? */ 254 /* Data available? */
258 if (parport_read_status (port) & PARPORT_STATUS_ERROR) { 255 if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
259 port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DNA; 256 goto end_of_data;
260 DPRINTK (KERN_DEBUG
261 "%s: No more byte data (%Zd bytes)\n",
262 port->name, count);
263
264 /* Go to reverse idle phase. */
265 parport_frob_control (port,
266 PARPORT_CONTROL_AUTOFD,
267 PARPORT_CONTROL_AUTOFD);
268 port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
269 break;
270 } 257 }
271 258
272 /* Event 14: Place data bus in high impedance state. */ 259 /* Event 14: Place data bus in high impedance state. */
@@ -318,11 +305,20 @@ size_t parport_ieee1284_read_byte (struct parport *port,
318 305
319 if (count == len) { 306 if (count == len) {
320 /* Read the last byte without checking data avail. */ 307 /* Read the last byte without checking data avail. */
321 port = port->physport; 308 if (parport_read_status (port) & PARPORT_STATUS_ERROR) {
322 if (parport_read_status (port) & PARPORT_STATUS_ERROR) 309 end_of_data:
323 port->ieee1284.phase = IEEE1284_PH_HBUSY_DNA; 310 DPRINTK (KERN_DEBUG
311 "%s: No more byte data (%Zd bytes)\n",
312 port->name, count);
313
314 /* Go to reverse idle phase. */
315 parport_frob_control (port,
316 PARPORT_CONTROL_AUTOFD,
317 PARPORT_CONTROL_AUTOFD);
318 port->physport->ieee1284.phase = IEEE1284_PH_REV_IDLE;
319 }
324 else 320 else
325 port->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL; 321 port->physport->ieee1284.phase = IEEE1284_PH_HBUSY_DAVAIL;
326 } 322 }
327 323
328 return count; 324 return count;
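In parport_ieee1284_read_nibble() the counting fix matters as much as the goto restructuring: i counts nibbles while the caller expects bytes, and the old code halved i inside the loop, after which the i == len test compared bytes against what is, by all appearances, a nibble budget. The rework keeps i in nibbles throughout and converts exactly once, at the return:

    if (i == len) {
            /* final data-available check, as in the hunk above */
    }

    return i / 2;   /* nibbles back to bytes */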
diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index c6493ad7c0c8..18e85ccdae67 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -1169,7 +1169,7 @@ dump_parport_state ("fwd idle", port);
1169 1169
1170/* GCC does not inline an extern inline function later overwritten as non-inline, 1170/* GCC does not inline an extern inline function later overwritten as non-inline,
1171 so we use outlined_ variants here. */ 1171 so we use outlined_ variants here. */
1172static struct parport_operations parport_pc_ops = 1172static const struct parport_operations parport_pc_ops =
1173{ 1173{
1174 .write_data = parport_pc_write_data, 1174 .write_data = parport_pc_write_data,
1175 .read_data = parport_pc_read_data, 1175 .read_data = parport_pc_read_data,
@@ -1211,10 +1211,11 @@ static struct parport_operations parport_pc_ops =
1211static void __devinit show_parconfig_smsc37c669(int io, int key) 1211static void __devinit show_parconfig_smsc37c669(int io, int key)
1212{ 1212{
1213 int cr1,cr4,cra,cr23,cr26,cr27,i=0; 1213 int cr1,cr4,cra,cr23,cr26,cr27,i=0;
1214 static const char *modes[]={ "SPP and Bidirectional (PS/2)", 1214 static const char *const modes[]={
1215 "EPP and SPP", 1215 "SPP and Bidirectional (PS/2)",
1216 "ECP", 1216 "EPP and SPP",
1217 "ECP and EPP" }; 1217 "ECP",
1218 "ECP and EPP" };
1218 1219
1219 outb(key,io); 1220 outb(key,io);
1220 outb(key,io); 1221 outb(key,io);
@@ -1288,7 +1289,7 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
1288static void __devinit show_parconfig_winbond(int io, int key) 1289static void __devinit show_parconfig_winbond(int io, int key)
1289{ 1290{
1290 int cr30,cr60,cr61,cr70,cr74,crf0,i=0; 1291 int cr30,cr60,cr61,cr70,cr74,crf0,i=0;
1291 static const char *modes[] = { 1292 static const char *const modes[] = {
1292 "Standard (SPP) and Bidirectional(PS/2)", /* 0 */ 1293 "Standard (SPP) and Bidirectional(PS/2)", /* 0 */
1293 "EPP-1.9 and SPP", 1294 "EPP-1.9 and SPP",
1294 "ECP", 1295 "ECP",
@@ -1297,7 +1298,9 @@ static void __devinit show_parconfig_winbond(int io, int key)
1297 "EPP-1.7 and SPP", /* 5 */ 1298 "EPP-1.7 and SPP", /* 5 */
1298 "undefined!", 1299 "undefined!",
1299 "ECP and EPP-1.7" }; 1300 "ECP and EPP-1.7" };
1300 static char *irqtypes[] = { "pulsed low, high-Z", "follows nACK" }; 1301 static char *const irqtypes[] = {
1302 "pulsed low, high-Z",
1303 "follows nACK" };
1301 1304
1302 /* The registers are called compatible-PnP because the 1305 /* The registers are called compatible-PnP because the
1303 register layout is modelled after ISA-PnP, the access 1306 register layout is modelled after ISA-PnP, the access
@@ -2396,7 +2399,8 @@ EXPORT_SYMBOL (parport_pc_unregister_port);
2396 2399
2397/* ITE support maintained by Rich Liu <richliu@poorman.org> */ 2400/* ITE support maintained by Rich Liu <richliu@poorman.org> */
2398static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq, 2401static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
2399 int autodma, struct parport_pc_via_data *via) 2402 int autodma,
2403 const struct parport_pc_via_data *via)
2400{ 2404{
2401 short inta_addr[6] = { 0x2A0, 0x2C0, 0x220, 0x240, 0x1E0 }; 2405 short inta_addr[6] = { 0x2A0, 0x2C0, 0x220, 0x240, 0x1E0 };
2402 struct resource *base_res; 2406 struct resource *base_res;
@@ -2524,7 +2528,8 @@ static struct parport_pc_via_data via_8231_data __devinitdata = {
2524}; 2528};
2525 2529
2526static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq, 2530static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
2527 int autodma, struct parport_pc_via_data *via) 2531 int autodma,
2532 const struct parport_pc_via_data *via)
2528{ 2533{
2529 u8 tmp, tmp2, siofunc; 2534 u8 tmp, tmp2, siofunc;
2530 u8 ppcontrol = 0; 2535 u8 ppcontrol = 0;
@@ -2694,8 +2699,9 @@ enum parport_pc_sio_types {
2694 2699
2695/* each element directly indexed from enum list, above */ 2700/* each element directly indexed from enum list, above */
2696static struct parport_pc_superio { 2701static struct parport_pc_superio {
2697 int (*probe) (struct pci_dev *pdev, int autoirq, int autodma, struct parport_pc_via_data *via); 2702 int (*probe) (struct pci_dev *pdev, int autoirq, int autodma,
2698 struct parport_pc_via_data *via; 2703 const struct parport_pc_via_data *via);
2704 const struct parport_pc_via_data *via;
2699} parport_pc_superio_info[] __devinitdata = { 2705} parport_pc_superio_info[] __devinitdata = {
2700 { sio_via_probe, &via_686a_data, }, 2706 { sio_via_probe, &via_686a_data, },
2701 { sio_via_probe, &via_8231_data, }, 2707 { sio_via_probe, &via_8231_data, },
@@ -2828,7 +2834,7 @@ static struct parport_pc_pci {
2828 /* netmos_9815 */ { 2, { { 0, -1 }, { 2, -1 }, } }, /* untested */ 2834 /* netmos_9815 */ { 2, { { 0, -1 }, { 2, -1 }, } }, /* untested */
2829}; 2835};
2830 2836
2831static struct pci_device_id parport_pc_pci_tbl[] = { 2837static const struct pci_device_id parport_pc_pci_tbl[] = {
2832 /* Super-IO onboard chips */ 2838 /* Super-IO onboard chips */
2833 { 0x1106, 0x0686, PCI_ANY_ID, PCI_ANY_ID, 0, 0, sio_via_686a }, 2839 { 0x1106, 0x0686, PCI_ANY_ID, PCI_ANY_ID, 0, 0, sio_via_686a },
2834 { 0x1106, 0x8231, PCI_ANY_ID, PCI_ANY_ID, 0, 0, sio_via_8231 }, 2840 { 0x1106, 0x8231, PCI_ANY_ID, PCI_ANY_ID, 0, 0, sio_via_8231 },
diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c
index 4b48b31ec235..b62aee8de3cb 100644
--- a/drivers/parport/probe.c
+++ b/drivers/parport/probe.c
@@ -11,9 +11,9 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <asm/uaccess.h> 12#include <asm/uaccess.h>
13 13
14static struct { 14static const struct {
15 char *token; 15 const char *token;
16 char *descr; 16 const char *descr;
17} classes[] = { 17} classes[] = {
18 { "", "Legacy device" }, 18 { "", "Legacy device" },
19 { "PRINTER", "Printer" }, 19 { "PRINTER", "Printer" },
@@ -128,8 +128,131 @@ static void parse_data(struct parport *port, int device, char *str)
128 kfree(txt); 128 kfree(txt);
129} 129}
130 130
131/* Read up to count-1 bytes of device id. Terminate buffer with
132 * '\0'. Buffer begins with two Device ID length bytes as given by
133 * device. */
134static ssize_t parport_read_device_id (struct parport *port, char *buffer,
135 size_t count)
136{
137 unsigned char length[2];
138 unsigned lelen, belen;
139 size_t idlens[4];
140 unsigned numidlens;
141 unsigned current_idlen;
142 ssize_t retval;
143 size_t len;
144
145 /* First two bytes are MSB,LSB of inclusive length. */
146 retval = parport_read (port, length, 2);
147
148 if (retval < 0)
149 return retval;
150 if (retval != 2)
151 return -EIO;
152
153 if (count < 2)
154 return 0;
155 memcpy(buffer, length, 2);
156 len = 2;
157
158 /* Some devices wrongly send LE length, and some send it two
159 * bytes short. Construct a sorted array of lengths to try. */
160 belen = (length[0] << 8) + length[1];
161 lelen = (length[1] << 8) + length[0];
162 idlens[0] = min(belen, lelen);
163 idlens[1] = idlens[0]+2;
164 if (belen != lelen) {
165 int off = 2;
 166 /* Don't try lengths of 0x100 and 0x200 as 1 and 2 */
167 if (idlens[0] <= 2)
168 off = 0;
169 idlens[off] = max(belen, lelen);
170 idlens[off+1] = idlens[off]+2;
171 numidlens = off+2;
172 }
173 else {
174 /* Some devices don't truly implement Device ID, but
 175 * just return a constant nibble forever. This
 176 * catches those cases as well. */
177 if (idlens[0] == 0 || idlens[0] > 0xFFF) {
178 printk (KERN_DEBUG "%s: reported broken Device ID"
179 " length of %#zX bytes\n",
180 port->name, idlens[0]);
181 return -EIO;
182 }
183 numidlens = 2;
184 }
185
186 /* Try to respect the given ID length despite all the bugs in
 187 * the ID length. Read the shortest possible ID
 188 * first. */
189 for (current_idlen = 0; current_idlen < numidlens; ++current_idlen) {
190 size_t idlen = idlens[current_idlen];
191 if (idlen+1 >= count)
192 break;
193
194 retval = parport_read (port, buffer+len, idlen-len);
195
196 if (retval < 0)
197 return retval;
198 len += retval;
199
200 if (port->physport->ieee1284.phase != IEEE1284_PH_HBUSY_DAVAIL) {
201 if (belen != len) {
202 printk (KERN_DEBUG "%s: Device ID was %d bytes"
203 " while device told it would be %d"
204 " bytes\n",
205 port->name, len, belen);
206 }
207 goto done;
208 }
209
210 /* This might end reading the Device ID too
211 * soon. Hopefully the needed fields were already in
 212 * the first 256 bytes or so, which we must have
 213 * read by now. */
214 if (buffer[len-1] == ';') {
215 printk (KERN_DEBUG "%s: Device ID reading stopped"
216 " before device told data not available. "
217 "Current idlen %d of %d, len bytes %02X %02X\n",
218 port->name, current_idlen, numidlens,
219 length[0], length[1]);
220 goto done;
221 }
222 }
223 if (current_idlen < numidlens) {
224 /* Buffer not large enough, read to end of buffer. */
225 size_t idlen, len2;
226 if (len+1 < count) {
227 retval = parport_read (port, buffer+len, count-len-1);
228 if (retval < 0)
229 return retval;
230 len += retval;
231 }
232 /* Read the whole ID since some devices would not
 233 * otherwise give back the Device ID from the
 234 * beginning the next time they are asked. */
235 idlen = idlens[current_idlen];
236 len2 = len;
237 while(len2 < idlen && retval > 0) {
238 char tmp[4];
239 retval = parport_read (port, tmp,
240 min(sizeof tmp, idlen-len2));
241 if (retval < 0)
242 return retval;
243 len2 += retval;
244 }
245 }
246 /* In addition, there are broken devices out there that don't
247 even finish off with a semi-colon. We do not need to care
248 about those at this time. */
249 done:
250 buffer[len] = '\0';
251 return len;
252}
253
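
The new parport_read_device_id() has to cope with devices that send the two length bytes byte-swapped, or two short of the inclusive length. A standalone sketch of how the candidate-length table is built, mirroring the idlens[] logic above (illustrative names; not part of the patch):

    #include <stddef.h>

    /* Build the sorted candidate ID lengths from the two header bytes;
     * returns how many candidates were filled in (2 or 4). */
    static unsigned build_idlens(const unsigned char h[2], size_t idlens[4])
    {
            size_t be = (h[0] << 8) | h[1];  /* spec-conformant order */
            size_t le = (h[1] << 8) | h[0];  /* byte-swapped sender   */
            unsigned off;

            idlens[0] = be < le ? be : le;
            idlens[1] = idlens[0] + 2;       /* "length is exclusive" bug */
            if (be == le)
                    return 2;
            /* Don't try lengths such as 0x100 and 0x200 as 1 and 2. */
            off = idlens[0] <= 2 ? 0 : 2;
            idlens[off] = be > le ? be : le;
            idlens[off + 1] = idlens[off] + 2;
            return off + 2;
    }
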
131/* Get Std 1284 Device ID. */ 254/* Get Std 1284 Device ID. */
132ssize_t parport_device_id (int devnum, char *buffer, size_t len) 255ssize_t parport_device_id (int devnum, char *buffer, size_t count)
133{ 256{
134 ssize_t retval = -ENXIO; 257 ssize_t retval = -ENXIO;
135 struct pardevice *dev = parport_open (devnum, "Device ID probe", 258 struct pardevice *dev = parport_open (devnum, "Device ID probe",
@@ -139,76 +262,20 @@ ssize_t parport_device_id (int devnum, char *buffer, size_t len)
139 262
140 parport_claim_or_block (dev); 263 parport_claim_or_block (dev);
141 264
142 /* Negotiate to compatibility mode, and then to device ID mode. 265 /* Negotiate to compatibility mode, and then to device ID
 143 * (This is in case we are already in device ID mode.) */ 266 * mode. (This is so that we start from the beginning of the
 267 * device ID if already in device ID mode.) */
144 parport_negotiate (dev->port, IEEE1284_MODE_COMPAT); 268 parport_negotiate (dev->port, IEEE1284_MODE_COMPAT);
145 retval = parport_negotiate (dev->port, 269 retval = parport_negotiate (dev->port,
146 IEEE1284_MODE_NIBBLE | IEEE1284_DEVICEID); 270 IEEE1284_MODE_NIBBLE | IEEE1284_DEVICEID);
147 271
148 if (!retval) { 272 if (!retval) {
149 int idlen; 273 retval = parport_read_device_id (dev->port, buffer, count);
150 unsigned char length[2];
151
152 /* First two bytes are MSB,LSB of inclusive length. */
153 retval = parport_read (dev->port, length, 2);
154
155 if (retval != 2) goto end_id;
156
157 idlen = (length[0] << 8) + length[1] - 2;
158 /*
159 * Check if the caller-allocated buffer is large enough
160 * otherwise bail out or there will be an at least off by one.
161 */
162 if (idlen + 1 < len)
163 len = idlen;
164 else {
165 retval = -EINVAL;
166 goto out;
167 }
168 retval = parport_read (dev->port, buffer, len);
169
170 if (retval != len)
171 printk (KERN_DEBUG "%s: only read %Zd of %Zd ID bytes\n",
172 dev->port->name, retval,
173 len);
174
175 /* Some printer manufacturers mistakenly believe that
176 the length field is supposed to be _exclusive_.
177 In addition, there are broken devices out there
178 that don't even finish off with a semi-colon. */
179 if (buffer[len - 1] != ';') {
180 ssize_t diff;
181 diff = parport_read (dev->port, buffer + len, 2);
182 retval += diff;
183
184 if (diff)
185 printk (KERN_DEBUG
186 "%s: device reported incorrect "
187 "length field (%d, should be %Zd)\n",
188 dev->port->name, idlen, retval);
189 else {
190 /* One semi-colon short of a device ID. */
191 buffer[len++] = ';';
192 printk (KERN_DEBUG "%s: faking semi-colon\n",
193 dev->port->name);
194
195 /* If we get here, I don't think we
196 need to worry about the possible
197 standard violation of having read
198 more than we were told to. The
199 device is non-compliant anyhow. */
200 }
201 }
202
203 end_id:
204 buffer[len] = '\0';
205 parport_negotiate (dev->port, IEEE1284_MODE_COMPAT); 274 parport_negotiate (dev->port, IEEE1284_MODE_COMPAT);
275 if (retval > 2)
276 parse_data (dev->port, dev->daisy, buffer+2);
206 } 277 }
207 278
208 if (retval > 2)
209 parse_data (dev->port, dev->daisy, buffer);
210
211out:
212 parport_release (dev); 279 parport_release (dev);
213 parport_close (dev); 280 parport_close (dev);
214 return retval; 281 return retval;
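
After this refactor the whole Device ID transaction reduces to negotiate, read, renegotiate, parse. A condensed view of the control flow, assembled from the code above (error handling elided):

    parport_negotiate(dev->port, IEEE1284_MODE_COMPAT);      /* restart */
    retval = parport_negotiate(dev->port,
                               IEEE1284_MODE_NIBBLE | IEEE1284_DEVICEID);
    if (!retval) {
            retval = parport_read_device_id(dev->port, buffer, count);
            parport_negotiate(dev->port, IEEE1284_MODE_COMPAT);
            if (retval > 2)            /* skip the two length bytes */
                    parse_data(dev->port, dev->daisy, buffer + 2);
    }
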
diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index 9cb3ab156b09..ea62bed6bc83 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -1002,6 +1002,7 @@ EXPORT_SYMBOL(parport_register_driver);
1002EXPORT_SYMBOL(parport_unregister_driver); 1002EXPORT_SYMBOL(parport_unregister_driver);
1003EXPORT_SYMBOL(parport_register_device); 1003EXPORT_SYMBOL(parport_register_device);
1004EXPORT_SYMBOL(parport_unregister_device); 1004EXPORT_SYMBOL(parport_unregister_device);
1005EXPORT_SYMBOL(parport_get_port);
1005EXPORT_SYMBOL(parport_put_port); 1006EXPORT_SYMBOL(parport_put_port);
1006EXPORT_SYMBOL(parport_find_number); 1007EXPORT_SYMBOL(parport_find_number);
1007EXPORT_SYMBOL(parport_find_base); 1008EXPORT_SYMBOL(parport_find_base);
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 6b7583f497d0..a1f0b0ba2bfe 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -31,15 +31,6 @@ static struct {
31} pnp_bios_callpoint; 31} pnp_bios_callpoint;
32 32
33 33
34/* The PnP BIOS entries in the GDT */
35#define PNP_GDT (GDT_ENTRY_PNPBIOS_BASE * 8)
36
37#define PNP_CS32 (PNP_GDT+0x00) /* segment for calling fn */
38#define PNP_CS16 (PNP_GDT+0x08) /* code segment for BIOS */
39#define PNP_DS (PNP_GDT+0x10) /* data segment for BIOS */
40#define PNP_TS1 (PNP_GDT+0x18) /* transfer data segment */
41#define PNP_TS2 (PNP_GDT+0x20) /* another data segment */
42
43/* 34/*
44 * These are some opcodes for a "static asmlinkage" 35 * These are some opcodes for a "static asmlinkage"
45 * As this code is *not* executed inside the linux kernel segment, but in a 36 * As this code is *not* executed inside the linux kernel segment, but in a
@@ -67,16 +58,11 @@ __asm__(
67 ".previous \n" 58 ".previous \n"
68); 59);
69 60
70#define Q_SET_SEL(cpu, selname, address, size) \
71do { \
72set_base(per_cpu(cpu_gdt_table,cpu)[(selname) >> 3], __va((u32)(address))); \
73set_limit(per_cpu(cpu_gdt_table,cpu)[(selname) >> 3], size); \
74} while(0)
75
76#define Q2_SET_SEL(cpu, selname, address, size) \ 61#define Q2_SET_SEL(cpu, selname, address, size) \
77do { \ 62do { \
78set_base(per_cpu(cpu_gdt_table,cpu)[(selname) >> 3], (u32)(address)); \ 63struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
79set_limit(per_cpu(cpu_gdt_table,cpu)[(selname) >> 3], size); \ 64set_base(gdt[(selname) >> 3], (u32)(address)); \
65set_limit(gdt[(selname) >> 3], size); \
80} while(0) 66} while(0)
81 67
82static struct desc_struct bad_bios_desc = { 0, 0x00409200 }; 68static struct desc_struct bad_bios_desc = { 0, 0x00409200 };
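
The rewritten Q2_SET_SEL fetches the per-CPU GDT once via get_cpu_gdt_table() instead of open-coding per_cpu(cpu_gdt_table, cpu) at every use. The same idiom as a small helper, a sketch using only the calls shown in this hunk:

    /* Point GDT selector 'sel' at 'addr' with the given limit on 'cpu'. */
    static void pnp_set_sel(int cpu, int sel, void *addr, unsigned int size)
    {
            struct desc_struct *gdt = get_cpu_gdt_table(cpu);

            set_base(gdt[sel >> 3], (u32)addr);   /* selector -> GDT index */
            set_limit(gdt[sel >> 3], size);
    }
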
@@ -115,8 +101,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
115 return PNP_FUNCTION_NOT_SUPPORTED; 101 return PNP_FUNCTION_NOT_SUPPORTED;
116 102
117 cpu = get_cpu(); 103 cpu = get_cpu();
118 save_desc_40 = per_cpu(cpu_gdt_table,cpu)[0x40 / 8]; 104 save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8];
119 per_cpu(cpu_gdt_table,cpu)[0x40 / 8] = bad_bios_desc; 105 get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc;
120 106
121 /* On some boxes IRQ's during PnP BIOS calls are deadly. */ 107 /* On some boxes IRQ's during PnP BIOS calls are deadly. */
122 spin_lock_irqsave(&pnp_bios_lock, flags); 108 spin_lock_irqsave(&pnp_bios_lock, flags);
@@ -158,7 +144,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
158 ); 144 );
159 spin_unlock_irqrestore(&pnp_bios_lock, flags); 145 spin_unlock_irqrestore(&pnp_bios_lock, flags);
160 146
161 per_cpu(cpu_gdt_table,cpu)[0x40 / 8] = save_desc_40; 147 get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40;
162 put_cpu(); 148 put_cpu();
163 149
164 /* If we get here and this is set then the PnP BIOS faulted on us. */ 150 /* If we get here and this is set then the PnP BIOS faulted on us. */
@@ -290,12 +276,15 @@ int pnp_bios_dev_node_info(struct pnp_dev_node_info *data)
290static int __pnp_bios_get_dev_node(u8 *nodenum, char boot, struct pnp_bios_node *data) 276static int __pnp_bios_get_dev_node(u8 *nodenum, char boot, struct pnp_bios_node *data)
291{ 277{
292 u16 status; 278 u16 status;
279 u16 tmp_nodenum;
293 if (!pnp_bios_present()) 280 if (!pnp_bios_present())
294 return PNP_FUNCTION_NOT_SUPPORTED; 281 return PNP_FUNCTION_NOT_SUPPORTED;
295 if ( !boot && pnpbios_dont_use_current_config ) 282 if ( !boot && pnpbios_dont_use_current_config )
296 return PNP_FUNCTION_NOT_SUPPORTED; 283 return PNP_FUNCTION_NOT_SUPPORTED;
284 tmp_nodenum = *nodenum;
297 status = call_pnp_bios(PNP_GET_SYS_DEV_NODE, 0, PNP_TS1, 0, PNP_TS2, boot ? 2 : 1, PNP_DS, 0, 285 status = call_pnp_bios(PNP_GET_SYS_DEV_NODE, 0, PNP_TS1, 0, PNP_TS2, boot ? 2 : 1, PNP_DS, 0,
298 nodenum, sizeof(char), data, 65536); 286 &tmp_nodenum, sizeof(tmp_nodenum), data, 65536);
287 *nodenum = tmp_nodenum;
299 return status; 288 return status;
300} 289}
301 290
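
The tmp_nodenum dance above exists because the BIOS writes a full 16-bit node number while the caller only owns a u8. The widen-then-copy-back pattern in isolation (user-space sketch; firmware_get_node is a hypothetical stand-in):

    #include <stdio.h>

    typedef unsigned char  u8;
    typedef unsigned short u16;

    /* Stand-in for a firmware call that always stores 16 bits. */
    static void firmware_get_node(u16 *node) { *node = 0x1234; }

    int main(void)
    {
            u8  nodenum = 0xff;
            u16 tmp = nodenum;        /* widen: firmware writes 2 bytes */

            firmware_get_node(&tmp);  /* safe: tmp really is 16 bits    */
            nodenum = (u8)tmp;        /* hand back only the byte owned  */
            printf("%#x\n", nodenum);
            return 0;
    }
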
@@ -535,10 +524,12 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
535 524
536 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); 525 set_base(bad_bios_desc, __va((unsigned long)0x40 << 4));
537 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); 526 _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4));
538 for(i=0; i < NR_CPUS; i++) 527 for (i = 0; i < NR_CPUS; i++) {
539 { 528 struct desc_struct *gdt = get_cpu_gdt_table(i);
540 Q2_SET_SEL(i, PNP_CS32, &pnp_bios_callfunc, 64 * 1024); 529 if (!gdt)
541 Q_SET_SEL(i, PNP_CS16, header->fields.pm16cseg, 64 * 1024); 530 continue;
542 Q_SET_SEL(i, PNP_DS, header->fields.pm16dseg, 64 * 1024); 531 set_base(gdt[GDT_ENTRY_PNPBIOS_CS32], &pnp_bios_callfunc);
543 } 532 set_base(gdt[GDT_ENTRY_PNPBIOS_CS16], __va(header->fields.pm16cseg));
533 set_base(gdt[GDT_ENTRY_PNPBIOS_DS], __va(header->fields.pm16dseg));
534 }
544} 535}
diff --git a/drivers/s390/Makefile b/drivers/s390/Makefile
index c99a2fe92fb0..9803c9352d78 100644
--- a/drivers/s390/Makefile
+++ b/drivers/s390/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the S/390 specific device drivers 2# Makefile for the S/390 specific device drivers
3# 3#
4 4
5obj-y += s390mach.o sysinfo.o 5obj-y += s390mach.o sysinfo.o s390_rdev.o
6obj-y += cio/ block/ char/ crypto/ net/ scsi/ 6obj-y += cio/ block/ char/ crypto/ net/ scsi/
7 7
8drivers-y += drivers/s390/built-in.o 8drivers-y += drivers/s390/built-in.o
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 6e7d7b06421d..6f50cc9323d9 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -1,11 +1,11 @@
1if ARCH_S390 1if S390
2 2
3comment "S/390 block device drivers" 3comment "S/390 block device drivers"
4 depends on ARCH_S390 4 depends on S390
5 5
6config BLK_DEV_XPRAM 6config BLK_DEV_XPRAM
7 tristate "XPRAM disk support" 7 tristate "XPRAM disk support"
8 depends on ARCH_S390 8 depends on S390
9 help 9 help
10 Select this option if you want to use your expanded storage on S/390 10 Select this option if you want to use your expanded storage on S/390
11 or zSeries as a disk. This is useful as a _fast_ swap device if you 11 or zSeries as a disk. This is useful as a _fast_ swap device if you
@@ -49,7 +49,7 @@ config DASD_FBA
49 49
50config DASD_DIAG 50config DASD_DIAG
51 tristate "Support for DIAG access to Disks" 51 tristate "Support for DIAG access to Disks"
52 depends on DASD && ( ARCH_S390X = 'n' || EXPERIMENTAL) 52 depends on DASD && ( 64BIT = 'n' || EXPERIMENTAL)
53 help 53 help
54 Select this option if you want to use Diagnose250 command to access 54 Select this option if you want to use Diagnose250 command to access
55 Disks under VM. If you are not running under VM or unsure what it is, 55 Disks under VM. If you are not running under VM or unsure what it is,
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 7008d32433bf..f779f674dfa0 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -7,7 +7,7 @@
7 * Bugreports.to..: <Linux390@de.ibm.com> 7 * Bugreports.to..: <Linux390@de.ibm.com>
8 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999-2001 8 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999-2001
9 * 9 *
10 * $Revision: 1.167 $ 10 * $Revision: 1.172 $
11 */ 11 */
12 12
13#include <linux/config.h> 13#include <linux/config.h>
@@ -604,7 +604,7 @@ dasd_smalloc_request(char *magic, int cplength, int datasize,
604void 604void
605dasd_kfree_request(struct dasd_ccw_req * cqr, struct dasd_device * device) 605dasd_kfree_request(struct dasd_ccw_req * cqr, struct dasd_device * device)
606{ 606{
607#ifdef CONFIG_ARCH_S390X 607#ifdef CONFIG_64BIT
608 struct ccw1 *ccw; 608 struct ccw1 *ccw;
609 609
610 /* Clear any idals used for the request. */ 610 /* Clear any idals used for the request. */
@@ -1035,7 +1035,7 @@ dasd_end_request(struct request *req, int uptodate)
1035 if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) 1035 if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
1036 BUG(); 1036 BUG();
1037 add_disk_randomness(req->rq_disk); 1037 add_disk_randomness(req->rq_disk);
1038 end_that_request_last(req); 1038 end_that_request_last(req, uptodate);
1039} 1039}
1040 1040
1041/* 1041/*
@@ -1224,6 +1224,12 @@ __dasd_start_head(struct dasd_device * device)
1224 if (list_empty(&device->ccw_queue)) 1224 if (list_empty(&device->ccw_queue))
1225 return; 1225 return;
1226 cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, list); 1226 cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, list);
1227 /* check FAILFAST */
1228 if (device->stopped & ~DASD_STOPPED_PENDING &&
1229 test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags)) {
1230 cqr->status = DASD_CQR_FAILED;
1231 dasd_schedule_bh(device);
1232 }
1227 if ((cqr->status == DASD_CQR_QUEUED) && 1233 if ((cqr->status == DASD_CQR_QUEUED) &&
1228 (!device->stopped)) { 1234 (!device->stopped)) {
1229 /* try to start the first I/O that can be started */ 1235 /* try to start the first I/O that can be started */
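
This check is the consumer half of the new failfast support; the producer half appears in the dasd_diag/eckd/fba build_cp hunks further down. Condensed from those hunks:

    /* Producer (build_cp): carry the block layer's failfast hint over. */
    if (req->flags & REQ_FAILFAST)
            set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);

    /* Consumer (__dasd_start_head): fail the queued request at once if
     * the device is stopped for anything but a pending state change. */
    if (device->stopped & ~DASD_STOPPED_PENDING &&
        test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags)) {
            cqr->status = DASD_CQR_FAILED;
            dasd_schedule_bh(device);
    }
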
@@ -1323,7 +1329,7 @@ void
1323dasd_schedule_bh(struct dasd_device * device) 1329dasd_schedule_bh(struct dasd_device * device)
1324{ 1330{
1325 /* Protect against rescheduling. */ 1331 /* Protect against rescheduling. */
1326 if (atomic_compare_and_swap (0, 1, &device->tasklet_scheduled)) 1332 if (atomic_cmpxchg (&device->tasklet_scheduled, 0, 1) != 0)
1327 return; 1333 return;
1328 dasd_get_device(device); 1334 dasd_get_device(device);
1329 tasklet_hi_schedule(&device->tasklet); 1335 tasklet_hi_schedule(&device->tasklet);
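
The s390-private atomic_compare_and_swap() gives way throughout this merge to the generic atomic_cmpxchg(), which returns the value the atomic held before the exchange. A standalone sketch of the once-only guard, using a GCC builtin to stand in for the kernel helper:

    #include <stdio.h>

    static int scheduled;   /* stand-in for device->tasklet_scheduled */

    /* Returns 1 exactly once, even under concurrent callers. */
    static int schedule_once(void)
    {
            /* kernel form: if (atomic_cmpxchg(&v, 0, 1) != 0) return; */
            if (__sync_val_compare_and_swap(&scheduled, 0, 1) != 0)
                    return 0;   /* lost the race; already scheduled */
            return 1;           /* won the race; schedule the tasklet */
    }

    int main(void)
    {
            printf("%d %d\n", schedule_once(), schedule_once()); /* 1 0 */
            return 0;
    }
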
@@ -1750,8 +1756,10 @@ dasd_exit(void)
1750 * SECTION: common functions for ccw_driver use 1756 * SECTION: common functions for ccw_driver use
1751 */ 1757 */
1752 1758
1753/* initial attempt at a probe function. this can be simplified once 1759/*
1754 * the other detection code is gone */ 1760 * Initial attempt at a probe function. This can be simplified once
1761 * the other detection code is gone.
1762 */
1755int 1763int
1756dasd_generic_probe (struct ccw_device *cdev, 1764dasd_generic_probe (struct ccw_device *cdev,
1757 struct dasd_discipline *discipline) 1765 struct dasd_discipline *discipline)
@@ -1770,8 +1778,10 @@ dasd_generic_probe (struct ccw_device *cdev,
1770 return ret; 1778 return ret;
1771} 1779}
1772 1780
1773/* this will one day be called from a global not_oper handler. 1781/*
1774 * It is also used by driver_unregister during module unload */ 1782 * This will one day be called from a global not_oper handler.
1783 * It is also used by driver_unregister during module unload.
1784 */
1775void 1785void
1776dasd_generic_remove (struct ccw_device *cdev) 1786dasd_generic_remove (struct ccw_device *cdev)
1777{ 1787{
@@ -1798,9 +1808,11 @@ dasd_generic_remove (struct ccw_device *cdev)
1798 dasd_delete_device(device); 1808 dasd_delete_device(device);
1799} 1809}
1800 1810
1801/* activate a device. This is called from dasd_{eckd,fba}_probe() when either 1811/*
1812 * Activate a device. This is called from dasd_{eckd,fba}_probe() when either
1802 * the device is detected for the first time and is supposed to be used 1813 * the device is detected for the first time and is supposed to be used
1803 * or the user has started activation through sysfs */ 1814 * or the user has started activation through sysfs.
1815 */
1804int 1816int
1805dasd_generic_set_online (struct ccw_device *cdev, 1817dasd_generic_set_online (struct ccw_device *cdev,
1806 struct dasd_discipline *discipline) 1818 struct dasd_discipline *discipline)
@@ -1917,7 +1929,6 @@ dasd_generic_notify(struct ccw_device *cdev, int event)
1917 if (cqr->status == DASD_CQR_IN_IO) 1929 if (cqr->status == DASD_CQR_IN_IO)
1918 cqr->status = DASD_CQR_FAILED; 1930 cqr->status = DASD_CQR_FAILED;
1919 device->stopped |= DASD_STOPPED_DC_EIO; 1931 device->stopped |= DASD_STOPPED_DC_EIO;
1920 dasd_schedule_bh(device);
1921 } else { 1932 } else {
1922 list_for_each_entry(cqr, &device->ccw_queue, list) 1933 list_for_each_entry(cqr, &device->ccw_queue, list)
1923 if (cqr->status == DASD_CQR_IN_IO) { 1934 if (cqr->status == DASD_CQR_IN_IO) {
@@ -1927,6 +1938,7 @@ dasd_generic_notify(struct ccw_device *cdev, int event)
1927 device->stopped |= DASD_STOPPED_DC_WAIT; 1938 device->stopped |= DASD_STOPPED_DC_WAIT;
1928 dasd_set_timer(device, 0); 1939 dasd_set_timer(device, 0);
1929 } 1940 }
1941 dasd_schedule_bh(device);
1930 ret = 1; 1942 ret = 1;
1931 break; 1943 break;
1932 case CIO_OPER: 1944 case CIO_OPER:
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index ab8754e566bc..ba80fdea7ebf 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -6,7 +6,7 @@
6 * Bugreports.to..: <Linux390@de.ibm.com> 6 * Bugreports.to..: <Linux390@de.ibm.com>
7 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 7 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
8 * 8 *
9 * $Revision: 1.51 $ 9 * $Revision: 1.53 $
10 */ 10 */
11 11
12#include <linux/config.h> 12#include <linux/config.h>
@@ -25,6 +25,7 @@
25#include <asm/io.h> 25#include <asm/io.h>
26#include <asm/s390_ext.h> 26#include <asm/s390_ext.h>
27#include <asm/todclk.h> 27#include <asm/todclk.h>
28#include <asm/vtoc.h>
28 29
29#include "dasd_int.h" 30#include "dasd_int.h"
30#include "dasd_diag.h" 31#include "dasd_diag.h"
@@ -74,7 +75,7 @@ dia250(void *iob, int cmd)
74 int rc; 75 int rc;
75 76
76 __asm__ __volatile__( 77 __asm__ __volatile__(
77#ifdef CONFIG_ARCH_S390X 78#ifdef CONFIG_64BIT
78 " lghi %0,3\n" 79 " lghi %0,3\n"
79 " lgr 0,%3\n" 80 " lgr 0,%3\n"
80 " diag 0,%2,0x250\n" 81 " diag 0,%2,0x250\n"
@@ -329,7 +330,7 @@ dasd_diag_check_device(struct dasd_device *device)
329 struct dasd_diag_private *private; 330 struct dasd_diag_private *private;
330 struct dasd_diag_characteristics *rdc_data; 331 struct dasd_diag_characteristics *rdc_data;
331 struct dasd_diag_bio bio; 332 struct dasd_diag_bio bio;
332 struct dasd_diag_cms_label *label; 333 struct vtoc_cms_label *label;
333 blocknum_t end_block; 334 blocknum_t end_block;
334 unsigned int sb, bsize; 335 unsigned int sb, bsize;
335 int rc; 336 int rc;
@@ -380,7 +381,7 @@ dasd_diag_check_device(struct dasd_device *device)
380 mdsk_term_io(device); 381 mdsk_term_io(device);
381 382
382 /* figure out blocksize of device */ 383 /* figure out blocksize of device */
383 label = (struct dasd_diag_cms_label *) get_zeroed_page(GFP_KERNEL); 384 label = (struct vtoc_cms_label *) get_zeroed_page(GFP_KERNEL);
384 if (label == NULL) { 385 if (label == NULL) {
385 DEV_MESSAGE(KERN_WARNING, device, "%s", 386 DEV_MESSAGE(KERN_WARNING, device, "%s",
386 "No memory to allocate initialization request"); 387 "No memory to allocate initialization request");
@@ -548,6 +549,8 @@ dasd_diag_build_cp(struct dasd_device * device, struct request *req)
548 } 549 }
549 cqr->retries = DIAG_MAX_RETRIES; 550 cqr->retries = DIAG_MAX_RETRIES;
550 cqr->buildclk = get_clock(); 551 cqr->buildclk = get_clock();
552 if (req->flags & REQ_FAILFAST)
553 set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
551 cqr->device = device; 554 cqr->device = device;
552 cqr->expires = DIAG_TIMEOUT; 555 cqr->expires = DIAG_TIMEOUT;
553 cqr->status = DASD_CQR_FILLED; 556 cqr->status = DASD_CQR_FILLED;
diff --git a/drivers/s390/block/dasd_diag.h b/drivers/s390/block/dasd_diag.h
index df31484d73a7..a4f80bd735f1 100644
--- a/drivers/s390/block/dasd_diag.h
+++ b/drivers/s390/block/dasd_diag.h
@@ -6,7 +6,7 @@
6 * Bugreports.to..: <Linux390@de.ibm.com> 6 * Bugreports.to..: <Linux390@de.ibm.com>
7 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 7 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
8 * 8 *
9 * $Revision: 1.8 $ 9 * $Revision: 1.9 $
10 */ 10 */
11 11
12#define MDSK_WRITE_REQ 0x01 12#define MDSK_WRITE_REQ 0x01
@@ -44,29 +44,8 @@ struct dasd_diag_characteristics {
44 u8 rdev_features; 44 u8 rdev_features;
45} __attribute__ ((packed, aligned(4))); 45} __attribute__ ((packed, aligned(4)));
46 46
47struct dasd_diag_cms_label { 47
48 u8 label_id[4]; 48#ifdef CONFIG_64BIT
49 u8 vol_id[6];
50 u16 version_id;
51 u32 block_size;
52 u32 origin_ptr;
53 u32 usable_count;
54 u32 formatted_count;
55 u32 block_count;
56 u32 used_count;
57 u32 fst_size;
58 u32 fst_count;
59 u8 format_date[6];
60 u8 reserved1[2];
61 u32 disk_offset;
62 u32 map_block;
63 u32 hblk_disp;
64 u32 user_disp;
65 u8 reserved2[4];
66 u8 segment_name[8];
67} __attribute__ ((packed));
68
69#ifdef CONFIG_ARCH_S390X
70#define DASD_DIAG_FLAGA_DEFAULT DASD_DIAG_FLAGA_FORMAT_64BIT 49#define DASD_DIAG_FLAGA_DEFAULT DASD_DIAG_FLAGA_FORMAT_64BIT
71 50
72typedef u64 blocknum_t; 51typedef u64 blocknum_t;
@@ -107,7 +86,7 @@ struct dasd_diag_rw_io {
107 struct dasd_diag_bio *bio_list; 86 struct dasd_diag_bio *bio_list;
108 u8 spare4[8]; 87 u8 spare4[8];
109} __attribute__ ((packed, aligned(8))); 88} __attribute__ ((packed, aligned(8)));
110#else /* CONFIG_ARCH_S390X */ 89#else /* CONFIG_64BIT */
111#define DASD_DIAG_FLAGA_DEFAULT 0x0 90#define DASD_DIAG_FLAGA_DEFAULT 0x0
112 91
113typedef u32 blocknum_t; 92typedef u32 blocknum_t;
@@ -146,4 +125,4 @@ struct dasd_diag_rw_io {
146 u32 interrupt_params; 125 u32 interrupt_params;
147 u8 spare3[20]; 126 u8 spare3[20];
148} __attribute__ ((packed, aligned(8))); 127} __attribute__ ((packed, aligned(8)));
149#endif /* CONFIG_ARCH_S390X */ 128#endif /* CONFIG_64BIT */
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 811060e10c00..96eb48258580 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -7,7 +7,7 @@
7 * Bugreports.to..: <Linux390@de.ibm.com> 7 * Bugreports.to..: <Linux390@de.ibm.com>
8 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 8 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
9 * 9 *
10 * $Revision: 1.71 $ 10 * $Revision: 1.74 $
11 */ 11 */
12 12
13#include <linux/config.h> 13#include <linux/config.h>
@@ -1041,7 +1041,7 @@ dasd_eckd_build_cp(struct dasd_device * device, struct request *req)
1041 /* Eckd can only do full blocks. */ 1041 /* Eckd can only do full blocks. */
1042 return ERR_PTR(-EINVAL); 1042 return ERR_PTR(-EINVAL);
1043 count += bv->bv_len >> (device->s2b_shift + 9); 1043 count += bv->bv_len >> (device->s2b_shift + 9);
1044#if defined(CONFIG_ARCH_S390X) 1044#if defined(CONFIG_64BIT)
1045 if (idal_is_needed (page_address(bv->bv_page), 1045 if (idal_is_needed (page_address(bv->bv_page),
1046 bv->bv_len)) 1046 bv->bv_len))
1047 cidaw += bv->bv_len >> (device->s2b_shift + 9); 1047 cidaw += bv->bv_len >> (device->s2b_shift + 9);
@@ -1136,6 +1136,8 @@ dasd_eckd_build_cp(struct dasd_device * device, struct request *req)
1136 recid++; 1136 recid++;
1137 } 1137 }
1138 } 1138 }
1139 if (req->flags & REQ_FAILFAST)
1140 set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
1139 cqr->device = device; 1141 cqr->device = device;
1140 cqr->expires = 5 * 60 * HZ; /* 5 minutes */ 1142 cqr->expires = 5 * 60 * HZ; /* 5 minutes */
1141 cqr->lpm = private->path_data.ppm; 1143 cqr->lpm = private->path_data.ppm;
@@ -1252,6 +1254,7 @@ dasd_eckd_release(struct block_device *bdev, int no, long args)
1252 cqr->cpaddr->cda = (__u32)(addr_t) cqr->data; 1254 cqr->cpaddr->cda = (__u32)(addr_t) cqr->data;
1253 cqr->device = device; 1255 cqr->device = device;
1254 clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); 1256 clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
1257 set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
1255 cqr->retries = 0; 1258 cqr->retries = 0;
1256 cqr->expires = 2 * HZ; 1259 cqr->expires = 2 * HZ;
1257 cqr->buildclk = get_clock(); 1260 cqr->buildclk = get_clock();
@@ -1296,6 +1299,7 @@ dasd_eckd_reserve(struct block_device *bdev, int no, long args)
1296 cqr->cpaddr->cda = (__u32)(addr_t) cqr->data; 1299 cqr->cpaddr->cda = (__u32)(addr_t) cqr->data;
1297 cqr->device = device; 1300 cqr->device = device;
1298 clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); 1301 clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
1302 set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
1299 cqr->retries = 0; 1303 cqr->retries = 0;
1300 cqr->expires = 2 * HZ; 1304 cqr->expires = 2 * HZ;
1301 cqr->buildclk = get_clock(); 1305 cqr->buildclk = get_clock();
@@ -1339,6 +1343,7 @@ dasd_eckd_steal_lock(struct block_device *bdev, int no, long args)
1339 cqr->cpaddr->cda = (__u32)(addr_t) cqr->data; 1343 cqr->cpaddr->cda = (__u32)(addr_t) cqr->data;
1340 cqr->device = device; 1344 cqr->device = device;
1341 clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); 1345 clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
1346 set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
1342 cqr->retries = 0; 1347 cqr->retries = 0;
1343 cqr->expires = 2 * HZ; 1348 cqr->expires = 2 * HZ;
1344 cqr->buildclk = get_clock(); 1349 cqr->buildclk = get_clock();
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index 28cb4613b7f5..8ec75dc08e2c 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -4,7 +4,7 @@
4 * Bugreports.to..: <Linux390@de.ibm.com> 4 * Bugreports.to..: <Linux390@de.ibm.com>
5 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 5 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
6 * 6 *
7 * $Revision: 1.40 $ 7 * $Revision: 1.41 $
8 */ 8 */
9 9
10#include <linux/config.h> 10#include <linux/config.h>
@@ -271,7 +271,7 @@ dasd_fba_build_cp(struct dasd_device * device, struct request *req)
271 /* Fba can only do full blocks. */ 271 /* Fba can only do full blocks. */
272 return ERR_PTR(-EINVAL); 272 return ERR_PTR(-EINVAL);
273 count += bv->bv_len >> (device->s2b_shift + 9); 273 count += bv->bv_len >> (device->s2b_shift + 9);
274#if defined(CONFIG_ARCH_S390X) 274#if defined(CONFIG_64BIT)
275 if (idal_is_needed (page_address(bv->bv_page), 275 if (idal_is_needed (page_address(bv->bv_page),
276 bv->bv_len)) 276 bv->bv_len))
277 cidaw += bv->bv_len / blksize; 277 cidaw += bv->bv_len / blksize;
@@ -352,6 +352,8 @@ dasd_fba_build_cp(struct dasd_device * device, struct request *req)
352 recid++; 352 recid++;
353 } 353 }
354 } 354 }
355 if (req->flags & REQ_FAILFAST)
356 set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags);
355 cqr->device = device; 357 cqr->device = device;
356 cqr->expires = 5 * 60 * HZ; /* 5 minutes */ 358 cqr->expires = 5 * 60 * HZ; /* 5 minutes */
357 cqr->retries = 32; 359 cqr->retries = 32;
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index 9fab04f3056d..2fb05c4a528c 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -6,7 +6,7 @@
6 * Bugreports.to..: <Linux390@de.ibm.com> 6 * Bugreports.to..: <Linux390@de.ibm.com>
7 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000 7 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
8 * 8 *
9 * $Revision: 1.65 $ 9 * $Revision: 1.68 $
10 */ 10 */
11 11
12#ifndef DASD_INT_H 12#ifndef DASD_INT_H
@@ -208,6 +208,7 @@ struct dasd_ccw_req {
208 208
209/* per dasd_ccw_req flags */ 209/* per dasd_ccw_req flags */
210#define DASD_CQR_FLAGS_USE_ERP 0 /* use ERP for this request */ 210#define DASD_CQR_FLAGS_USE_ERP 0 /* use ERP for this request */
211#define DASD_CQR_FLAGS_FAILFAST 1 /* FAILFAST */
211 212
212/* Signature for error recovery functions. */ 213/* Signature for error recovery functions. */
213typedef struct dasd_ccw_req *(*dasd_erp_fn_t) (struct dasd_ccw_req *); 214typedef struct dasd_ccw_req *(*dasd_erp_fn_t) (struct dasd_ccw_req *);
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 789595b3fa09..044b75371990 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -7,7 +7,7 @@
7 * Bugreports.to..: <Linux390@de.ibm.com> 7 * Bugreports.to..: <Linux390@de.ibm.com>
8 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999-2001 8 * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999-2001
9 * 9 *
10 * $Revision: 1.47 $ 10 * $Revision: 1.50 $
11 * 11 *
12 * i/o controls for the dasd driver. 12 * i/o controls for the dasd driver.
13 */ 13 */
@@ -352,6 +352,9 @@ dasd_ioctl_read_profile(struct block_device *bdev, int no, long args)
352 if (device == NULL) 352 if (device == NULL)
353 return -ENODEV; 353 return -ENODEV;
354 354
355 if (dasd_profile_level == DASD_PROFILE_OFF)
356 return -EIO;
357
355 if (copy_to_user((long __user *) args, (long *) &device->profile, 358 if (copy_to_user((long __user *) args, (long *) &device->profile,
356 sizeof (struct dasd_profile_info_t))) 359 sizeof (struct dasd_profile_info_t)))
357 return -EFAULT; 360 return -EFAULT;
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 4fde41188996..2e727f49ad19 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -15,7 +15,7 @@
15#include <asm/io.h> 15#include <asm/io.h>
16#include <linux/completion.h> 16#include <linux/completion.h>
17#include <linux/interrupt.h> 17#include <linux/interrupt.h>
18#include <asm/ccwdev.h> // for s390_root_dev_(un)register() 18#include <asm/s390_rdev.h>
19 19
20//#define DCSSBLK_DEBUG /* Debug messages on/off */ 20//#define DCSSBLK_DEBUG /* Debug messages on/off */
21#define DCSSBLK_NAME "dcssblk" 21#define DCSSBLK_NAME "dcssblk"
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index d428c909b8a0..bf3a67c3cc5e 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -160,7 +160,7 @@ static int xpram_page_in (unsigned long page_addr, unsigned int xpage_index)
160 "0: ipm %0\n" 160 "0: ipm %0\n"
161 " srl %0,28\n" 161 " srl %0,28\n"
162 "1:\n" 162 "1:\n"
163#ifndef CONFIG_ARCH_S390X 163#ifndef CONFIG_64BIT
164 ".section __ex_table,\"a\"\n" 164 ".section __ex_table,\"a\"\n"
165 " .align 4\n" 165 " .align 4\n"
166 " .long 0b,1b\n" 166 " .long 0b,1b\n"
@@ -208,7 +208,7 @@ static long xpram_page_out (unsigned long page_addr, unsigned int xpage_index)
208 "0: ipm %0\n" 208 "0: ipm %0\n"
209 " srl %0,28\n" 209 " srl %0,28\n"
210 "1:\n" 210 "1:\n"
211#ifndef CONFIG_ARCH_S390X 211#ifndef CONFIG_64BIT
212 ".section __ex_table,\"a\"\n" 212 ".section __ex_table,\"a\"\n"
213 " .align 4\n" 213 " .align 4\n"
214 " .long 0b,1b\n" 214 " .long 0b,1b\n"
diff --git a/drivers/s390/char/sclp_cpi.c b/drivers/s390/char/sclp_cpi.c
index 5a6cef2dfa13..80f7f31310e6 100644
--- a/drivers/s390/char/sclp_cpi.c
+++ b/drivers/s390/char/sclp_cpi.c
@@ -204,7 +204,7 @@ cpi_module_init(void)
204 printk(KERN_WARNING "cpi: no control program identification " 204 printk(KERN_WARNING "cpi: no control program identification "
205 "support\n"); 205 "support\n");
206 sclp_unregister(&sclp_cpi_event); 206 sclp_unregister(&sclp_cpi_event);
207 return -ENOTSUPP; 207 return -EOPNOTSUPP;
208 } 208 }
209 209
210 req = cpi_prepare_req(); 210 req = cpi_prepare_req();
diff --git a/drivers/s390/char/sclp_quiesce.c b/drivers/s390/char/sclp_quiesce.c
index 83f75774df60..56fa69168898 100644
--- a/drivers/s390/char/sclp_quiesce.c
+++ b/drivers/s390/char/sclp_quiesce.c
@@ -32,7 +32,7 @@ do_load_quiesce_psw(void * __unused)
32 psw_t quiesce_psw; 32 psw_t quiesce_psw;
33 int cpu; 33 int cpu;
34 34
35 if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid)) 35 if (atomic_cmpxchg(&cpuid, -1, smp_processor_id()) != -1)
36 signal_processor(smp_processor_id(), sigp_stop); 36 signal_processor(smp_processor_id(), sigp_stop);
37 /* Wait for all other cpus to enter stopped state */ 37 /* Wait for all other cpus to enter stopped state */
38 for_each_online_cpu(cpu) { 38 for_each_online_cpu(cpu) {
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 1efc9f21229e..5ced2725d6c7 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -65,7 +65,7 @@ static void
65tapeblock_trigger_requeue(struct tape_device *device) 65tapeblock_trigger_requeue(struct tape_device *device)
66{ 66{
67 /* Protect against rescheduling. */ 67 /* Protect against rescheduling. */
68 if (atomic_compare_and_swap(0, 1, &device->blk_data.requeue_scheduled)) 68 if (atomic_cmpxchg(&device->blk_data.requeue_scheduled, 0, 1) != 0)
69 return; 69 return;
70 schedule_work(&device->blk_data.requeue_task); 70 schedule_work(&device->blk_data.requeue_task);
71} 71}
@@ -78,7 +78,7 @@ tapeblock_end_request(struct request *req, int uptodate)
78{ 78{
79 if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) 79 if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
80 BUG(); 80 BUG();
81 end_that_request_last(req); 81 end_that_request_last(req, uptodate);
82} 82}
83 83
84static void 84static void
diff --git a/drivers/s390/char/vmwatchdog.c b/drivers/s390/char/vmwatchdog.c
index 5473c23fcb52..5acc0ace3d7d 100644
--- a/drivers/s390/char/vmwatchdog.c
+++ b/drivers/s390/char/vmwatchdog.c
@@ -66,7 +66,7 @@ static int __diag288(enum vmwdt_func func, unsigned int timeout,
66 __cmdl = len; 66 __cmdl = len;
67 err = 0; 67 err = 0;
68 asm volatile ( 68 asm volatile (
69#ifdef __s390x__ 69#ifdef CONFIG_64BIT
70 "diag %2,%4,0x288\n" 70 "diag %2,%4,0x288\n"
71 "1: \n" 71 "1: \n"
72 ".section .fixup,\"ax\"\n" 72 ".section .fixup,\"ax\"\n"
diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index a1c52a682191..daf21e03b21d 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * drivers/s390/cio/blacklist.c 2 * drivers/s390/cio/blacklist.c
3 * S/390 common I/O routines -- blacklisting of specific devices 3 * S/390 common I/O routines -- blacklisting of specific devices
4 * $Revision: 1.35 $ 4 * $Revision: 1.39 $
5 * 5 *
6 * Copyright (C) 1999-2002 IBM Deutschland Entwicklung GmbH, 6 * Copyright (C) 1999-2002 IBM Deutschland Entwicklung GmbH,
7 * IBM Corporation 7 * IBM Corporation
@@ -15,6 +15,7 @@
15#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/seq_file.h>
18#include <linux/ctype.h> 19#include <linux/ctype.h>
19#include <linux/device.h> 20#include <linux/device.h>
20 21
@@ -34,10 +35,10 @@
34 * These can be single devices or ranges of devices 35 * These can be single devices or ranges of devices
35 */ 36 */
36 37
37/* 65536 bits to indicate if a devno is blacklisted or not */ 38/* 65536 bits for each set to indicate if a devno is blacklisted or not */
38#define __BL_DEV_WORDS ((__MAX_SUBCHANNELS + (8*sizeof(long) - 1)) / \ 39#define __BL_DEV_WORDS ((__MAX_SUBCHANNEL + (8*sizeof(long) - 1)) / \
39 (8*sizeof(long))) 40 (8*sizeof(long)))
40static unsigned long bl_dev[__BL_DEV_WORDS]; 41static unsigned long bl_dev[__MAX_SSID + 1][__BL_DEV_WORDS];
41typedef enum {add, free} range_action; 42typedef enum {add, free} range_action;
42 43
43/* 44/*
@@ -45,21 +46,23 @@ typedef enum {add, free} range_action;
45 * (Un-)blacklist the devices from-to 46 * (Un-)blacklist the devices from-to
46 */ 47 */
47static inline void 48static inline void
48blacklist_range (range_action action, unsigned int from, unsigned int to) 49blacklist_range (range_action action, unsigned int from, unsigned int to,
50 unsigned int ssid)
49{ 51{
50 if (!to) 52 if (!to)
51 to = from; 53 to = from;
52 54
53 if (from > to || to > __MAX_SUBCHANNELS) { 55 if (from > to || to > __MAX_SUBCHANNEL || ssid > __MAX_SSID) {
54 printk (KERN_WARNING "Invalid blacklist range " 56 printk (KERN_WARNING "Invalid blacklist range "
55 "0x%04x to 0x%04x, skipping\n", from, to); 57 "0.%x.%04x to 0.%x.%04x, skipping\n",
58 ssid, from, ssid, to);
56 return; 59 return;
57 } 60 }
58 for (; from <= to; from++) { 61 for (; from <= to; from++) {
59 if (action == add) 62 if (action == add)
60 set_bit (from, bl_dev); 63 set_bit (from, bl_dev[ssid]);
61 else 64 else
62 clear_bit (from, bl_dev); 65 clear_bit (from, bl_dev[ssid]);
63 } 66 }
64} 67}
65 68
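
bl_dev grows a subchannel-set dimension here: one 65536-bit map per SSID rather than a single global map. The shape of the lookup, with plain C bit arithmetic standing in for the kernel's set_bit/test_bit, and assuming __MAX_SSID == 3 as the two-bit ssid field elsewhere in this merge suggests (sketch only):

    #define MAX_SSID  3
    #define BITS_PER_LONG_ (8 * sizeof(unsigned long))
    #define BL_WORDS  ((0xffff + 1 + BITS_PER_LONG_ - 1) / BITS_PER_LONG_)

    static unsigned long bl_dev[MAX_SSID + 1][BL_WORDS];

    static void bl_set(unsigned int ssid, unsigned int devno)
    {
            bl_dev[ssid][devno / BITS_PER_LONG_] |=
                    1UL << (devno % BITS_PER_LONG_);
    }

    static int bl_test(unsigned int ssid, unsigned int devno)
    {
            return (bl_dev[ssid][devno / BITS_PER_LONG_]
                    >> (devno % BITS_PER_LONG_)) & 1;
    }
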
@@ -69,7 +72,7 @@ blacklist_range (range_action action, unsigned int from, unsigned int to)
69 * Shamelessly grabbed from dasd_devmap.c. 72 * Shamelessly grabbed from dasd_devmap.c.
70 */ 73 */
71static inline int 74static inline int
72blacklist_busid(char **str, int *id0, int *id1, int *devno) 75blacklist_busid(char **str, int *id0, int *ssid, int *devno)
73{ 76{
74 int val, old_style; 77 int val, old_style;
75 char *sav; 78 char *sav;
@@ -86,7 +89,7 @@ blacklist_busid(char **str, int *id0, int *id1, int *devno)
86 goto confused; 89 goto confused;
87 val = simple_strtoul(*str, str, 16); 90 val = simple_strtoul(*str, str, 16);
88 if (old_style || (*str)[0] != '.') { 91 if (old_style || (*str)[0] != '.') {
89 *id0 = *id1 = 0; 92 *id0 = *ssid = 0;
90 if (val < 0 || val > 0xffff) 93 if (val < 0 || val > 0xffff)
91 goto confused; 94 goto confused;
92 *devno = val; 95 *devno = val;
@@ -105,7 +108,7 @@ blacklist_busid(char **str, int *id0, int *id1, int *devno)
105 val = simple_strtoul(*str, str, 16); 108 val = simple_strtoul(*str, str, 16);
106 if (val < 0 || val > 0xff || (*str)++[0] != '.') 109 if (val < 0 || val > 0xff || (*str)++[0] != '.')
107 goto confused; 110 goto confused;
108 *id1 = val; 111 *ssid = val;
109 if (!isxdigit((*str)[0])) /* We require at least one hex digit */ 112 if (!isxdigit((*str)[0])) /* We require at least one hex digit */
110 goto confused; 113 goto confused;
111 val = simple_strtoul(*str, str, 16); 114 val = simple_strtoul(*str, str, 16);
@@ -125,7 +128,7 @@ confused:
125static inline int 128static inline int
126blacklist_parse_parameters (char *str, range_action action) 129blacklist_parse_parameters (char *str, range_action action)
127{ 130{
128 unsigned int from, to, from_id0, to_id0, from_id1, to_id1; 131 unsigned int from, to, from_id0, to_id0, from_ssid, to_ssid;
129 132
130 while (*str != 0 && *str != '\n') { 133 while (*str != 0 && *str != '\n') {
131 range_action ra = action; 134 range_action ra = action;
@@ -142,23 +145,25 @@ blacklist_parse_parameters (char *str, range_action action)
142 */ 145 */
143 if (strncmp(str,"all,",4) == 0 || strcmp(str,"all") == 0 || 146 if (strncmp(str,"all,",4) == 0 || strcmp(str,"all") == 0 ||
144 strncmp(str,"all\n",4) == 0 || strncmp(str,"all ",4) == 0) { 147 strncmp(str,"all\n",4) == 0 || strncmp(str,"all ",4) == 0) {
145 from = 0; 148 int j;
146 to = __MAX_SUBCHANNELS; 149
147 str += 3; 150 str += 3;
151 for (j=0; j <= __MAX_SSID; j++)
152 blacklist_range(ra, 0, __MAX_SUBCHANNEL, j);
148 } else { 153 } else {
149 int rc; 154 int rc;
150 155
151 rc = blacklist_busid(&str, &from_id0, 156 rc = blacklist_busid(&str, &from_id0,
152 &from_id1, &from); 157 &from_ssid, &from);
153 if (rc) 158 if (rc)
154 continue; 159 continue;
155 to = from; 160 to = from;
156 to_id0 = from_id0; 161 to_id0 = from_id0;
157 to_id1 = from_id1; 162 to_ssid = from_ssid;
158 if (*str == '-') { 163 if (*str == '-') {
159 str++; 164 str++;
160 rc = blacklist_busid(&str, &to_id0, 165 rc = blacklist_busid(&str, &to_id0,
161 &to_id1, &to); 166 &to_ssid, &to);
162 if (rc) 167 if (rc)
163 continue; 168 continue;
164 } 169 }
@@ -168,18 +173,19 @@ blacklist_parse_parameters (char *str, range_action action)
168 strsep(&str, ",\n")); 173 strsep(&str, ",\n"));
169 continue; 174 continue;
170 } 175 }
171 if ((from_id0 != to_id0) || (from_id1 != to_id1)) { 176 if ((from_id0 != to_id0) ||
177 (from_ssid != to_ssid)) {
172 printk(KERN_WARNING "invalid cio_ignore range " 178 printk(KERN_WARNING "invalid cio_ignore range "
173 "%x.%x.%04x-%x.%x.%04x\n", 179 "%x.%x.%04x-%x.%x.%04x\n",
174 from_id0, from_id1, from, 180 from_id0, from_ssid, from,
175 to_id0, to_id1, to); 181 to_id0, to_ssid, to);
176 continue; 182 continue;
177 } 183 }
184 pr_debug("blacklist_setup: adding range "
185 "from %x.%x.%04x to %x.%x.%04x\n",
186 from_id0, from_ssid, from, to_id0, to_ssid, to);
187 blacklist_range (ra, from, to, to_ssid);
178 } 188 }
179 /* FIXME: ignoring id0 and id1 here. */
180 pr_debug("blacklist_setup: adding range "
181 "from 0.0.%04x to 0.0.%04x\n", from, to);
182 blacklist_range (ra, from, to);
183 } 189 }
184 return 1; 190 return 1;
185} 191}
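
With blacklist_busid() now reporting an ssid, cio_ignore ranges are tracked per subchannel set. A user-space sketch of parsing the same 0.<ssid>.<devno> form with sscanf (illustrative only; the kernel parser above works differently):

    #include <stdio.h>

    /* Parse "0.1.4711"-style bus ids; returns 0 on success. */
    static int parse_busid(const char *s, unsigned int *ssid,
                           unsigned int *devno)
    {
            unsigned int id0;

            if (sscanf(s, "%x.%x.%x", &id0, ssid, devno) == 3)
                    return (id0 || *ssid > 3 || *devno > 0xffff) ? -1 : 0;
            if (sscanf(s, "%x", devno) == 1 && *devno <= 0xffff) {
                    *ssid = 0;              /* old-style bare devno */
                    return 0;
            }
            return -1;
    }

    int main(void)
    {
            unsigned int ssid, devno;

            if (!parse_busid("0.1.4711", &ssid, &devno))
                    printf("ssid=%x devno=%04x\n", ssid, devno);
            return 0;
    }
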
@@ -213,12 +219,33 @@ __setup ("cio_ignore=", blacklist_setup);
213 * Used by validate_subchannel() 219 * Used by validate_subchannel()
214 */ 220 */
215int 221int
216is_blacklisted (int devno) 222is_blacklisted (int ssid, int devno)
217{ 223{
218 return test_bit (devno, bl_dev); 224 return test_bit (devno, bl_dev[ssid]);
219} 225}
220 226
221#ifdef CONFIG_PROC_FS 227#ifdef CONFIG_PROC_FS
228static int
229__s390_redo_validation(struct subchannel_id schid, void *data)
230{
231 int ret;
232 struct subchannel *sch;
233
234 sch = get_subchannel_by_schid(schid);
235 if (sch) {
236 /* Already known. */
237 put_device(&sch->dev);
238 return 0;
239 }
240 ret = css_probe_device(schid);
241 if (ret == -ENXIO)
242 return ret; /* We're through. */
243 if (ret == -ENOMEM)
244 /* Stop validation for now. Bad, but no need for a panic. */
245 return ret;
246 return 0;
247}
248
222/* 249/*
223 * Function: s390_redo_validation 250 * Function: s390_redo_validation
224 * Look for no longer blacklisted devices 251 * Look for no longer blacklisted devices
@@ -226,29 +253,9 @@ is_blacklisted (int devno)
226static inline void 253static inline void
227s390_redo_validation (void) 254s390_redo_validation (void)
228{ 255{
229 unsigned int irq;
230
231 CIO_TRACE_EVENT (0, "redoval"); 256 CIO_TRACE_EVENT (0, "redoval");
232 for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) { 257
233 int ret; 258 for_each_subchannel(__s390_redo_validation, NULL);
234 struct subchannel *sch;
235
236 sch = get_subchannel_by_schid(irq);
237 if (sch) {
238 /* Already known. */
239 put_device(&sch->dev);
240 continue;
241 }
242 ret = css_probe_device(irq);
243 if (ret == -ENXIO)
244 break; /* We're through. */
245 if (ret == -ENOMEM)
246 /*
247 * Stop validation for now. Bad, but no need for a
248 * panic.
249 */
250 break;
251 }
252} 259}
253 260
254/* 261/*
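
The open-coded subchannel scan becomes a callback handed to for_each_subchannel(), which visits every subchannel id and stops as soon as the callback returns non-zero; that is why __s390_redo_validation returns -ENXIO to end the walk. A toy reimplementation of that contract (hypothetical; not the kernel helper itself):

    struct subchannel_id { unsigned int ssid, sch_no; };

    /* Visit every id until fn returns non-zero; return that value. */
    static int for_each_subchannel(int (*fn)(struct subchannel_id, void *),
                                   void *data)
    {
            struct subchannel_id schid;
            int ret;

            for (schid.ssid = 0; schid.ssid <= 3; schid.ssid++)
                    for (schid.sch_no = 0; schid.sch_no <= 0xffff;
                         schid.sch_no++) {
                            ret = fn(schid, data);
                            if (ret)
                                    return ret; /* callback said stop */
                    }
            return 0;
    }
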
@@ -278,41 +285,90 @@ blacklist_parse_proc_parameters (char *buf)
278 s390_redo_validation (); 285 s390_redo_validation ();
279} 286}
280 287
281/* FIXME: These should be real bus ids and not home-grown ones! */ 288/* Iterator struct for all devices. */
282static int cio_ignore_read (char *page, char **start, off_t off, 289struct ccwdev_iter {
283 int count, int *eof, void *data) 290 int devno;
291 int ssid;
292 int in_range;
293};
294
295static void *
296cio_ignore_proc_seq_start(struct seq_file *s, loff_t *offset)
284{ 297{
285 const unsigned int entry_size = 18; /* "0.0.ABCD-0.0.EFGH\n" */ 298 struct ccwdev_iter *iter;
286 long devno; 299
287 int len; 300 if (*offset >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1))
288 301 return NULL;
289 len = 0; 302 iter = kzalloc(sizeof(struct ccwdev_iter), GFP_KERNEL);
290 for (devno = off; /* abuse the page variable 303 if (!iter)
291 * as counter, see fs/proc/generic.c */ 304 return ERR_PTR(-ENOMEM);
292 devno < __MAX_SUBCHANNELS && len + entry_size < count; devno++) { 305 iter->ssid = *offset / (__MAX_SUBCHANNEL + 1);
293 if (!test_bit(devno, bl_dev)) 306 iter->devno = *offset % (__MAX_SUBCHANNEL + 1);
294 continue; 307 return iter;
295 len += sprintf(page + len, "0.0.%04lx", devno); 308}
296 if (test_bit(devno + 1, bl_dev)) { /* print range */ 309
297 while (++devno < __MAX_SUBCHANNELS) 310static void
298 if (!test_bit(devno, bl_dev)) 311cio_ignore_proc_seq_stop(struct seq_file *s, void *it)
299 break; 312{
300 len += sprintf(page + len, "-0.0.%04lx", --devno); 313 if (!IS_ERR(it))
301 } 314 kfree(it);
302 len += sprintf(page + len, "\n"); 315}
303 } 316
317static void *
318cio_ignore_proc_seq_next(struct seq_file *s, void *it, loff_t *offset)
319{
320 struct ccwdev_iter *iter;
321
322 if (*offset >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1))
323 return NULL;
324 iter = it;
325 if (iter->devno == __MAX_SUBCHANNEL) {
326 iter->devno = 0;
327 iter->ssid++;
328 if (iter->ssid > __MAX_SSID)
329 return NULL;
330 } else
331 iter->devno++;
332 (*offset)++;
333 return iter;
334}
304 335
305 if (devno < __MAX_SUBCHANNELS) 336static int
306 *eof = 1; 337cio_ignore_proc_seq_show(struct seq_file *s, void *it)
307 *start = (char *) (devno - off); /* number of checked entries */ 338{
308 return len; 339 struct ccwdev_iter *iter;
340
341 iter = it;
342 if (!is_blacklisted(iter->ssid, iter->devno))
343 /* Not blacklisted, nothing to output. */
344 return 0;
345 if (!iter->in_range) {
346 /* First device in range. */
347 if ((iter->devno == __MAX_SUBCHANNEL) ||
348 !is_blacklisted(iter->ssid, iter->devno + 1))
349 /* Singular device. */
350 return seq_printf(s, "0.%x.%04x\n",
351 iter->ssid, iter->devno);
352 iter->in_range = 1;
353 return seq_printf(s, "0.%x.%04x-", iter->ssid, iter->devno);
354 }
355 if ((iter->devno == __MAX_SUBCHANNEL) ||
356 !is_blacklisted(iter->ssid, iter->devno + 1)) {
357 /* Last device in range. */
358 iter->in_range = 0;
359 return seq_printf(s, "0.%x.%04x\n", iter->ssid, iter->devno);
360 }
361 return 0;
309} 362}
310 363
311static int cio_ignore_write(struct file *file, const char __user *user_buf, 364static ssize_t
312 unsigned long user_len, void *data) 365cio_ignore_write(struct file *file, const char __user *user_buf,
366 size_t user_len, loff_t *offset)
313{ 367{
314 char *buf; 368 char *buf;
315 369
370 if (*offset)
371 return -EINVAL;
316 if (user_len > 65536) 372 if (user_len > 65536)
317 user_len = 65536; 373 user_len = 65536;
318 buf = vmalloc (user_len + 1); /* maybe better use the stack? */ 374 buf = vmalloc (user_len + 1); /* maybe better use the stack? */
@@ -330,6 +386,27 @@ static int cio_ignore_write(struct file *file, const char __user *user_buf,
330 return user_len; 386 return user_len;
331} 387}
332 388
389static struct seq_operations cio_ignore_proc_seq_ops = {
390 .start = cio_ignore_proc_seq_start,
391 .stop = cio_ignore_proc_seq_stop,
392 .next = cio_ignore_proc_seq_next,
393 .show = cio_ignore_proc_seq_show,
394};
395
396static int
397cio_ignore_proc_open(struct inode *inode, struct file *file)
398{
399 return seq_open(file, &cio_ignore_proc_seq_ops);
400}
401
402static struct file_operations cio_ignore_proc_fops = {
403 .open = cio_ignore_proc_open,
404 .read = seq_read,
405 .llseek = seq_lseek,
406 .release = seq_release,
407 .write = cio_ignore_write,
408};
409
333static int 410static int
334cio_ignore_proc_init (void) 411cio_ignore_proc_init (void)
335{ 412{
@@ -340,8 +417,7 @@ cio_ignore_proc_init (void)
340 if (!entry) 417 if (!entry)
341 return 0; 418 return 0;
342 419
343 entry->read_proc = cio_ignore_read; 420 entry->proc_fops = &cio_ignore_proc_fops;
344 entry->write_proc = cio_ignore_write;
345 421
346 return 1; 422 return 1;
347} 423}
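
The proc interface above is the standard seq_file conversion: start/next/stop drive an iterator, show emits one record, and seq_open wires the whole thing into file_operations. Reduced to its skeleton (my_* names are placeholders for the four iterator callbacks):

    static struct seq_operations my_seq_ops = {
            .start = my_seq_start,  /* allocate/position the iterator */
            .next  = my_seq_next,   /* advance by one record          */
            .stop  = my_seq_stop,   /* free the iterator              */
            .show  = my_seq_show,   /* print the current record       */
    };

    static int my_proc_open(struct inode *inode, struct file *file)
    {
            return seq_open(file, &my_seq_ops);
    }

    static struct file_operations my_proc_fops = {
            .open    = my_proc_open,
            .read    = seq_read,    /* seq_file supplies read/llseek  */
            .llseek  = seq_lseek,
            .release = seq_release,
    };
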
diff --git a/drivers/s390/cio/blacklist.h b/drivers/s390/cio/blacklist.h
index fb42cafbe57c..95e25c1df922 100644
--- a/drivers/s390/cio/blacklist.h
+++ b/drivers/s390/cio/blacklist.h
@@ -1,6 +1,6 @@
1#ifndef S390_BLACKLIST_H 1#ifndef S390_BLACKLIST_H
2#define S390_BLACKLIST_H 2#define S390_BLACKLIST_H
3 3
4extern int is_blacklisted (int devno); 4extern int is_blacklisted (int ssid, int devno);
5 5
6#endif 6#endif
diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c
index be9d2d65c22f..e849289d4f3c 100644
--- a/drivers/s390/cio/ccwgroup.c
+++ b/drivers/s390/cio/ccwgroup.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * drivers/s390/cio/ccwgroup.c 2 * drivers/s390/cio/ccwgroup.c
3 * bus driver for ccwgroup 3 * bus driver for ccwgroup
4 * $Revision: 1.32 $ 4 * $Revision: 1.33 $
5 * 5 *
6 * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH, 6 * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH,
7 * IBM Corporation 7 * IBM Corporation
@@ -263,7 +263,7 @@ ccwgroup_set_online(struct ccwgroup_device *gdev)
263 struct ccwgroup_driver *gdrv; 263 struct ccwgroup_driver *gdrv;
264 int ret; 264 int ret;
265 265
266 if (atomic_compare_and_swap(0, 1, &gdev->onoff)) 266 if (atomic_cmpxchg(&gdev->onoff, 0, 1) != 0)
267 return -EAGAIN; 267 return -EAGAIN;
268 if (gdev->state == CCWGROUP_ONLINE) { 268 if (gdev->state == CCWGROUP_ONLINE) {
269 ret = 0; 269 ret = 0;
@@ -289,7 +289,7 @@ ccwgroup_set_offline(struct ccwgroup_device *gdev)
289 struct ccwgroup_driver *gdrv; 289 struct ccwgroup_driver *gdrv;
290 int ret; 290 int ret;
291 291
292 if (atomic_compare_and_swap(0, 1, &gdev->onoff)) 292 if (atomic_cmpxchg(&gdev->onoff, 0, 1) != 0)
293 return -EAGAIN; 293 return -EAGAIN;
294 if (gdev->state == CCWGROUP_OFFLINE) { 294 if (gdev->state == CCWGROUP_OFFLINE) {
295 ret = 0; 295 ret = 0;
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index fa3c23b80e3a..7270808c02d1 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * drivers/s390/cio/chsc.c 2 * drivers/s390/cio/chsc.c
3 * S/390 common I/O routines -- channel subsystem call 3 * S/390 common I/O routines -- channel subsystem call
4 * $Revision: 1.120 $ 4 * $Revision: 1.126 $
5 * 5 *
6 * Copyright (C) 1999-2002 IBM Deutschland Entwicklung GmbH, 6 * Copyright (C) 1999-2002 IBM Deutschland Entwicklung GmbH,
7 * IBM Corporation 7 * IBM Corporation
@@ -24,8 +24,6 @@
24#include "ioasm.h" 24#include "ioasm.h"
25#include "chsc.h" 25#include "chsc.h"
26 26
27static struct channel_path *chps[NR_CHPIDS];
28
29static void *sei_page; 27static void *sei_page;
30 28
31static int new_channel_path(int chpid); 29static int new_channel_path(int chpid);
@@ -33,13 +31,13 @@ static int new_channel_path(int chpid);
33static inline void 31static inline void
34set_chp_logically_online(int chp, int onoff) 32set_chp_logically_online(int chp, int onoff)
35{ 33{
36 chps[chp]->state = onoff; 34 css[0]->chps[chp]->state = onoff;
37} 35}
38 36
39static int 37static int
40get_chp_status(int chp) 38get_chp_status(int chp)
41{ 39{
42 return (chps[chp] ? chps[chp]->state : -ENODEV); 40 return (css[0]->chps[chp] ? css[0]->chps[chp]->state : -ENODEV);
43} 41}
44 42
45void 43void
@@ -77,7 +75,9 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
77 75
78 struct { 76 struct {
79 struct chsc_header request; 77 struct chsc_header request;
80 u16 reserved1; 78 u16 reserved1a:10;
79 u16 ssid:2;
80 u16 reserved1b:4;
81 u16 f_sch; /* first subchannel */ 81 u16 f_sch; /* first subchannel */
82 u16 reserved2; 82 u16 reserved2;
83 u16 l_sch; /* last subchannel */ 83 u16 l_sch; /* last subchannel */
@@ -104,8 +104,9 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
104 .code = 0x0004, 104 .code = 0x0004,
105 }; 105 };
106 106
107 ssd_area->f_sch = sch->irq; 107 ssd_area->ssid = sch->schid.ssid;
108 ssd_area->l_sch = sch->irq; 108 ssd_area->f_sch = sch->schid.sch_no;
109 ssd_area->l_sch = sch->schid.sch_no;
109 110
110 ccode = chsc(ssd_area); 111 ccode = chsc(ssd_area);
111 if (ccode > 0) { 112 if (ccode > 0) {
@@ -147,7 +148,8 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 	 */
 	if (ssd_area->st > 3) { /* uhm, that looks strange... */
 		CIO_CRW_EVENT(0, "Strange subchannel type %d"
-			      " for sch %04x\n", ssd_area->st, sch->irq);
+			      " for sch 0.%x.%04x\n", ssd_area->st,
+			      sch->schid.ssid, sch->schid.sch_no);
 		/*
 		 * There may have been a new subchannel type defined in the
 		 * time since this code was written; since we don't know which
@@ -156,8 +158,9 @@ chsc_get_sch_desc_irq(struct subchannel *sch, void *page)
 		return 0;
 	} else {
 		const char *type[4] = {"I/O", "chsc", "message", "ADM"};
-		CIO_CRW_EVENT(6, "ssd: sch %04x is %s subchannel\n",
-			      sch->irq, type[ssd_area->st]);
+		CIO_CRW_EVENT(6, "ssd: sch 0.%x.%04x is %s subchannel\n",
+			      sch->schid.ssid, sch->schid.sch_no,
+			      type[ssd_area->st]);
 
 		sch->ssd_info.valid = 1;
 		sch->ssd_info.type = ssd_area->st;
@@ -218,13 +221,13 @@ s390_subchannel_remove_chpid(struct device *dev, void *data)
 	int j;
 	int mask;
 	struct subchannel *sch;
-	__u8 *chpid;
+	struct channel_path *chpid;
 	struct schib schib;
 
 	sch = to_subchannel(dev);
 	chpid = data;
 	for (j = 0; j < 8; j++)
-		if (sch->schib.pmcw.chpid[j] == *chpid)
+		if (sch->schib.pmcw.chpid[j] == chpid->id)
 			break;
 	if (j >= 8)
 		return 0;
@@ -232,7 +235,7 @@ s390_subchannel_remove_chpid(struct device *dev, void *data)
 	mask = 0x80 >> j;
 	spin_lock(&sch->lock);
 
-	stsch(sch->irq, &schib);
+	stsch(sch->schid, &schib);
 	if (!schib.pmcw.dnv)
 		goto out_unreg;
 	memcpy(&sch->schib, &schib, sizeof(struct schib));
@@ -284,7 +287,7 @@ out_unlock:
 out_unreg:
 	spin_unlock(&sch->lock);
 	sch->lpm = 0;
-	if (css_enqueue_subchannel_slow(sch->irq)) {
+	if (css_enqueue_subchannel_slow(sch->schid)) {
 		css_clear_subchannel_slow_list();
 		need_rescan = 1;
 	}
@@ -295,23 +298,30 @@ static inline void
 s390_set_chpid_offline( __u8 chpid)
 {
 	char dbf_txt[15];
+	struct device *dev;
 
 	sprintf(dbf_txt, "chpr%x", chpid);
 	CIO_TRACE_EVENT(2, dbf_txt);
 
 	if (get_chp_status(chpid) <= 0)
 		return;
-
-	bus_for_each_dev(&css_bus_type, NULL, &chpid,
+	dev = get_device(&css[0]->chps[chpid]->dev);
+	bus_for_each_dev(&css_bus_type, NULL, to_channelpath(dev),
 			 s390_subchannel_remove_chpid);
 
 	if (need_rescan || css_slow_subchannels_exist())
 		queue_work(slow_path_wq, &slow_path_work);
+	put_device(dev);
 }
 
+struct res_acc_data {
+	struct channel_path *chp;
+	u32 fla_mask;
+	u16 fla;
+};
+
 static int
-s390_process_res_acc_sch(u8 chpid, __u16 fla, u32 fla_mask,
-			 struct subchannel *sch)
+s390_process_res_acc_sch(struct res_acc_data *res_data, struct subchannel *sch)
 {
 	int found;
 	int chp;
@@ -323,8 +333,9 @@ s390_process_res_acc_sch(u8 chpid, __u16 fla, u32 fla_mask,
 		 * check if chpid is in information updated by ssd
 		 */
 		if (sch->ssd_info.valid &&
-		    sch->ssd_info.chpid[chp] == chpid &&
-		    (sch->ssd_info.fla[chp] & fla_mask) == fla) {
+		    sch->ssd_info.chpid[chp] == res_data->chp->id &&
+		    (sch->ssd_info.fla[chp] & res_data->fla_mask)
+		    == res_data->fla) {
 			found = 1;
 			break;
 		}
@@ -337,24 +348,87 @@ s390_process_res_acc_sch(u8 chpid, __u16 fla, u32 fla_mask,
 	 * new path information and eventually check for logically
 	 * offline chpids.
 	 */
-	ccode = stsch(sch->irq, &sch->schib);
+	ccode = stsch(sch->schid, &sch->schib);
 	if (ccode > 0)
 		return 0;
 
 	return 0x80 >> chp;
 }
 
+static inline int
+s390_process_res_acc_new_sch(struct subchannel_id schid)
+{
+	struct schib schib;
+	int ret;
+	/*
+	 * We don't know the device yet, but since a path
+	 * may be available now to the device we'll have
+	 * to do recognition again.
+	 * Since we don't have any idea about which chpid
+	 * that beast may be on we'll have to do a stsch
+	 * on all devices, grr...
+	 */
+	if (stsch_err(schid, &schib))
+		/* We're through */
+		return need_rescan ? -EAGAIN : -ENXIO;
+
+	/* Put it on the slow path. */
+	ret = css_enqueue_subchannel_slow(schid);
+	if (ret) {
+		css_clear_subchannel_slow_list();
+		need_rescan = 1;
+		return -EAGAIN;
+	}
+	return 0;
+}
+
 static int
-s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
+__s390_process_res_acc(struct subchannel_id schid, void *data)
 {
+	int chp_mask, old_lpm;
+	struct res_acc_data *res_data;
 	struct subchannel *sch;
-	int irq, rc;
+
+	res_data = (struct res_acc_data *)data;
+	sch = get_subchannel_by_schid(schid);
+	if (!sch)
+		/* Check if a subchannel is newly available. */
+		return s390_process_res_acc_new_sch(schid);
+
+	spin_lock_irq(&sch->lock);
+
+	chp_mask = s390_process_res_acc_sch(res_data, sch);
+
+	if (chp_mask == 0) {
+		spin_unlock_irq(&sch->lock);
+		return 0;
+	}
+	old_lpm = sch->lpm;
+	sch->lpm = ((sch->schib.pmcw.pim &
+		     sch->schib.pmcw.pam &
+		     sch->schib.pmcw.pom)
+		    | chp_mask) & sch->opm;
+	if (!old_lpm && sch->lpm)
+		device_trigger_reprobe(sch);
+	else if (sch->driver && sch->driver->verify)
+		sch->driver->verify(&sch->dev);
+
+	spin_unlock_irq(&sch->lock);
+	put_device(&sch->dev);
+	return (res_data->fla_mask == 0xffff) ? -ENODEV : 0;
+}
+
+
+static int
+s390_process_res_acc (struct res_acc_data *res_data)
+{
+	int rc;
 	char dbf_txt[15];
 
-	sprintf(dbf_txt, "accpr%x", chpid);
+	sprintf(dbf_txt, "accpr%x", res_data->chp->id);
 	CIO_TRACE_EVENT( 2, dbf_txt);
-	if (fla != 0) {
-		sprintf(dbf_txt, "fla%x", fla);
+	if (res_data->fla != 0) {
+		sprintf(dbf_txt, "fla%x", res_data->fla);
 		CIO_TRACE_EVENT( 2, dbf_txt);
 	}
 
@@ -365,70 +439,11 @@ s390_process_res_acc (u8 chpid, __u16 fla, u32 fla_mask)
 	 * The more information we have (info), the less scanning
 	 * will we have to do.
 	 */
-
-	if (!get_chp_status(chpid))
-		return 0; /* no need to do the rest */
-
-	rc = 0;
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
-		int chp_mask, old_lpm;
-
-		sch = get_subchannel_by_schid(irq);
-		if (!sch) {
-			struct schib schib;
-			int ret;
-			/*
-			 * We don't know the device yet, but since a path
-			 * may be available now to the device we'll have
-			 * to do recognition again.
-			 * Since we don't have any idea about which chpid
-			 * that beast may be on we'll have to do a stsch
-			 * on all devices, grr...
-			 */
-			if (stsch(irq, &schib)) {
-				/* We're through */
-				if (need_rescan)
-					rc = -EAGAIN;
-				break;
-			}
-			if (need_rescan) {
-				rc = -EAGAIN;
-				continue;
-			}
-			/* Put it on the slow path. */
-			ret = css_enqueue_subchannel_slow(irq);
-			if (ret) {
-				css_clear_subchannel_slow_list();
-				need_rescan = 1;
-			}
-			rc = -EAGAIN;
-			continue;
-		}
-
-		spin_lock_irq(&sch->lock);
-
-		chp_mask = s390_process_res_acc_sch(chpid, fla, fla_mask, sch);
-
-		if (chp_mask == 0) {
-
-			spin_unlock_irq(&sch->lock);
-			continue;
-		}
-		old_lpm = sch->lpm;
-		sch->lpm = ((sch->schib.pmcw.pim &
-			     sch->schib.pmcw.pam &
-			     sch->schib.pmcw.pom)
-			    | chp_mask) & sch->opm;
-		if (!old_lpm && sch->lpm)
-			device_trigger_reprobe(sch);
-		else if (sch->driver && sch->driver->verify)
-			sch->driver->verify(&sch->dev);
-
-		spin_unlock_irq(&sch->lock);
-		put_device(&sch->dev);
-		if (fla_mask == 0xffff)
-			break;
-	}
+	rc = for_each_subchannel(__s390_process_res_acc, res_data);
+	if (css_slow_subchannels_exist())
+		rc = -EAGAIN;
+	else if (rc != -EAGAIN)
+		rc = 0;
 	return rc;
 }
 
@@ -466,6 +481,7 @@ int
 chsc_process_crw(void)
 {
 	int chpid, ret;
+	struct res_acc_data res_data;
 	struct {
 		struct chsc_header request;
 		u32 reserved1;
@@ -499,8 +515,9 @@ chsc_process_crw(void)
 	ret = 0;
 	do {
 		int ccode, status;
+		struct device *dev;
 		memset(sei_area, 0, sizeof(*sei_area));
-
+		memset(&res_data, 0, sizeof(struct res_acc_data));
 		sei_area->request = (struct chsc_header) {
 			.length = 0x0010,
 			.code = 0x000e,
@@ -573,26 +590,25 @@ chsc_process_crw(void)
 		if (status < 0)
 			new_channel_path(sei_area->rsid);
 		else if (!status)
-			return 0;
-		if ((sei_area->vf & 0x80) == 0) {
-			pr_debug("chpid: %x\n", sei_area->rsid);
-			ret = s390_process_res_acc(sei_area->rsid,
-						   0, 0);
-		} else if ((sei_area->vf & 0xc0) == 0x80) {
-			pr_debug("chpid: %x link addr: %x\n",
-				 sei_area->rsid, sei_area->fla);
-			ret = s390_process_res_acc(sei_area->rsid,
-						   sei_area->fla,
-						   0xff00);
-		} else if ((sei_area->vf & 0xc0) == 0xc0) {
-			pr_debug("chpid: %x full link addr: %x\n",
-				 sei_area->rsid, sei_area->fla);
-			ret = s390_process_res_acc(sei_area->rsid,
-						   sei_area->fla,
-						   0xffff);
+			break;
+		dev = get_device(&css[0]->chps[sei_area->rsid]->dev);
+		res_data.chp = to_channelpath(dev);
+		pr_debug("chpid: %x", sei_area->rsid);
+		if ((sei_area->vf & 0xc0) != 0) {
+			res_data.fla = sei_area->fla;
+			if ((sei_area->vf & 0xc0) == 0xc0) {
+				pr_debug(" full link addr: %x",
+					 sei_area->fla);
+				res_data.fla_mask = 0xffff;
+			} else {
+				pr_debug(" link addr: %x",
+					 sei_area->fla);
+				res_data.fla_mask = 0xff00;
+			}
 		}
-		pr_debug("\n");
-
+		ret = s390_process_res_acc(&res_data);
+		pr_debug("\n\n");
+		put_device(dev);
 		break;
 
 	default: /* other stuff */
@@ -604,12 +620,72 @@ chsc_process_crw(void)
 	return ret;
 }
 
+static inline int
+__chp_add_new_sch(struct subchannel_id schid)
+{
+	struct schib schib;
+	int ret;
+
+	if (stsch(schid, &schib))
+		/* We're through */
+		return need_rescan ? -EAGAIN : -ENXIO;
+
+	/* Put it on the slow path. */
+	ret = css_enqueue_subchannel_slow(schid);
+	if (ret) {
+		css_clear_subchannel_slow_list();
+		need_rescan = 1;
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+
 static int
-chp_add(int chpid)
+__chp_add(struct subchannel_id schid, void *data)
 {
+	int i;
+	struct channel_path *chp;
 	struct subchannel *sch;
-	int irq, ret, rc;
+
+	chp = (struct channel_path *)data;
+	sch = get_subchannel_by_schid(schid);
+	if (!sch)
+		/* Check if the subchannel is now available. */
+		return __chp_add_new_sch(schid);
+	spin_lock(&sch->lock);
+	for (i=0; i<8; i++)
+		if (sch->schib.pmcw.chpid[i] == chp->id) {
+			if (stsch(sch->schid, &sch->schib) != 0) {
+				/* Endgame. */
+				spin_unlock(&sch->lock);
+				return -ENXIO;
+			}
+			break;
+		}
+	if (i==8) {
+		spin_unlock(&sch->lock);
+		return 0;
+	}
+	sch->lpm = ((sch->schib.pmcw.pim &
+		     sch->schib.pmcw.pam &
+		     sch->schib.pmcw.pom)
+		    | 0x80 >> i) & sch->opm;
+
+	if (sch->driver && sch->driver->verify)
+		sch->driver->verify(&sch->dev);
+
+	spin_unlock(&sch->lock);
+	put_device(&sch->dev);
+	return 0;
+}
+
+static int
+chp_add(int chpid)
+{
+	int rc;
 	char dbf_txt[15];
+	struct device *dev;
 
 	if (!get_chp_status(chpid))
 		return 0; /* no need to do the rest */
@@ -617,59 +693,13 @@ chp_add(int chpid)
 	sprintf(dbf_txt, "cadd%x", chpid);
 	CIO_TRACE_EVENT(2, dbf_txt);
 
-	rc = 0;
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
-		int i;
-
-		sch = get_subchannel_by_schid(irq);
-		if (!sch) {
-			struct schib schib;
-
-			if (stsch(irq, &schib)) {
-				/* We're through */
-				if (need_rescan)
-					rc = -EAGAIN;
-				break;
-			}
-			if (need_rescan) {
-				rc = -EAGAIN;
-				continue;
-			}
-			/* Put it on the slow path. */
-			ret = css_enqueue_subchannel_slow(irq);
-			if (ret) {
-				css_clear_subchannel_slow_list();
-				need_rescan = 1;
-			}
-			rc = -EAGAIN;
-			continue;
-		}
-
-		spin_lock(&sch->lock);
-		for (i=0; i<8; i++)
-			if (sch->schib.pmcw.chpid[i] == chpid) {
-				if (stsch(sch->irq, &sch->schib) != 0) {
-					/* Endgame. */
-					spin_unlock(&sch->lock);
-					return rc;
-				}
-				break;
-			}
-		if (i==8) {
-			spin_unlock(&sch->lock);
-			return rc;
-		}
-		sch->lpm = ((sch->schib.pmcw.pim &
-			     sch->schib.pmcw.pam &
-			     sch->schib.pmcw.pom)
-			    | 0x80 >> i) & sch->opm;
-
-		if (sch->driver && sch->driver->verify)
-			sch->driver->verify(&sch->dev);
-
-		spin_unlock(&sch->lock);
-		put_device(&sch->dev);
-	}
+	dev = get_device(&css[0]->chps[chpid]->dev);
+	rc = for_each_subchannel(__chp_add, to_channelpath(dev));
+	if (css_slow_subchannels_exist())
+		rc = -EAGAIN;
+	if (rc != -EAGAIN)
+		rc = 0;
+	put_device(dev);
 	return rc;
 }
 
@@ -702,7 +732,7 @@ __check_for_io_and_kill(struct subchannel *sch, int index)
 	if (!device_is_online(sch))
 		/* cio could be doing I/O. */
 		return 0;
-	cc = stsch(sch->irq, &sch->schib);
+	cc = stsch(sch->schid, &sch->schib);
 	if (cc)
 		return 0;
 	if (sch->schib.scsw.actl && sch->schib.pmcw.lpum == (0x80 >> index)) {
@@ -743,7 +773,7 @@ __s390_subchannel_vary_chpid(struct subchannel *sch, __u8 chpid, int on)
 	 * just varied off path. Then kill it.
 	 */
 	if (!__check_for_io_and_kill(sch, chp) && !sch->lpm) {
-		if (css_enqueue_subchannel_slow(sch->irq)) {
+		if (css_enqueue_subchannel_slow(sch->schid)) {
 			css_clear_subchannel_slow_list();
 			need_rescan = 1;
 		}
@@ -781,6 +811,29 @@ s390_subchannel_vary_chpid_on(struct device *dev, void *data)
 	return 0;
 }
 
+static int
+__s390_vary_chpid_on(struct subchannel_id schid, void *data)
+{
+	struct schib schib;
+	struct subchannel *sch;
+
+	sch = get_subchannel_by_schid(schid);
+	if (sch) {
+		put_device(&sch->dev);
+		return 0;
+	}
+	if (stsch_err(schid, &schib))
+		/* We're through */
+		return -ENXIO;
+	/* Put it on the slow path. */
+	if (css_enqueue_subchannel_slow(schid)) {
+		css_clear_subchannel_slow_list();
+		need_rescan = 1;
+		return -EAGAIN;
+	}
+	return 0;
+}
+
 /*
  * Function: s390_vary_chpid
  * Varies the specified chpid online or offline
@@ -789,8 +842,7 @@ static int
 s390_vary_chpid( __u8 chpid, int on)
 {
 	char dbf_text[15];
-	int status, irq, ret;
-	struct subchannel *sch;
+	int status;
 
 	sprintf(dbf_text, on?"varyon%x":"varyoff%x", chpid);
 	CIO_TRACE_EVENT( 2, dbf_text);
@@ -815,30 +867,9 @@ s390_vary_chpid( __u8 chpid, int on)
 	bus_for_each_dev(&css_bus_type, NULL, &chpid, on ?
 			 s390_subchannel_vary_chpid_on :
 			 s390_subchannel_vary_chpid_off);
-	if (!on)
-		goto out;
-	/* Scan for new devices on varied on path. */
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
-		struct schib schib;
-
-		if (need_rescan)
-			break;
-		sch = get_subchannel_by_schid(irq);
-		if (sch) {
-			put_device(&sch->dev);
-			continue;
-		}
-		if (stsch(irq, &schib))
-			/* We're through */
-			break;
-		/* Put it on the slow path. */
-		ret = css_enqueue_subchannel_slow(irq);
-		if (ret) {
-			css_clear_subchannel_slow_list();
-			need_rescan = 1;
-		}
-	}
-out:
+	if (on)
+		/* Scan for new devices on varied on path. */
+		for_each_subchannel(__s390_vary_chpid_on, NULL);
 	if (need_rescan || css_slow_subchannels_exist())
 		queue_work(slow_path_wq, &slow_path_work);
 	return 0;
@@ -995,7 +1026,7 @@ new_channel_path(int chpid)
 	chp->id = chpid;
 	chp->state = 1;
 	chp->dev = (struct device) {
-		.parent = &css_bus_device,
+		.parent = &css[0]->device,
 		.release = chp_release,
 	};
 	snprintf(chp->dev.bus_id, BUS_ID_SIZE, "chp0.%x", chpid);
@@ -1017,7 +1048,7 @@ new_channel_path(int chpid)
 		device_unregister(&chp->dev);
 		goto out_free;
 	} else
-		chps[chpid] = chp;
+		css[0]->chps[chpid] = chp;
 	return ret;
 out_free:
 	kfree(chp);
@@ -1030,7 +1061,7 @@ chsc_get_chp_desc(struct subchannel *sch, int chp_no)
 	struct channel_path *chp;
 	struct channel_path_desc *desc;
 
-	chp = chps[sch->schib.pmcw.chpid[chp_no]];
+	chp = css[0]->chps[sch->schib.pmcw.chpid[chp_no]];
 	if (!chp)
 		return NULL;
 	desc = kmalloc(sizeof(struct channel_path_desc), GFP_KERNEL);
@@ -1051,6 +1082,54 @@ chsc_alloc_sei_area(void)
 	return (sei_page ? 0 : -ENOMEM);
 }
 
+int __init
+chsc_enable_facility(int operation_code)
+{
+	int ret;
+	struct {
+		struct chsc_header request;
+		u8 reserved1:4;
+		u8 format:4;
+		u8 reserved2;
+		u16 operation_code;
+		u32 reserved3;
+		u32 reserved4;
+		u32 operation_data_area[252];
+		struct chsc_header response;
+		u32 reserved5:4;
+		u32 format2:4;
+		u32 reserved6:24;
+	} *sda_area;
+
+	sda_area = (void *)get_zeroed_page(GFP_KERNEL|GFP_DMA);
+	if (!sda_area)
+		return -ENOMEM;
+	sda_area->request = (struct chsc_header) {
+		.length = 0x0400,
+		.code = 0x0031,
+	};
+	sda_area->operation_code = operation_code;
+
+	ret = chsc(sda_area);
+	if (ret > 0) {
+		ret = (ret == 3) ? -ENODEV : -EBUSY;
+		goto out;
+	}
+	switch (sda_area->response.code) {
+	case 0x0003: /* invalid request block */
+	case 0x0007:
+		ret = -EINVAL;
+		break;
+	case 0x0004: /* command not provided */
+	case 0x0101: /* facility not provided */
+		ret = -EOPNOTSUPP;
+		break;
+	}
+ out:
+	free_page((unsigned long)sda_area);
+	return ret;
+}
+
 subsys_initcall(chsc_alloc_sei_area);
 
 struct css_general_char css_general_characteristics;
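Throughout chsc.c the open-coded "for (irq = 0; irq < __MAX_SUBCHANNELS; irq++)" scans are replaced by for_each_subchannel() plus a callback. The convention, visible in __s390_process_res_acc() and __chp_add() above: the callback returns 0 to keep iterating, and any non-zero value stops the walk and is handed back to the caller (-ENXIO once stsch_err() fails past the last valid subchannel, -EAGAIN to request a rescan). A hypothetical callback showing the shape (not part of the patch):

	/* Count enabled subchannels across all subchannel sets. */
	static int count_enabled(struct subchannel_id schid, void *data)
	{
		int *count = data;
		struct schib schib;

		if (stsch_err(schid, &schib))
			return -ENXIO;	/* past the last subchannel: stop */
		if (schib.pmcw.ena)
			(*count)++;
		return 0;		/* keep iterating */
	}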
diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h
index be20da49d147..44e4b4bb1c5a 100644
--- a/drivers/s390/cio/chsc.h
+++ b/drivers/s390/cio/chsc.h
@@ -1,12 +1,12 @@
 #ifndef S390_CHSC_H
 #define S390_CHSC_H
 
-#define NR_CHPIDS 256
-
 #define CHSC_SEI_ACC_CHPID        1
 #define CHSC_SEI_ACC_LINKADDR     2
 #define CHSC_SEI_ACC_FULLLINKADDR 3
 
+#define CHSC_SDA_OC_MSS  0x2
+
 struct chsc_header {
 	u16 length;
 	u16 code;
@@ -43,7 +43,9 @@ struct css_general_char {
 	u32 ext_mb : 1;  /* bit 48 */
 	u32 : 7;
 	u32 aif_tdd : 1; /* bit 56 */
-	u32 : 10;
+	u32 : 1;
+	u32 qebsm : 1;   /* bit 58 */
+	u32 : 8;
 	u32 aif_osa : 1; /* bit 67 */
 	u32 : 28;
 }__attribute__((packed));
@@ -63,4 +65,9 @@ extern int chsc_determine_css_characteristics(void);
 extern int css_characteristics_avail;
 
 extern void *chsc_get_chp_desc(struct subchannel*, int);
+
+extern int chsc_enable_facility(int);
+
+#define to_channelpath(dev) container_of(dev, struct channel_path, dev)
+
 #endif
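Two small points on the header changes above. The qebsm bit is carved out of the former 10-bit reserved gap as 1 + 1 + 8 bits, so the packed layout of struct css_general_char is unchanged. And to_channelpath() is the usual container_of() accessor: given the struct device embedded in a struct channel_path, it recovers the containing object. A hypothetical caller, for illustration only:

	/* Recover the channel_path embedding this device and print the
	 * two fields the patch actually uses. */
	static void show_chp(struct device *dev)
	{
		struct channel_path *chp = to_channelpath(dev);

		printk(KERN_INFO "chpid %02x, state %d\n", chp->id, chp->state);
	}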
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index 185bc73c3ecd..7376bc87206d 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -1,7 +1,7 @@
 /*
  * drivers/s390/cio/cio.c
  *   S/390 common I/O routines -- low level i/o calls
- *   $Revision: 1.135 $
+ *   $Revision: 1.138 $
  *
  * Copyright (C) 1999-2002 IBM Deutschland Entwicklung GmbH,
  *                         IBM Corporation
@@ -135,7 +135,7 @@ cio_tpi(void)
 		return 0;
 	irb = (struct irb *) __LC_IRB;
 	/* Store interrupt response block to lowcore. */
-	if (tsch (tpi_info->irq, irb) != 0)
+	if (tsch (tpi_info->schid, irb) != 0)
 		/* Not status pending or not operational. */
 		return 1;
 	sch = (struct subchannel *)(unsigned long)tpi_info->intparm;
@@ -163,10 +163,11 @@ cio_start_handle_notoper(struct subchannel *sch, __u8 lpm)
 	else
 		sch->lpm = 0;
 
-	stsch (sch->irq, &sch->schib);
+	stsch (sch->schid, &sch->schib);
 
 	CIO_MSG_EVENT(0, "cio_start: 'not oper' status for "
-		      "subchannel %04x!\n", sch->irq);
+		      "subchannel 0.%x.%04x!\n", sch->schid.ssid,
+		      sch->schid.sch_no);
 	sprintf(dbf_text, "no%s", sch->dev.bus_id);
 	CIO_TRACE_EVENT(0, dbf_text);
 	CIO_HEX_EVENT(0, &sch->schib, sizeof (struct schib));
@@ -194,7 +195,7 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */
 	sch->orb.spnd = sch->options.suspend;
 	sch->orb.ssic = sch->options.suspend && sch->options.inter;
 	sch->orb.lpm = (lpm != 0) ? (lpm & sch->opm) : sch->lpm;
-#ifdef CONFIG_ARCH_S390X
+#ifdef CONFIG_64BIT
 	/*
 	 * for 64 bit we always support 64 bit IDAWs with 4k page size only
 	 */
@@ -204,7 +205,7 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */
 	sch->orb.key = key >> 4;
 	/* issue "Start Subchannel" */
 	sch->orb.cpa = (__u32) __pa (cpa);
-	ccode = ssch (sch->irq, &sch->orb);
+	ccode = ssch (sch->schid, &sch->orb);
 
 	/* process condition code */
 	sprintf (dbf_txt, "ccode:%d", ccode);
@@ -243,7 +244,7 @@ cio_resume (struct subchannel *sch)
 	CIO_TRACE_EVENT (4, "resIO");
 	CIO_TRACE_EVENT (4, sch->dev.bus_id);
 
-	ccode = rsch (sch->irq);
+	ccode = rsch (sch->schid);
 
 	sprintf (dbf_txt, "ccode:%d", ccode);
 	CIO_TRACE_EVENT (4, dbf_txt);
@@ -283,7 +284,7 @@ cio_halt(struct subchannel *sch)
 	/*
 	 * Issue "Halt subchannel" and process condition code
 	 */
-	ccode = hsch (sch->irq);
+	ccode = hsch (sch->schid);
 
 	sprintf (dbf_txt, "ccode:%d", ccode);
 	CIO_TRACE_EVENT (2, dbf_txt);
@@ -318,7 +319,7 @@ cio_clear(struct subchannel *sch)
 	/*
 	 * Issue "Clear subchannel" and process condition code
 	 */
-	ccode = csch (sch->irq);
+	ccode = csch (sch->schid);
 
 	sprintf (dbf_txt, "ccode:%d", ccode);
 	CIO_TRACE_EVENT (2, dbf_txt);
@@ -351,7 +352,7 @@ cio_cancel (struct subchannel *sch)
 	CIO_TRACE_EVENT (2, "cancelIO");
 	CIO_TRACE_EVENT (2, sch->dev.bus_id);
 
-	ccode = xsch (sch->irq);
+	ccode = xsch (sch->schid);
 
 	sprintf (dbf_txt, "ccode:%d", ccode);
 	CIO_TRACE_EVENT (2, dbf_txt);
@@ -359,7 +360,7 @@ cio_cancel (struct subchannel *sch)
 	switch (ccode) {
 	case 0:		/* success */
 		/* Update information in scsw. */
-		stsch (sch->irq, &sch->schib);
+		stsch (sch->schid, &sch->schib);
 		return 0;
 	case 1:		/* status pending */
 		return -EBUSY;
@@ -381,7 +382,7 @@ cio_modify (struct subchannel *sch)
 
 	ret = 0;
 	for (retry = 0; retry < 5; retry++) {
-		ccode = msch_err (sch->irq, &sch->schib);
+		ccode = msch_err (sch->schid, &sch->schib);
 		if (ccode < 0)	/* -EIO if msch gets a program check. */
 			return ccode;
 		switch (ccode) {
@@ -414,7 +415,7 @@ cio_enable_subchannel (struct subchannel *sch, unsigned int isc)
 	CIO_TRACE_EVENT (2, "ensch");
 	CIO_TRACE_EVENT (2, sch->dev.bus_id);
 
-	ccode = stsch (sch->irq, &sch->schib);
+	ccode = stsch (sch->schid, &sch->schib);
 	if (ccode)
 		return -ENODEV;
 
@@ -432,13 +433,13 @@ cio_enable_subchannel (struct subchannel *sch, unsigned int isc)
 	 */
 	sch->schib.pmcw.csense = 0;
 	if (ret == 0) {
-		stsch (sch->irq, &sch->schib);
+		stsch (sch->schid, &sch->schib);
 		if (sch->schib.pmcw.ena)
 			break;
 	}
 	if (ret == -EBUSY) {
 		struct irb irb;
-		if (tsch(sch->irq, &irb) != 0)
+		if (tsch(sch->schid, &irb) != 0)
 			break;
 	}
 }
@@ -461,7 +462,7 @@ cio_disable_subchannel (struct subchannel *sch)
 	CIO_TRACE_EVENT (2, "dissch");
 	CIO_TRACE_EVENT (2, sch->dev.bus_id);
 
-	ccode = stsch (sch->irq, &sch->schib);
+	ccode = stsch (sch->schid, &sch->schib);
 	if (ccode == 3)		/* Not operational. */
 		return -ENODEV;
 
@@ -485,7 +486,7 @@ cio_disable_subchannel (struct subchannel *sch)
 			 */
 			break;
 		if (ret == 0) {
-			stsch (sch->irq, &sch->schib);
+			stsch (sch->schid, &sch->schib);
 			if (!sch->schib.pmcw.ena)
 				break;
 		}
@@ -508,12 +509,12 @@ cio_disable_subchannel (struct subchannel *sch)
  *   -ENODEV for subchannels with invalid device number or blacklisted devices
  */
 int
-cio_validate_subchannel (struct subchannel *sch, unsigned int irq)
+cio_validate_subchannel (struct subchannel *sch, struct subchannel_id schid)
 {
 	char dbf_txt[15];
 	int ccode;
 
-	sprintf (dbf_txt, "valsch%x", irq);
+	sprintf (dbf_txt, "valsch%x", schid.sch_no);
 	CIO_TRACE_EVENT (4, dbf_txt);
 
 	/* Nuke all fields. */
@@ -522,17 +523,20 @@ cio_validate_subchannel (struct subchannel *sch, unsigned int irq)
 	spin_lock_init(&sch->lock);
 
 	/* Set a name for the subchannel */
-	snprintf (sch->dev.bus_id, BUS_ID_SIZE, "0.0.%04x", irq);
+	snprintf (sch->dev.bus_id, BUS_ID_SIZE, "0.%x.%04x", schid.ssid,
+		  schid.sch_no);
 
 	/*
 	 * The first subchannel that is not-operational (ccode==3)
 	 * indicates that there aren't any more devices available.
+	 * If stsch gets an exception, it means the current subchannel set
+	 * is not valid.
 	 */
-	sch->irq = irq;
-	ccode = stsch (irq, &sch->schib);
+	ccode = stsch_err (schid, &sch->schib);
 	if (ccode)
-		return -ENXIO;
+		return (ccode == 3) ? -ENXIO : ccode;
 
+	sch->schid = schid;
 	/* Copy subchannel type from path management control word. */
 	sch->st = sch->schib.pmcw.st;
 
@@ -541,9 +545,9 @@ cio_validate_subchannel (struct subchannel *sch, unsigned int irq)
 	 */
 	if (sch->st != 0) {
 		CIO_DEBUG(KERN_INFO, 0,
-			  "Subchannel %04X reports "
+			  "Subchannel 0.%x.%04x reports "
 			  "non-I/O subchannel type %04X\n",
-			  sch->irq, sch->st);
+			  sch->schid.ssid, sch->schid.sch_no, sch->st);
 		/* We stop here for non-io subchannels. */
 		return sch->st;
 	}
@@ -554,26 +558,29 @@ cio_validate_subchannel (struct subchannel *sch, unsigned int irq)
 		return -ENODEV;
 
 	/* Devno is valid. */
-	if (is_blacklisted (sch->schib.pmcw.dev)) {
+	if (is_blacklisted (sch->schid.ssid, sch->schib.pmcw.dev)) {
 		/*
 		 * This device must not be known to Linux. So we simply
 		 * say that there is no device and return ENODEV.
 		 */
 		CIO_MSG_EVENT(0, "Blacklisted device detected "
-			      "at devno %04X\n", sch->schib.pmcw.dev);
+			      "at devno %04X, subchannel set %x\n",
+			      sch->schib.pmcw.dev, sch->schid.ssid);
 		return -ENODEV;
 	}
 	sch->opm = 0xff;
-	chsc_validate_chpids(sch);
+	if (!cio_is_console(sch->schid))
+		chsc_validate_chpids(sch);
 	sch->lpm = sch->schib.pmcw.pim &
 		sch->schib.pmcw.pam &
 		sch->schib.pmcw.pom &
 		sch->opm;
 
 	CIO_DEBUG(KERN_INFO, 0,
-		  "Detected device %04X on subchannel %04X"
+		  "Detected device %04x on subchannel 0.%x.%04X"
 		  " - PIM = %02X, PAM = %02X, POM = %02X\n",
-		  sch->schib.pmcw.dev, sch->irq, sch->schib.pmcw.pim,
+		  sch->schib.pmcw.dev, sch->schid.ssid,
+		  sch->schid.sch_no, sch->schib.pmcw.pim,
 		  sch->schib.pmcw.pam, sch->schib.pmcw.pom);
 
 	/*
@@ -632,7 +639,7 @@ do_IRQ (struct pt_regs *regs)
 		if (sch)
 			spin_lock(&sch->lock);
 		/* Store interrupt response block to lowcore. */
-		if (tsch (tpi_info->irq, irb) == 0 && sch) {
+		if (tsch (tpi_info->schid, irb) == 0 && sch) {
 			/* Keep subchannel information word up to date. */
 			memcpy (&sch->schib.scsw, &irb->scsw,
 				sizeof (irb->scsw));
@@ -691,28 +698,36 @@ wait_cons_dev (void)
 }
 
 static int
-cio_console_irq(void)
+cio_test_for_console(struct subchannel_id schid, void *data)
 {
-	int irq;
+	if (stsch_err(schid, &console_subchannel.schib) != 0)
+		return -ENXIO;
+	if (console_subchannel.schib.pmcw.dnv &&
+	    console_subchannel.schib.pmcw.dev ==
+	    console_devno) {
+		console_irq = schid.sch_no;
+		return 1; /* found */
+	}
+	return 0;
+}
+
+
+static int
+cio_get_console_sch_no(void)
+{
+	struct subchannel_id schid;
 
+	init_subchannel_id(&schid);
 	if (console_irq != -1) {
 		/* VM provided us with the irq number of the console. */
-		if (stsch(console_irq, &console_subchannel.schib) != 0 ||
+		schid.sch_no = console_irq;
+		if (stsch(schid, &console_subchannel.schib) != 0 ||
 		    !console_subchannel.schib.pmcw.dnv)
 			return -1;
 		console_devno = console_subchannel.schib.pmcw.dev;
 	} else if (console_devno != -1) {
 		/* At least the console device number is known. */
-		for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
-			if (stsch(irq, &console_subchannel.schib) != 0)
-				break;
-			if (console_subchannel.schib.pmcw.dnv &&
-			    console_subchannel.schib.pmcw.dev ==
-			    console_devno) {
-				console_irq = irq;
-				break;
-			}
-		}
+		for_each_subchannel(cio_test_for_console, NULL);
 		if (console_irq == -1)
 			return -1;
 	} else {
@@ -728,17 +743,20 @@ cio_get_console_sch_no(void)
 struct subchannel *
 cio_probe_console(void)
 {
-	int irq, ret;
+	int sch_no, ret;
+	struct subchannel_id schid;
 
 	if (xchg(&console_subchannel_in_use, 1) != 0)
 		return ERR_PTR(-EBUSY);
-	irq = cio_console_irq();
-	if (irq == -1) {
+	sch_no = cio_get_console_sch_no();
+	if (sch_no == -1) {
 		console_subchannel_in_use = 0;
 		return ERR_PTR(-ENODEV);
 	}
 	memset(&console_subchannel, 0, sizeof(struct subchannel));
-	ret = cio_validate_subchannel(&console_subchannel, irq);
+	init_subchannel_id(&schid);
+	schid.sch_no = sch_no;
+	ret = cio_validate_subchannel(&console_subchannel, schid);
 	if (ret) {
 		console_subchannel_in_use = 0;
 		return ERR_PTR(-ENODEV);
@@ -770,11 +788,11 @@ cio_release_console(void)
 
 /* Bah... hack to catch console special sausages. */
 int
-cio_is_console(int irq)
+cio_is_console(struct subchannel_id schid)
 {
 	if (!console_subchannel_in_use)
 		return 0;
-	return (irq == console_subchannel.irq);
+	return schid_equal(&schid, &console_subchannel.schid);
 }
 
 struct subchannel *
@@ -787,7 +805,7 @@ cio_get_console_subchannel(void)
 
 #endif
 static inline int
-__disable_subchannel_easy(unsigned int schid, struct schib *schib)
+__disable_subchannel_easy(struct subchannel_id schid, struct schib *schib)
 {
 	int retry, cc;
 
@@ -805,7 +823,7 @@ __disable_subchannel_easy(unsigned int schid, struct schib *schib)
 }
 
 static inline int
-__clear_subchannel_easy(unsigned int schid)
+__clear_subchannel_easy(struct subchannel_id schid)
 {
 	int retry;
 
@@ -815,8 +833,8 @@ __clear_subchannel_easy(unsigned int schid)
 		struct tpi_info ti;
 
 		if (tpi(&ti)) {
-			tsch(ti.irq, (struct irb *)__LC_IRB);
-			if (ti.irq == schid)
+			tsch(ti.schid, (struct irb *)__LC_IRB);
+			if (schid_equal(&ti.schid, &schid))
 				return 0;
 		}
 		udelay(100);
@@ -825,31 +843,33 @@ __clear_subchannel_easy(unsigned int schid)
 }
 
 extern void do_reipl(unsigned long devno);
+static int
+__shutdown_subchannel_easy(struct subchannel_id schid, void *data)
+{
+	struct schib schib;
+
+	if (stsch_err(schid, &schib))
+		return -ENXIO;
+	if (!schib.pmcw.ena)
+		return 0;
+	switch(__disable_subchannel_easy(schid, &schib)) {
+	case 0:
+	case -ENODEV:
+		break;
+	default: /* -EBUSY */
+		if (__clear_subchannel_easy(schid))
+			break; /* give up... */
+		stsch(schid, &schib);
+		__disable_subchannel_easy(schid, &schib);
+	}
+	return 0;
+}
 
-/* Clear all subchannels. */
 void
 clear_all_subchannels(void)
 {
-	unsigned int schid;
-
 	local_irq_disable();
-	for (schid=0;schid<=highest_subchannel;schid++) {
-		struct schib schib;
-		if (stsch(schid, &schib))
-			break; /* break out of the loop */
-		if (!schib.pmcw.ena)
-			continue;
-		switch(__disable_subchannel_easy(schid, &schib)) {
-		case 0:
-		case -ENODEV:
-			break;
-		default: /* -EBUSY */
-			if (__clear_subchannel_easy(schid))
-				break; /* give up... jump out of switch */
-			stsch(schid, &schib);
-			__disable_subchannel_easy(schid, &schib);
-		}
-	}
+	for_each_subchannel(__shutdown_subchannel_easy, NULL);
 }
 
 /* Make sure all subchannels are quiet before we re-ipl an lpar. */
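The console search in cio_get_console_sch_no() shows the other half of the for_each_subchannel() contract: a positive return value also ends the walk early, which cio_test_for_console() uses to mean "found". A hypothetical callback in the same style (assumed example, not driver code):

	/* Stop at the first valid subchannel bearing a given devno. */
	static int find_devno(struct subchannel_id schid, void *data)
	{
		u16 *devno = data;
		struct schib schib;

		if (stsch_err(schid, &schib) != 0)
			return -ENXIO;		/* ran off the end: stop */
		if (schib.pmcw.dnv && schib.pmcw.dev == *devno)
			return 1;		/* found: stop the walk */
		return 0;			/* keep looking */
	}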
diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h
index c50a9da420a9..0ca987344e07 100644
--- a/drivers/s390/cio/cio.h
+++ b/drivers/s390/cio/cio.h
@@ -1,6 +1,8 @@
 #ifndef S390_CIO_H
 #define S390_CIO_H
 
+#include "schid.h"
+
 /*
  * where we put the ssd info
  */
@@ -83,7 +85,7 @@ struct orb {
 
 /* subchannel data structure used by I/O subroutines */
 struct subchannel {
-	unsigned int irq;	/* aka. subchannel number */
+	struct subchannel_id schid;
 	spinlock_t lock;	/* subchannel lock */
 
 	enum {
@@ -114,7 +116,7 @@ struct subchannel {
 
 #define to_subchannel(n) container_of(n, struct subchannel, dev)
 
-extern int cio_validate_subchannel (struct subchannel *, unsigned int);
+extern int cio_validate_subchannel (struct subchannel *, struct subchannel_id);
 extern int cio_enable_subchannel (struct subchannel *, unsigned int);
 extern int cio_disable_subchannel (struct subchannel *);
 extern int cio_cancel (struct subchannel *);
@@ -127,14 +129,15 @@ extern int cio_cancel (struct subchannel *);
 extern int cio_set_options (struct subchannel *, int);
 extern int cio_get_options (struct subchannel *);
 extern int cio_modify (struct subchannel *);
+
 /* Use with care. */
 #ifdef CONFIG_CCW_CONSOLE
 extern struct subchannel *cio_probe_console(void);
 extern void cio_release_console(void);
-extern int cio_is_console(int irq);
+extern int cio_is_console(struct subchannel_id);
 extern struct subchannel *cio_get_console_subchannel(void);
 #else
-#define cio_is_console(irq) 0
+#define cio_is_console(schid) 0
 #define cio_get_console_subchannel() NULL
 #endif
 
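struct subchannel now carries a struct subchannel_id from the new schid.h instead of a bare "unsigned int irq". The header itself is not part of this hunk; the following is a sketch inferred from the code above, not the authoritative definition:

	/* Sketch only -- see drivers/s390/cio/schid.h for the real layout. */
	struct subchannel_id {
		__u32 reserved : 13;
		__u32 ssid : 2;		/* subchannel set id (bus ids: 0.%x.%04x) */
		__u32 one : 1;
		__u32 sch_no : 16;	/* subchannel number within the set */
	} __attribute__ ((packed, aligned(4)));

	static inline int
	schid_equal(struct subchannel_id *schid1, struct subchannel_id *schid2)
	{
		return !memcmp(schid1, schid2, sizeof(struct subchannel_id));
	}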
diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c
index b978f7fe8327..0b03714e696a 100644
--- a/drivers/s390/cio/cmf.c
+++ b/drivers/s390/cio/cmf.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/s390/cio/cmf.c ($Revision: 1.16 $)
+ * linux/drivers/s390/cio/cmf.c ($Revision: 1.19 $)
  *
  * Linux on zSeries Channel Measurement Facility support
  *
@@ -178,7 +178,7 @@ set_schib(struct ccw_device *cdev, u32 mme, int mbfc, unsigned long address)
 	/* msch can silently fail, so do it again if necessary */
 	for (retry = 0; retry < 3; retry++) {
 		/* prepare schib */
-		stsch(sch->irq, schib);
+		stsch(sch->schid, schib);
 		schib->pmcw.mme = mme;
 		schib->pmcw.mbfc = mbfc;
 		/* address can be either a block address or a block index */
@@ -188,7 +188,7 @@ set_schib(struct ccw_device *cdev, u32 mme, int mbfc, unsigned long address)
 			schib->pmcw.mbi = address;
 
 		/* try to submit it */
-		switch(ret = msch_err(sch->irq, schib)) {
+		switch(ret = msch_err(sch->schid, schib)) {
 		case 0:
 			break;
 		case 1:
@@ -202,7 +202,7 @@ set_schib(struct ccw_device *cdev, u32 mme, int mbfc, unsigned long address)
 			ret = -EINVAL;
 			break;
 		}
-		stsch(sch->irq, schib); /* restore the schib */
+		stsch(sch->schid, schib); /* restore the schib */
 
 		if (ret)
 			break;
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 555119cacc27..e565193650c7 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -1,7 +1,7 @@
 /*
  * drivers/s390/cio/css.c
  * driver for channel subsystem
- * $Revision: 1.85 $
+ * $Revision: 1.93 $
  *
  * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH,
  *                    IBM Corporation
@@ -21,19 +21,35 @@
 #include "ioasm.h"
 #include "chsc.h"
 
-unsigned int highest_subchannel;
 int need_rescan = 0;
 int css_init_done = 0;
+static int max_ssid = 0;
+
+struct channel_subsystem *css[__MAX_CSSID + 1];
 
-struct pgid global_pgid;
 int css_characteristics_avail = 0;
 
-struct device css_bus_device = {
-	.bus_id = "css0",
-};
+inline int
+for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *data)
+{
+	struct subchannel_id schid;
+	int ret;
+
+	init_subchannel_id(&schid);
+	ret = -ENODEV;
+	do {
+		do {
+			ret = fn(schid, data);
+			if (ret)
+				break;
+		} while (schid.sch_no++ < __MAX_SUBCHANNEL);
+		schid.sch_no = 0;
+	} while (schid.ssid++ < max_ssid);
+	return ret;
+}
 
 static struct subchannel *
-css_alloc_subchannel(int irq)
+css_alloc_subchannel(struct subchannel_id schid)
 {
 	struct subchannel *sch;
 	int ret;
@@ -41,13 +57,11 @@ css_alloc_subchannel(int irq)
 	sch = kmalloc (sizeof (*sch), GFP_KERNEL | GFP_DMA);
 	if (sch == NULL)
 		return ERR_PTR(-ENOMEM);
-	ret = cio_validate_subchannel (sch, irq);
+	ret = cio_validate_subchannel (sch, schid);
 	if (ret < 0) {
 		kfree(sch);
 		return ERR_PTR(ret);
 	}
-	if (irq > highest_subchannel)
-		highest_subchannel = irq;
 
 	if (sch->st != SUBCHANNEL_TYPE_IO) {
 		/* For now we ignore all non-io subchannels. */
@@ -87,7 +101,7 @@ css_subchannel_release(struct device *dev)
 	struct subchannel *sch;
 
 	sch = to_subchannel(dev);
-	if (!cio_is_console(sch->irq))
+	if (!cio_is_console(sch->schid))
 		kfree(sch);
 }
 
@@ -99,7 +113,7 @@ css_register_subchannel(struct subchannel *sch)
 	int ret;
 
 	/* Initialize the subchannel structure */
-	sch->dev.parent = &css_bus_device;
+	sch->dev.parent = &css[0]->device;
 	sch->dev.bus = &css_bus_type;
 	sch->dev.release = &css_subchannel_release;
 
@@ -114,12 +128,12 @@ css_register_subchannel(struct subchannel *sch)
 }
 
 int
-css_probe_device(int irq)
+css_probe_device(struct subchannel_id schid)
 {
 	int ret;
 	struct subchannel *sch;
 
-	sch = css_alloc_subchannel(irq);
+	sch = css_alloc_subchannel(schid);
 	if (IS_ERR(sch))
 		return PTR_ERR(sch);
 	ret = css_register_subchannel(sch);
@@ -132,26 +146,26 @@ static int
 check_subchannel(struct device * dev, void * data)
 {
 	struct subchannel *sch;
-	int irq = (unsigned long)data;
+	struct subchannel_id *schid = data;
 
 	sch = to_subchannel(dev);
-	return (sch->irq == irq);
+	return schid_equal(&sch->schid, schid);
 }
 
 struct subchannel *
-get_subchannel_by_schid(int irq)
+get_subchannel_by_schid(struct subchannel_id schid)
 {
 	struct device *dev;
 
 	dev = bus_find_device(&css_bus_type, NULL,
-			      (void *)(unsigned long)irq, check_subchannel);
+			      (void *)&schid, check_subchannel);
 
 	return dev ? to_subchannel(dev) : NULL;
 }
 
 
 static inline int
-css_get_subchannel_status(struct subchannel *sch, int schid)
+css_get_subchannel_status(struct subchannel *sch, struct subchannel_id schid)
 {
 	struct schib schib;
 	int cc;
@@ -170,13 +184,13 @@ css_get_subchannel_status(struct subchannel *sch, int schid)
 }
 
 static int
-css_evaluate_subchannel(int irq, int slow)
+css_evaluate_subchannel(struct subchannel_id schid, int slow)
 {
 	int event, ret, disc;
 	struct subchannel *sch;
 	unsigned long flags;
 
-	sch = get_subchannel_by_schid(irq);
+	sch = get_subchannel_by_schid(schid);
 	disc = sch ? device_is_disconnected(sch) : 0;
 	if (disc && slow) {
 		if (sch)
@@ -194,9 +208,10 @@ css_evaluate_subchannel(int irq, int slow)
 			put_device(&sch->dev);
 		return -EAGAIN; /* Will be done on the slow path. */
 	}
-	event = css_get_subchannel_status(sch, irq);
-	CIO_MSG_EVENT(4, "Evaluating schid %04x, event %d, %s, %s path.\n",
-		      irq, event, sch?(disc?"disconnected":"normal"):"unknown",
+	event = css_get_subchannel_status(sch, schid);
+	CIO_MSG_EVENT(4, "Evaluating schid 0.%x.%04x, event %d, %s, %s path.\n",
+		      schid.ssid, schid.sch_no, event,
+		      sch?(disc?"disconnected":"normal"):"unknown",
 		      slow?"slow":"fast");
 	switch (event) {
 	case CIO_NO_PATH:
@@ -253,7 +268,7 @@ css_evaluate_subchannel(int irq, int slow)
 		sch->schib.pmcw.intparm = 0;
 		cio_modify(sch);
 		put_device(&sch->dev);
-		ret = css_probe_device(irq);
+		ret = css_probe_device(schid);
 	} else {
 		/*
 		 * We can't immediately deregister the disconnected
@@ -272,7 +287,7 @@ css_evaluate_subchannel(int irq, int slow)
 			device_trigger_reprobe(sch);
 			spin_unlock_irqrestore(&sch->lock, flags);
 		}
-		ret = sch ? 0 : css_probe_device(irq);
+		ret = sch ? 0 : css_probe_device(schid);
 		break;
 	default:
 		BUG();
@@ -281,28 +296,15 @@ css_evaluate_subchannel(int irq, int slow)
 	return ret;
 }
 
-static void
-css_rescan_devices(void)
+static int
+css_rescan_devices(struct subchannel_id schid, void *data)
 {
-	int irq, ret;
-
-	for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) {
-		ret = css_evaluate_subchannel(irq, 1);
-		/* No more memory. It doesn't make sense to continue. No
-		 * panic because this can happen in midflight and just
-		 * because we can't use a new device is no reason to crash
-		 * the system. */
-		if (ret == -ENOMEM)
-			break;
-		/* -ENXIO indicates that there are no more subchannels. */
-		if (ret == -ENXIO)
-			break;
-	}
+	return css_evaluate_subchannel(schid, 1);
 }
 
 struct slow_subchannel {
 	struct list_head slow_list;
-	unsigned long schid;
+	struct subchannel_id schid;
 };
 
 static LIST_HEAD(slow_subchannels_head);
@@ -315,7 +317,7 @@ css_trigger_slow_path(void)
 
 	if (need_rescan) {
 		need_rescan = 0;
-		css_rescan_devices();
+		for_each_subchannel(css_rescan_devices, NULL);
 		return;
 	}
 
@@ -354,23 +356,31 @@ css_reiterate_subchannels(void)
  * Called from the machine check handler for subchannel report words.
  */
 int
-css_process_crw(int irq)
+css_process_crw(int rsid1, int rsid2)
 {
 	int ret;
+	struct subchannel_id mchk_schid;
 
-	CIO_CRW_EVENT(2, "source is subchannel %04X\n", irq);
+	CIO_CRW_EVENT(2, "source is subchannel %04X, subsystem id %x\n",
+		      rsid1, rsid2);
 
 	if (need_rescan)
 		/* We need to iterate all subchannels anyway. */
 		return -EAGAIN;
+
+	init_subchannel_id(&mchk_schid);
+	mchk_schid.sch_no = rsid1;
+	if (rsid2 != 0)
+		mchk_schid.ssid = (rsid2 >> 8) & 3;
+
 	/*
 	 * Since we are always presented with IPI in the CRW, we have to
 	 * use stsch() to find out if the subchannel in question has come
 	 * or gone.
 	 */
-	ret = css_evaluate_subchannel(irq, 0);
+	ret = css_evaluate_subchannel(mchk_schid, 0);
 	if (ret == -EAGAIN) {
-		if (css_enqueue_subchannel_slow(irq)) {
+		if (css_enqueue_subchannel_slow(mchk_schid)) {
 			css_clear_subchannel_slow_list();
 			need_rescan = 1;
 		}
@@ -378,22 +388,83 @@ css_process_crw(int rsid1, int rsid2)
 	return ret;
 }
 
-static void __init
-css_generate_pgid(void)
+static int __init
+__init_channel_subsystem(struct subchannel_id schid, void *data)
 {
-	/* Let's build our path group ID here. */
-	if (css_characteristics_avail && css_general_characteristics.mcss)
-		global_pgid.cpu_addr = 0x8000;
+	struct subchannel *sch;
+	int ret;
+
+	if (cio_is_console(schid))
+		sch = cio_get_console_subchannel();
 	else {
+		sch = css_alloc_subchannel(schid);
+		if (IS_ERR(sch))
+			ret = PTR_ERR(sch);
+		else
+			ret = 0;
+		switch (ret) {
+		case 0:
+			break;
+		case -ENOMEM:
+			panic("Out of memory in init_channel_subsystem\n");
+		/* -ENXIO: no more subchannels. */
+		case -ENXIO:
+			return ret;
+		default:
+			return 0;
+		}
+	}
+	/*
+	 * We register ALL valid subchannels in ioinfo, even those
+	 * that have been present before init_channel_subsystem.
+	 * These subchannels can't have been registered yet (kmalloc
+	 * not working) so we do it now. This is true e.g. for the
+	 * console subchannel.
+	 */
+	css_register_subchannel(sch);
+	return 0;
+}
+
+static void __init
+css_generate_pgid(struct channel_subsystem *css, u32 tod_high)
+{
+	if (css_characteristics_avail && css_general_characteristics.mcss) {
+		css->global_pgid.pgid_high.ext_cssid.version = 0x80;
+		css->global_pgid.pgid_high.ext_cssid.cssid = css->cssid;
+	} else {
 #ifdef CONFIG_SMP
-		global_pgid.cpu_addr = hard_smp_processor_id();
+		css->global_pgid.pgid_high.cpu_addr = hard_smp_processor_id();
 #else
-		global_pgid.cpu_addr = 0;
+		css->global_pgid.pgid_high.cpu_addr = 0;
 #endif
 	}
-	global_pgid.cpu_id = ((cpuid_t *) __LC_CPUID)->ident;
-	global_pgid.cpu_model = ((cpuid_t *) __LC_CPUID)->machine;
-	global_pgid.tod_high = (__u32) (get_clock() >> 32);
+	css->global_pgid.cpu_id = ((cpuid_t *) __LC_CPUID)->ident;
+	css->global_pgid.cpu_model = ((cpuid_t *) __LC_CPUID)->machine;
+	css->global_pgid.tod_high = tod_high;
+
+}
+
+static void
+channel_subsystem_release(struct device *dev)
+{
+	struct channel_subsystem *css;
+
+	css = to_css(dev);
+	kfree(css);
+}
+
+static inline void __init
+setup_css(int nr)
+{
+	u32 tod_high;
+
+	memset(css[nr], 0, sizeof(struct channel_subsystem));
+	css[nr]->valid = 1;
+	css[nr]->cssid = nr;
+	sprintf(css[nr]->device.bus_id, "css%x", nr);
+	css[nr]->device.release = channel_subsystem_release;
+	tod_high = (u32) (get_clock() >> 32);
+	css_generate_pgid(css[nr], tod_high);
 }
 
 /*
@@ -404,53 +475,50 @@ css_generate_pgid(void)
404static int __init 475static int __init
405init_channel_subsystem (void) 476init_channel_subsystem (void)
406{ 477{
407 int ret, irq; 478 int ret, i;
408 479
409 if (chsc_determine_css_characteristics() == 0) 480 if (chsc_determine_css_characteristics() == 0)
410 css_characteristics_avail = 1; 481 css_characteristics_avail = 1;
411 482
412 css_generate_pgid();
413
414 if ((ret = bus_register(&css_bus_type))) 483 if ((ret = bus_register(&css_bus_type)))
415 goto out; 484 goto out;
416 if ((ret = device_register (&css_bus_device)))
417 goto out_bus;
418 485
486 /* Try to enable MSS. */
487 ret = chsc_enable_facility(CHSC_SDA_OC_MSS);
488 switch (ret) {
489 case 0: /* Success. */
490 max_ssid = __MAX_SSID;
491 break;
492 case -ENOMEM:
493 goto out_bus;
494 default:
495 max_ssid = 0;
496 }
497 /* Setup css structure. */
498 for (i = 0; i <= __MAX_CSSID; i++) {
499 css[i] = kmalloc(sizeof(struct channel_subsystem), GFP_KERNEL);
500 if (!css[i]) {
501 ret = -ENOMEM;
502 goto out_unregister;
503 }
504 setup_css(i);
505 ret = device_register(&css[i]->device);
506 if (ret)
507 goto out_free;
508 }
419 css_init_done = 1; 509 css_init_done = 1;
420 510
421 ctl_set_bit(6, 28); 511 ctl_set_bit(6, 28);
422 512
423 for (irq = 0; irq < __MAX_SUBCHANNELS; irq++) { 513 for_each_subchannel(__init_channel_subsystem, NULL);
424 struct subchannel *sch;
425
426 if (cio_is_console(irq))
427 sch = cio_get_console_subchannel();
428 else {
429 sch = css_alloc_subchannel(irq);
430 if (IS_ERR(sch))
431 ret = PTR_ERR(sch);
432 else
433 ret = 0;
434 if (ret == -ENOMEM)
435 panic("Out of memory in "
436 "init_channel_subsystem\n");
437 /* -ENXIO: no more subchannels. */
438 if (ret == -ENXIO)
439 break;
440 if (ret)
441 continue;
442 }
443 /*
444 * We register ALL valid subchannels in ioinfo, even those
445 * that have been present before init_channel_subsystem.
446 * These subchannels can't have been registered yet (kmalloc
447 * not working) so we do it now. This is true e.g. for the
448 * console subchannel.
449 */
450 css_register_subchannel(sch);
451 }
452 return 0; 514 return 0;
453 515out_free:
516 kfree(css[i]);
517out_unregister:
518 while (i > 0) {
519 i--;
520 device_unregister(&css[i]->device);
521 }
454out_bus: 522out_bus:
455 bus_unregister(&css_bus_type); 523 bus_unregister(&css_bus_type);
456out: 524out:
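init_channel_subsystem() now allocates and registers one channel_subsystem per cssid and unwinds cleanly: a registration failure frees the current element, then unregisters everything already registered before falling through to bus_unregister(). A standalone sketch of that allocate/register/unwind shape; device_register()/device_unregister() are stubbed and -ENOMEM is spelled out, since this model runs outside the kernel.

    #include <stdlib.h>
    #include <stdio.h>

    #define MAX_CSSID 0                 /* mirrors __MAX_CSSID */
    #define ENOMEM 12

    struct channel_subsystem { int cssid; };
    static struct channel_subsystem *css[MAX_CSSID + 1];

    static int device_register_stub(struct channel_subsystem *c) { return 0; }
    static void device_unregister_stub(struct channel_subsystem *c) { free(c); }

    static int init_channel_subsystems(void)
    {
            int i, ret;

            for (i = 0; i <= MAX_CSSID; i++) {
                    css[i] = calloc(1, sizeof(*css[i]));
                    if (!css[i]) {
                            ret = -ENOMEM;
                            goto out_unregister;
                    }
                    css[i]->cssid = i;
                    ret = device_register_stub(css[i]);
                    if (ret)
                            goto out_free;
            }
            return 0;
    out_free:
            free(css[i]);
    out_unregister:
            while (i > 0) {
                    i--;
                    device_unregister_stub(css[i]);
            }
            return ret;
    }

    int main(void)
    {
            printf("init: %d\n", init_channel_subsystems());
            return 0;
    }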
@@ -481,47 +549,8 @@ struct bus_type css_bus_type = {
481 549
482subsys_initcall(init_channel_subsystem); 550subsys_initcall(init_channel_subsystem);
483 551
484/*
485 * Register root devices for some drivers. The release function must not be
486 * in the device drivers, so we do it here.
487 */
488static void
489s390_root_dev_release(struct device *dev)
490{
491 kfree(dev);
492}
493
494struct device *
495s390_root_dev_register(const char *name)
496{
497 struct device *dev;
498 int ret;
499
500 if (!strlen(name))
501 return ERR_PTR(-EINVAL);
502 dev = kmalloc(sizeof(struct device), GFP_KERNEL);
503 if (!dev)
504 return ERR_PTR(-ENOMEM);
505 memset(dev, 0, sizeof(struct device));
506 strncpy(dev->bus_id, name, min(strlen(name), (size_t)BUS_ID_SIZE));
507 dev->release = s390_root_dev_release;
508 ret = device_register(dev);
509 if (ret) {
510 kfree(dev);
511 return ERR_PTR(ret);
512 }
513 return dev;
514}
515
516void
517s390_root_dev_unregister(struct device *dev)
518{
519 if (dev)
520 device_unregister(dev);
521}
522
523int 552int
524css_enqueue_subchannel_slow(unsigned long schid) 553css_enqueue_subchannel_slow(struct subchannel_id schid)
525{ 554{
526 struct slow_subchannel *new_slow_sch; 555 struct slow_subchannel *new_slow_sch;
527 unsigned long flags; 556 unsigned long flags;
@@ -564,6 +593,4 @@ css_slow_subchannels_exist(void)
564 593
565MODULE_LICENSE("GPL"); 594MODULE_LICENSE("GPL");
566EXPORT_SYMBOL(css_bus_type); 595EXPORT_SYMBOL(css_bus_type);
567EXPORT_SYMBOL(s390_root_dev_register);
568EXPORT_SYMBOL(s390_root_dev_unregister);
569EXPORT_SYMBOL_GPL(css_characteristics_avail); 596EXPORT_SYMBOL_GPL(css_characteristics_avail);
diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h
index 2004a6c49388..251ebd7a7d3a 100644
--- a/drivers/s390/cio/css.h
+++ b/drivers/s390/cio/css.h
@@ -6,6 +6,8 @@
6 6
7#include <asm/cio.h> 7#include <asm/cio.h>
8 8
9#include "schid.h"
10
9/* 11/*
10 * path grouping stuff 12 * path grouping stuff
11 */ 13 */
@@ -33,19 +35,25 @@ struct path_state {
33 __u8 resvd : 3; /* reserved */ 35 __u8 resvd : 3; /* reserved */
34} __attribute__ ((packed)); 36} __attribute__ ((packed));
35 37
38struct extended_cssid {
39 u8 version;
40 u8 cssid;
41} __attribute__ ((packed));
42
36struct pgid { 43struct pgid {
37 union { 44 union {
38 __u8 fc; /* SPID function code */ 45 __u8 fc; /* SPID function code */
39 struct path_state ps; /* SNID path state */ 46 struct path_state ps; /* SNID path state */
40 } inf; 47 } inf;
41 __u32 cpu_addr : 16; /* CPU address */ 48 union {
49 __u32 cpu_addr : 16; /* CPU address */
50 struct extended_cssid ext_cssid;
51 } pgid_high;
42 __u32 cpu_id : 24; /* CPU identification */ 52 __u32 cpu_id : 24; /* CPU identification */
43 __u32 cpu_model : 16; /* CPU model */ 53 __u32 cpu_model : 16; /* CPU model */
44 __u32 tod_high; /* high word TOD clock */ 54 __u32 tod_high; /* high word TOD clock */
45} __attribute__ ((packed)); 55} __attribute__ ((packed));
46 56
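The high halfword of the path-group id is now a union: without multiple channel subsystems it still holds the CPU address, with them it holds a version byte plus the cssid. On big-endian s390 the old special value 0x8000 put 0x80 into the very byte the new version field occupies, so the two layouts agree. A host-side sketch of how css_generate_pgid() (earlier in this diff) picks the union arm; the types are shortened and the packed big-endian layout is not reproduced here.

    #include <stdint.h>
    #include <stdio.h>

    struct extended_cssid {
            uint8_t version;
            uint8_t cssid;
    };

    union pgid_high {
            uint16_t cpu_addr;              /* CPU address (no MCSS) */
            struct extended_cssid ext_cssid;/* version/cssid (MCSS) */
    };

    /* Mirrors css_generate_pgid(): pick the arm by the mcss facility. */
    static union pgid_high make_pgid_high(int mcss, uint8_t cssid,
                                          uint16_t cpu_addr)
    {
            union pgid_high h;

            if (mcss) {
                    h.ext_cssid.version = 0x80;
                    h.ext_cssid.cssid = cssid;
            } else {
                    h.cpu_addr = cpu_addr;
            }
            return h;
    }

    int main(void)
    {
            union pgid_high h = make_pgid_high(1, 0, 0);

            printf("%02x %02x\n", h.ext_cssid.version, h.ext_cssid.cssid);
            return 0;
    }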
47extern struct pgid global_pgid;
48
49#define MAX_CIWS 8 57#define MAX_CIWS 8
50 58
51/* 59/*
@@ -68,7 +76,8 @@ struct ccw_device_private {
68 atomic_t onoff; 76 atomic_t onoff;
69 unsigned long registered; 77 unsigned long registered;
70 __u16 devno; /* device number */ 78 __u16 devno; /* device number */
71 __u16 irq; /* subchannel number */ 79 __u16 sch_no; /* subchannel number */
80 __u8 ssid; /* subchannel set id */
72 __u8 imask; /* lpm mask for SNID/SID/SPGID */ 81 __u8 imask; /* lpm mask for SNID/SID/SPGID */
73 int iretry; /* retry counter SNID/SID/SPGID */ 82 int iretry; /* retry counter SNID/SID/SPGID */
74 struct { 83 struct {
@@ -121,15 +130,27 @@ struct css_driver {
121extern struct bus_type css_bus_type; 130extern struct bus_type css_bus_type;
122extern struct css_driver io_subchannel_driver; 131extern struct css_driver io_subchannel_driver;
123 132
124int css_probe_device(int irq); 133extern int css_probe_device(struct subchannel_id);
125extern struct subchannel * get_subchannel_by_schid(int irq); 134extern struct subchannel * get_subchannel_by_schid(struct subchannel_id);
126extern unsigned int highest_subchannel;
127extern int css_init_done; 135extern int css_init_done;
128 136extern int for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *);
129#define __MAX_SUBCHANNELS 65536 137
138#define __MAX_SUBCHANNEL 65535
139#define __MAX_SSID 3
140#define __MAX_CHPID 255
141#define __MAX_CSSID 0
142
143struct channel_subsystem {
144 u8 cssid;
145 int valid;
146 struct channel_path *chps[__MAX_CHPID];
147 struct device device;
148 struct pgid global_pgid;
149};
150#define to_css(dev) container_of(dev, struct channel_subsystem, device)
130 151
131extern struct bus_type css_bus_type; 152extern struct bus_type css_bus_type;
132extern struct device css_bus_device; 153extern struct channel_subsystem *css[];
133 154
134/* Some helper functions for disconnected state. */ 155/* Some helper functions for disconnected state. */
135int device_is_disconnected(struct subchannel *); 156int device_is_disconnected(struct subchannel *);
@@ -144,7 +165,7 @@ void device_set_waiting(struct subchannel *);
144void device_kill_pending_timer(struct subchannel *); 165void device_kill_pending_timer(struct subchannel *);
145 166
146/* Helper functions to build lists for the slow path. */ 167/* Helper functions to build lists for the slow path. */
147int css_enqueue_subchannel_slow(unsigned long schid); 168extern int css_enqueue_subchannel_slow(struct subchannel_id schid);
148void css_walk_subchannel_slow_list(void (*fn)(unsigned long)); 169void css_walk_subchannel_slow_list(void (*fn)(unsigned long));
149void css_clear_subchannel_slow_list(void); 170void css_clear_subchannel_slow_list(void);
150int css_slow_subchannels_exist(void); 171int css_slow_subchannels_exist(void);
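struct channel_subsystem embeds its struct device, and to_css() is the usual container_of() recovery from the embedded member back to its container, which is exactly what channel_subsystem_release() in css.c relies on. A standalone illustration, with the members reduced to what the macro needs:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct device { const char *bus_id; };

    struct channel_subsystem {
            unsigned char cssid;
            struct device device;       /* embedded, not pointed-to */
    };

    #define to_css(dev) container_of(dev, struct channel_subsystem, device)

    /* What channel_subsystem_release() does with the device pointer. */
    static void release(struct device *dev)
    {
            struct channel_subsystem *css = to_css(dev);

            printf("releasing css%x\n", css->cssid);
    }

    int main(void)
    {
            struct channel_subsystem c = {
                    .cssid = 0,
                    .device = { .bus_id = "css0" },
            };

            release(&c.device);
            return 0;
    }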
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 85908cacc3b8..fa3e4c0a2536 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * drivers/s390/cio/device.c 2 * drivers/s390/cio/device.c
3 * bus driver for ccw devices 3 * bus driver for ccw devices
4 * $Revision: 1.131 $ 4 * $Revision: 1.137 $
5 * 5 *
6 * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH, 6 * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH,
7 * IBM Corporation 7 * IBM Corporation
@@ -374,7 +374,7 @@ online_store (struct device *dev, struct device_attribute *attr, const char *buf
374 int i, force, ret; 374 int i, force, ret;
375 char *tmp; 375 char *tmp;
376 376
377 if (atomic_compare_and_swap(0, 1, &cdev->private->onoff)) 377 if (atomic_cmpxchg(&cdev->private->onoff, 0, 1) != 0)
378 return -EAGAIN; 378 return -EAGAIN;
379 379
380 if (cdev->drv && !try_module_get(cdev->drv->owner)) { 380 if (cdev->drv && !try_module_get(cdev->drv->owner)) {
@@ -535,7 +535,8 @@ ccw_device_register(struct ccw_device *cdev)
535} 535}
536 536
537struct match_data { 537struct match_data {
538 unsigned int devno; 538 unsigned int devno;
539 unsigned int ssid;
539 struct ccw_device * sibling; 540 struct ccw_device * sibling;
540}; 541};
541 542
@@ -548,6 +549,7 @@ match_devno(struct device * dev, void * data)
548 cdev = to_ccwdev(dev); 549 cdev = to_ccwdev(dev);
549 if ((cdev->private->state == DEV_STATE_DISCONNECTED) && 550 if ((cdev->private->state == DEV_STATE_DISCONNECTED) &&
550 (cdev->private->devno == d->devno) && 551 (cdev->private->devno == d->devno) &&
552 (cdev->private->ssid == d->ssid) &&
551 (cdev != d->sibling)) { 553 (cdev != d->sibling)) {
552 cdev->private->state = DEV_STATE_NOT_OPER; 554 cdev->private->state = DEV_STATE_NOT_OPER;
553 return 1; 555 return 1;
@@ -556,11 +558,13 @@ match_devno(struct device * dev, void * data)
556} 558}
557 559
558static struct ccw_device * 560static struct ccw_device *
559get_disc_ccwdev_by_devno(unsigned int devno, struct ccw_device *sibling) 561get_disc_ccwdev_by_devno(unsigned int devno, unsigned int ssid,
562 struct ccw_device *sibling)
560{ 563{
561 struct device *dev; 564 struct device *dev;
562 struct match_data data = { 565 struct match_data data = {
563 .devno = devno, 566 .devno = devno,
567 .ssid = ssid,
564 .sibling = sibling, 568 .sibling = sibling,
565 }; 569 };
566 570
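Because a device number is now only unique within its subchannel set, the disconnected-device lookup keys on the (ssid, devno) pair. A standalone model of the match predicate, with states and fields pared down to the comparison itself:

    #include <stdio.h>

    enum state { DEV_STATE_DISCONNECTED, DEV_STATE_ONLINE };

    struct ccw_device {
            enum state state;
            unsigned int devno;
            unsigned int ssid;
    };

    struct match_data {
            unsigned int devno;
            unsigned int ssid;
            const struct ccw_device *sibling;
    };

    /* Mirrors match_devno(): same set, same number, disconnected,
     * and not the device being re-registered itself. */
    static int match_devno(const struct ccw_device *cdev,
                           const struct match_data *d)
    {
            return cdev->state == DEV_STATE_DISCONNECTED &&
                   cdev->devno == d->devno &&
                   cdev->ssid == d->ssid &&
                   cdev != d->sibling;
    }

    int main(void)
    {
            struct ccw_device a = { DEV_STATE_DISCONNECTED, 0x4711, 0 };
            struct ccw_device b = { DEV_STATE_DISCONNECTED, 0x4711, 1 };
            struct match_data d = { 0x4711, 1, NULL };

            printf("%d %d\n", match_devno(&a, &d), match_devno(&b, &d));
            return 0;
    }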
@@ -616,13 +620,13 @@ ccw_device_do_unreg_rereg(void *data)
616 620
617 need_rename = 1; 621 need_rename = 1;
618 other_cdev = get_disc_ccwdev_by_devno(sch->schib.pmcw.dev, 622 other_cdev = get_disc_ccwdev_by_devno(sch->schib.pmcw.dev,
619 cdev); 623 sch->schid.ssid, cdev);
620 if (other_cdev) { 624 if (other_cdev) {
621 struct subchannel *other_sch; 625 struct subchannel *other_sch;
622 626
623 other_sch = to_subchannel(other_cdev->dev.parent); 627 other_sch = to_subchannel(other_cdev->dev.parent);
624 if (get_device(&other_sch->dev)) { 628 if (get_device(&other_sch->dev)) {
625 stsch(other_sch->irq, &other_sch->schib); 629 stsch(other_sch->schid, &other_sch->schib);
626 if (other_sch->schib.pmcw.dnv) { 630 if (other_sch->schib.pmcw.dnv) {
627 other_sch->schib.pmcw.intparm = 0; 631 other_sch->schib.pmcw.intparm = 0;
628 cio_modify(other_sch); 632 cio_modify(other_sch);
@@ -639,8 +643,8 @@ ccw_device_do_unreg_rereg(void *data)
639 if (test_and_clear_bit(1, &cdev->private->registered)) 643 if (test_and_clear_bit(1, &cdev->private->registered))
640 device_del(&cdev->dev); 644 device_del(&cdev->dev);
641 if (need_rename) 645 if (need_rename)
642 snprintf (cdev->dev.bus_id, BUS_ID_SIZE, "0.0.%04x", 646 snprintf (cdev->dev.bus_id, BUS_ID_SIZE, "0.%x.%04x",
643 sch->schib.pmcw.dev); 647 sch->schid.ssid, sch->schib.pmcw.dev);
644 PREPARE_WORK(&cdev->private->kick_work, 648 PREPARE_WORK(&cdev->private->kick_work,
645 ccw_device_add_changed, (void *)cdev); 649 ccw_device_add_changed, (void *)cdev);
646 queue_work(ccw_device_work, &cdev->private->kick_work); 650 queue_work(ccw_device_work, &cdev->private->kick_work);
@@ -769,18 +773,20 @@ io_subchannel_recog(struct ccw_device *cdev, struct subchannel *sch)
769 sch->dev.driver_data = cdev; 773 sch->dev.driver_data = cdev;
770 sch->driver = &io_subchannel_driver; 774 sch->driver = &io_subchannel_driver;
771 cdev->ccwlock = &sch->lock; 775 cdev->ccwlock = &sch->lock;
776
772 /* Init private data. */ 777 /* Init private data. */
773 priv = cdev->private; 778 priv = cdev->private;
774 priv->devno = sch->schib.pmcw.dev; 779 priv->devno = sch->schib.pmcw.dev;
775 priv->irq = sch->irq; 780 priv->ssid = sch->schid.ssid;
781 priv->sch_no = sch->schid.sch_no;
776 priv->state = DEV_STATE_NOT_OPER; 782 priv->state = DEV_STATE_NOT_OPER;
777 INIT_LIST_HEAD(&priv->cmb_list); 783 INIT_LIST_HEAD(&priv->cmb_list);
778 init_waitqueue_head(&priv->wait_q); 784 init_waitqueue_head(&priv->wait_q);
779 init_timer(&priv->timer); 785 init_timer(&priv->timer);
780 786
781 /* Set an initial name for the device. */ 787 /* Set an initial name for the device. */
782 snprintf (cdev->dev.bus_id, BUS_ID_SIZE, "0.0.%04x", 788 snprintf (cdev->dev.bus_id, BUS_ID_SIZE, "0.%x.%04x",
783 sch->schib.pmcw.dev); 789 sch->schid.ssid, sch->schib.pmcw.dev);
784 790
785 /* Increase counter of devices currently in recognition. */ 791 /* Increase counter of devices currently in recognition. */
786 atomic_inc(&ccw_device_init_count); 792 atomic_inc(&ccw_device_init_count);
@@ -951,7 +957,7 @@ io_subchannel_shutdown(struct device *dev)
951 sch = to_subchannel(dev); 957 sch = to_subchannel(dev);
952 cdev = dev->driver_data; 958 cdev = dev->driver_data;
953 959
954 if (cio_is_console(sch->irq)) 960 if (cio_is_console(sch->schid))
955 return; 961 return;
956 if (!sch->schib.pmcw.ena) 962 if (!sch->schib.pmcw.ena)
957 /* Nothing to do. */ 963 /* Nothing to do. */
@@ -986,10 +992,6 @@ ccw_device_console_enable (struct ccw_device *cdev, struct subchannel *sch)
986 cdev->dev = (struct device) { 992 cdev->dev = (struct device) {
987 .parent = &sch->dev, 993 .parent = &sch->dev,
988 }; 994 };
989 /* Initialize the subchannel structure */
990 sch->dev.parent = &css_bus_device;
991 sch->dev.bus = &css_bus_type;
992
993 rc = io_subchannel_recog(cdev, sch); 995 rc = io_subchannel_recog(cdev, sch);
994 if (rc) 996 if (rc)
995 return rc; 997 return rc;
@@ -1146,6 +1148,16 @@ ccw_driver_unregister (struct ccw_driver *cdriver)
1146 driver_unregister(&cdriver->driver); 1148 driver_unregister(&cdriver->driver);
1147} 1149}
1148 1150
1151/* Helper func for qdio. */
1152struct subchannel_id
1153ccw_device_get_subchannel_id(struct ccw_device *cdev)
1154{
1155 struct subchannel *sch;
1156
1157 sch = to_subchannel(cdev->dev.parent);
1158 return sch->schid;
1159}
1160
1149MODULE_LICENSE("GPL"); 1161MODULE_LICENSE("GPL");
1150EXPORT_SYMBOL(ccw_device_set_online); 1162EXPORT_SYMBOL(ccw_device_set_online);
1151EXPORT_SYMBOL(ccw_device_set_offline); 1163EXPORT_SYMBOL(ccw_device_set_offline);
@@ -1155,3 +1167,4 @@ EXPORT_SYMBOL(get_ccwdev_by_busid);
1155EXPORT_SYMBOL(ccw_bus_type); 1167EXPORT_SYMBOL(ccw_bus_type);
1156EXPORT_SYMBOL(ccw_device_work); 1168EXPORT_SYMBOL(ccw_device_work);
1157EXPORT_SYMBOL(ccw_device_notify_work); 1169EXPORT_SYMBOL(ccw_device_notify_work);
1170EXPORT_SYMBOL_GPL(ccw_device_get_subchannel_id);
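The new helper gives qdio the full subchannel id rather than the bare number from the private data. The id lives in the parent subchannel, so the helper is just a parent walk; the container layout below is invented for the sketch (the kernel goes through to_subchannel(cdev->dev.parent)).

    #include <stdio.h>

    struct subchannel_id { unsigned int ssid, sch_no; };

    struct subchannel {
            struct subchannel_id schid;
    };

    struct ccw_device {
            struct subchannel *parent;  /* stands in for dev.parent */
    };

    /* Mirrors ccw_device_get_subchannel_id(): the id lives in the
     * parent subchannel, not in the ccw_device itself. */
    static struct subchannel_id
    ccw_device_get_subchannel_id(struct ccw_device *cdev)
    {
            return cdev->parent->schid;
    }

    int main(void)
    {
            struct subchannel sch = { { 1, 0x0042 } };
            struct ccw_device cdev = { &sch };
            struct subchannel_id id = ccw_device_get_subchannel_id(&cdev);

            printf("0.%x.%04x\n", id.ssid, id.sch_no);
            return 0;
    }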
diff --git a/drivers/s390/cio/device.h b/drivers/s390/cio/device.h
index a3aa056d7245..11587ebb7289 100644
--- a/drivers/s390/cio/device.h
+++ b/drivers/s390/cio/device.h
@@ -110,6 +110,7 @@ int ccw_device_stlck(struct ccw_device *);
110 110
111/* qdio needs this. */ 111/* qdio needs this. */
112void ccw_device_set_timeout(struct ccw_device *, int); 112void ccw_device_set_timeout(struct ccw_device *, int);
113extern struct subchannel_id ccw_device_get_subchannel_id(struct ccw_device *);
113 114
114void retry_set_schib(struct ccw_device *cdev); 115void retry_set_schib(struct ccw_device *cdev);
115#endif 116#endif
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index c1c89f4fd4e3..23d12b65e5fa 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -133,7 +133,7 @@ ccw_device_cancel_halt_clear(struct ccw_device *cdev)
133 int ret; 133 int ret;
134 134
135 sch = to_subchannel(cdev->dev.parent); 135 sch = to_subchannel(cdev->dev.parent);
136 ret = stsch(sch->irq, &sch->schib); 136 ret = stsch(sch->schid, &sch->schib);
137 if (ret || !sch->schib.pmcw.dnv) 137 if (ret || !sch->schib.pmcw.dnv)
138 return -ENODEV; 138 return -ENODEV;
139 if (!sch->schib.pmcw.ena || sch->schib.scsw.actl == 0) 139 if (!sch->schib.pmcw.ena || sch->schib.scsw.actl == 0)
@@ -231,7 +231,7 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
231 * through ssch() and the path information is up to date. 231 * through ssch() and the path information is up to date.
232 */ 232 */
233 old_lpm = sch->lpm; 233 old_lpm = sch->lpm;
234 stsch(sch->irq, &sch->schib); 234 stsch(sch->schid, &sch->schib);
235 sch->lpm = sch->schib.pmcw.pim & 235 sch->lpm = sch->schib.pmcw.pim &
236 sch->schib.pmcw.pam & 236 sch->schib.pmcw.pam &
237 sch->schib.pmcw.pom & 237 sch->schib.pmcw.pom &
@@ -257,8 +257,9 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
257 switch (state) { 257 switch (state) {
258 case DEV_STATE_NOT_OPER: 258 case DEV_STATE_NOT_OPER:
259 CIO_DEBUG(KERN_WARNING, 2, 259 CIO_DEBUG(KERN_WARNING, 2,
260 "SenseID : unknown device %04x on subchannel %04x\n", 260 "SenseID : unknown device %04x on subchannel "
261 cdev->private->devno, sch->irq); 261 "0.%x.%04x\n", cdev->private->devno,
262 sch->schid.ssid, sch->schid.sch_no);
262 break; 263 break;
263 case DEV_STATE_OFFLINE: 264 case DEV_STATE_OFFLINE:
264 if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) { 265 if (cdev->private->state == DEV_STATE_DISCONNECTED_SENSE_ID) {
@@ -282,16 +283,18 @@ ccw_device_recog_done(struct ccw_device *cdev, int state)
282 return; 283 return;
283 } 284 }
284 /* Issue device info message. */ 285 /* Issue device info message. */
285 CIO_DEBUG(KERN_INFO, 2, "SenseID : device %04x reports: " 286 CIO_DEBUG(KERN_INFO, 2, "SenseID : device 0.%x.%04x reports: "
286 "CU Type/Mod = %04X/%02X, Dev Type/Mod = " 287 "CU Type/Mod = %04X/%02X, Dev Type/Mod = "
287 "%04X/%02X\n", cdev->private->devno, 288 "%04X/%02X\n",
289 cdev->private->ssid, cdev->private->devno,
288 cdev->id.cu_type, cdev->id.cu_model, 290 cdev->id.cu_type, cdev->id.cu_model,
289 cdev->id.dev_type, cdev->id.dev_model); 291 cdev->id.dev_type, cdev->id.dev_model);
290 break; 292 break;
291 case DEV_STATE_BOXED: 293 case DEV_STATE_BOXED:
292 CIO_DEBUG(KERN_WARNING, 2, 294 CIO_DEBUG(KERN_WARNING, 2,
293 "SenseID : boxed device %04x on subchannel %04x\n", 295 "SenseID : boxed device %04x on subchannel "
294 cdev->private->devno, sch->irq); 296 "0.%x.%04x\n", cdev->private->devno,
297 sch->schid.ssid, sch->schid.sch_no);
295 break; 298 break;
296 } 299 }
297 cdev->private->state = state; 300 cdev->private->state = state;
@@ -359,7 +362,7 @@ ccw_device_done(struct ccw_device *cdev, int state)
359 if (state == DEV_STATE_BOXED) 362 if (state == DEV_STATE_BOXED)
360 CIO_DEBUG(KERN_WARNING, 2, 363 CIO_DEBUG(KERN_WARNING, 2,
361 "Boxed device %04x on subchannel %04x\n", 364 "Boxed device %04x on subchannel %04x\n",
362 cdev->private->devno, sch->irq); 365 cdev->private->devno, sch->schid.sch_no);
363 366
364 if (cdev->private->flags.donotify) { 367 if (cdev->private->flags.donotify) {
365 cdev->private->flags.donotify = 0; 368 cdev->private->flags.donotify = 0;
@@ -592,7 +595,7 @@ ccw_device_offline(struct ccw_device *cdev)
592 struct subchannel *sch; 595 struct subchannel *sch;
593 596
594 sch = to_subchannel(cdev->dev.parent); 597 sch = to_subchannel(cdev->dev.parent);
595 if (stsch(sch->irq, &sch->schib) || !sch->schib.pmcw.dnv) 598 if (stsch(sch->schid, &sch->schib) || !sch->schib.pmcw.dnv)
596 return -ENODEV; 599 return -ENODEV;
597 if (cdev->private->state != DEV_STATE_ONLINE) { 600 if (cdev->private->state != DEV_STATE_ONLINE) {
598 if (sch->schib.scsw.actl != 0) 601 if (sch->schib.scsw.actl != 0)
@@ -711,7 +714,7 @@ ccw_device_online_verify(struct ccw_device *cdev, enum dev_event dev_event)
711 * Since we might not just be coming from an interrupt from the 714 * Since we might not just be coming from an interrupt from the
712 * subchannel we have to update the schib. 715 * subchannel we have to update the schib.
713 */ 716 */
714 stsch(sch->irq, &sch->schib); 717 stsch(sch->schid, &sch->schib);
715 718
716 if (sch->schib.scsw.actl != 0 || 719 if (sch->schib.scsw.actl != 0 ||
717 (cdev->private->irb.scsw.stctl & SCSW_STCTL_STATUS_PEND)) { 720 (cdev->private->irb.scsw.stctl & SCSW_STCTL_STATUS_PEND)) {
@@ -923,7 +926,7 @@ ccw_device_wait4io_irq(struct ccw_device *cdev, enum dev_event dev_event)
923 926
924 /* Iff device is idle, reset timeout. */ 927 /* Iff device is idle, reset timeout. */
925 sch = to_subchannel(cdev->dev.parent); 928 sch = to_subchannel(cdev->dev.parent);
926 if (!stsch(sch->irq, &sch->schib)) 929 if (!stsch(sch->schid, &sch->schib))
927 if (sch->schib.scsw.actl == 0) 930 if (sch->schib.scsw.actl == 0)
928 ccw_device_set_timeout(cdev, 0); 931 ccw_device_set_timeout(cdev, 0);
929 /* Call the handler. */ 932 /* Call the handler. */
@@ -1035,7 +1038,7 @@ device_trigger_reprobe(struct subchannel *sch)
1035 return; 1038 return;
1036 1039
1037 /* Update some values. */ 1040 /* Update some values. */
1038 if (stsch(sch->irq, &sch->schib)) 1041 if (stsch(sch->schid, &sch->schib))
1039 return; 1042 return;
1040 1043
1041 /* 1044 /*
diff --git a/drivers/s390/cio/device_id.c b/drivers/s390/cio/device_id.c
index 0e68fb511dc9..04ceba343db8 100644
--- a/drivers/s390/cio/device_id.c
+++ b/drivers/s390/cio/device_id.c
@@ -27,7 +27,7 @@
27/* 27/*
28 * diag210 is used under VM to get information about a virtual device 28 * diag210 is used under VM to get information about a virtual device
29 */ 29 */
30#ifdef CONFIG_ARCH_S390X 30#ifdef CONFIG_64BIT
31int 31int
32diag210(struct diag210 * addr) 32diag210(struct diag210 * addr)
33{ 33{
@@ -256,16 +256,17 @@ ccw_device_check_sense_id(struct ccw_device *cdev)
256 * sense id information. So, for intervention required, 256 * sense id information. So, for intervention required,
257 * we use the "whack it until it talks" strategy... 257 * we use the "whack it until it talks" strategy...
258 */ 258 */
259 CIO_MSG_EVENT(2, "SenseID : device %04x on Subchannel %04x " 259 CIO_MSG_EVENT(2, "SenseID : device %04x on Subchannel "
260 "reports cmd reject\n", 260 "0.%x.%04x reports cmd reject\n",
261 cdev->private->devno, sch->irq); 261 cdev->private->devno, sch->schid.ssid,
262 sch->schid.sch_no);
262 return -EOPNOTSUPP; 263 return -EOPNOTSUPP;
263 } 264 }
264 if (irb->esw.esw0.erw.cons) { 265 if (irb->esw.esw0.erw.cons) {
265 CIO_MSG_EVENT(2, "SenseID : UC on dev %04x, " 266 CIO_MSG_EVENT(2, "SenseID : UC on dev 0.%x.%04x, "
266 "lpum %02X, cnt %02d, sns :" 267 "lpum %02X, cnt %02d, sns :"
267 " %02X%02X%02X%02X %02X%02X%02X%02X ...\n", 268 " %02X%02X%02X%02X %02X%02X%02X%02X ...\n",
268 cdev->private->devno, 269 cdev->private->ssid, cdev->private->devno,
269 irb->esw.esw0.sublog.lpum, 270 irb->esw.esw0.sublog.lpum,
270 irb->esw.esw0.erw.scnt, 271 irb->esw.esw0.erw.scnt,
271 irb->ecw[0], irb->ecw[1], 272 irb->ecw[0], irb->ecw[1],
@@ -277,16 +278,17 @@ ccw_device_check_sense_id(struct ccw_device *cdev)
277 if (irb->scsw.cc == 3) { 278 if (irb->scsw.cc == 3) {
278 if ((sch->orb.lpm & 279 if ((sch->orb.lpm &
279 sch->schib.pmcw.pim & sch->schib.pmcw.pam) != 0) 280 sch->schib.pmcw.pim & sch->schib.pmcw.pam) != 0)
280 CIO_MSG_EVENT(2, "SenseID : path %02X for device %04x on" 281 CIO_MSG_EVENT(2, "SenseID : path %02X for device %04x "
281 " subchannel %04x is 'not operational'\n", 282 "on subchannel 0.%x.%04x is "
282 sch->orb.lpm, cdev->private->devno, 283 "'not operational'\n", sch->orb.lpm,
283 sch->irq); 284 cdev->private->devno, sch->schid.ssid,
285 sch->schid.sch_no);
284 return -EACCES; 286 return -EACCES;
285 } 287 }
286 /* Hmm, whatever happened, try again. */ 288 /* Hmm, whatever happened, try again. */
287 CIO_MSG_EVENT(2, "SenseID : start_IO() for device %04x on " 289 CIO_MSG_EVENT(2, "SenseID : start_IO() for device %04x on "
288 "subchannel %04x returns status %02X%02X\n", 290 "subchannel 0.%x.%04x returns status %02X%02X\n",
289 cdev->private->devno, sch->irq, 291 cdev->private->devno, sch->schid.ssid, sch->schid.sch_no,
290 irb->scsw.dstat, irb->scsw.cstat); 292 irb->scsw.dstat, irb->scsw.cstat);
291 return -EAGAIN; 293 return -EAGAIN;
292} 294}
diff --git a/drivers/s390/cio/device_ops.c b/drivers/s390/cio/device_ops.c
index 85a3026e6900..143b6c25a4e6 100644
--- a/drivers/s390/cio/device_ops.c
+++ b/drivers/s390/cio/device_ops.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * drivers/s390/cio/device_ops.c 2 * drivers/s390/cio/device_ops.c
3 * 3 *
4 * $Revision: 1.57 $ 4 * $Revision: 1.58 $
5 * 5 *
6 * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH, 6 * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH,
7 * IBM Corporation 7 * IBM Corporation
@@ -570,7 +570,7 @@ ccw_device_get_chp_desc(struct ccw_device *cdev, int chp_no)
570int 570int
571_ccw_device_get_subchannel_number(struct ccw_device *cdev) 571_ccw_device_get_subchannel_number(struct ccw_device *cdev)
572{ 572{
573 return cdev->private->irq; 573 return cdev->private->sch_no;
574} 574}
575 575
576int 576int
diff --git a/drivers/s390/cio/device_pgid.c b/drivers/s390/cio/device_pgid.c
index 0adac8a67331..052832d03d38 100644
--- a/drivers/s390/cio/device_pgid.c
+++ b/drivers/s390/cio/device_pgid.c
@@ -22,6 +22,7 @@
22#include "cio_debug.h" 22#include "cio_debug.h"
23#include "css.h" 23#include "css.h"
24#include "device.h" 24#include "device.h"
25#include "ioasm.h"
25 26
26/* 27/*
27 * Start Sense Path Group ID helper function. Used in ccw_device_recog 28 * Start Sense Path Group ID helper function. Used in ccw_device_recog
@@ -56,10 +57,10 @@ __ccw_device_sense_pgid_start(struct ccw_device *cdev)
56 if (ret != -EACCES) 57 if (ret != -EACCES)
57 return ret; 58 return ret;
58 CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel " 59 CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel "
59 "%04x, lpm %02X, became 'not " 60 "0.%x.%04x, lpm %02X, became 'not "
60 "operational'\n", 61 "operational'\n",
61 cdev->private->devno, sch->irq, 62 cdev->private->devno, sch->schid.ssid,
62 cdev->private->imask); 63 sch->schid.sch_no, cdev->private->imask);
63 64
64 } 65 }
65 cdev->private->imask >>= 1; 66 cdev->private->imask >>= 1;
@@ -105,10 +106,10 @@ __ccw_device_check_sense_pgid(struct ccw_device *cdev)
105 return -EOPNOTSUPP; 106 return -EOPNOTSUPP;
106 } 107 }
107 if (irb->esw.esw0.erw.cons) { 108 if (irb->esw.esw0.erw.cons) {
108 CIO_MSG_EVENT(2, "SNID - device %04x, unit check, " 109 CIO_MSG_EVENT(2, "SNID - device 0.%x.%04x, unit check, "
109 "lpum %02X, cnt %02d, sns : " 110 "lpum %02X, cnt %02d, sns : "
110 "%02X%02X%02X%02X %02X%02X%02X%02X ...\n", 111 "%02X%02X%02X%02X %02X%02X%02X%02X ...\n",
111 cdev->private->devno, 112 cdev->private->ssid, cdev->private->devno,
112 irb->esw.esw0.sublog.lpum, 113 irb->esw.esw0.sublog.lpum,
113 irb->esw.esw0.erw.scnt, 114 irb->esw.esw0.erw.scnt,
114 irb->ecw[0], irb->ecw[1], 115 irb->ecw[0], irb->ecw[1],
@@ -118,15 +119,17 @@ __ccw_device_check_sense_pgid(struct ccw_device *cdev)
118 return -EAGAIN; 119 return -EAGAIN;
119 } 120 }
120 if (irb->scsw.cc == 3) { 121 if (irb->scsw.cc == 3) {
121 CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel " 122 CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel 0.%x.%04x,"
122 "%04x, lpm %02X, became 'not operational'\n", 123 " lpm %02X, became 'not operational'\n",
123 cdev->private->devno, sch->irq, sch->orb.lpm); 124 cdev->private->devno, sch->schid.ssid,
125 sch->schid.sch_no, sch->orb.lpm);
124 return -EACCES; 126 return -EACCES;
125 } 127 }
126 if (cdev->private->pgid.inf.ps.state2 == SNID_STATE2_RESVD_ELSE) { 128 if (cdev->private->pgid.inf.ps.state2 == SNID_STATE2_RESVD_ELSE) {
127 CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel %04x " 129 CIO_MSG_EVENT(2, "SNID - Device %04x on Subchannel 0.%x.%04x "
128 "is reserved by someone else\n", 130 "is reserved by someone else\n",
129 cdev->private->devno, sch->irq); 131 cdev->private->devno, sch->schid.ssid,
132 sch->schid.sch_no);
130 return -EUSERS; 133 return -EUSERS;
131 } 134 }
132 return 0; 135 return 0;
@@ -162,7 +165,7 @@ ccw_device_sense_pgid_irq(struct ccw_device *cdev, enum dev_event dev_event)
162 /* 0, -ETIME, -EOPNOTSUPP, -EAGAIN, -EACCES or -EUSERS */ 165 /* 0, -ETIME, -EOPNOTSUPP, -EAGAIN, -EACCES or -EUSERS */
163 case 0: /* Sense Path Group ID successful. */ 166 case 0: /* Sense Path Group ID successful. */
164 if (cdev->private->pgid.inf.ps.state1 == SNID_STATE1_RESET) 167 if (cdev->private->pgid.inf.ps.state1 == SNID_STATE1_RESET)
165 memcpy(&cdev->private->pgid, &global_pgid, 168 memcpy(&cdev->private->pgid, &css[0]->global_pgid,
166 sizeof(struct pgid)); 169 sizeof(struct pgid));
167 ccw_device_sense_pgid_done(cdev, 0); 170 ccw_device_sense_pgid_done(cdev, 0);
168 break; 171 break;
@@ -235,8 +238,9 @@ __ccw_device_do_pgid(struct ccw_device *cdev, __u8 func)
235 sch->lpm &= ~cdev->private->imask; 238 sch->lpm &= ~cdev->private->imask;
236 sch->vpm &= ~cdev->private->imask; 239 sch->vpm &= ~cdev->private->imask;
237 CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel " 240 CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel "
238 "%04x, lpm %02X, became 'not operational'\n", 241 "0.%x.%04x, lpm %02X, became 'not operational'\n",
239 cdev->private->devno, sch->irq, cdev->private->imask); 242 cdev->private->devno, sch->schid.ssid,
243 sch->schid.sch_no, cdev->private->imask);
240 return ret; 244 return ret;
241} 245}
242 246
@@ -258,8 +262,10 @@ __ccw_device_check_pgid(struct ccw_device *cdev)
258 if (irb->ecw[0] & SNS0_CMD_REJECT) 262 if (irb->ecw[0] & SNS0_CMD_REJECT)
259 return -EOPNOTSUPP; 263 return -EOPNOTSUPP;
260 /* Hmm, whatever happened, try again. */ 264 /* Hmm, whatever happened, try again. */
261 CIO_MSG_EVENT(2, "SPID - device %04x, unit check, cnt %02d, " 265 CIO_MSG_EVENT(2, "SPID - device 0.%x.%04x, unit check, "
266 "cnt %02d, "
262 "sns : %02X%02X%02X%02X %02X%02X%02X%02X ...\n", 267 "sns : %02X%02X%02X%02X %02X%02X%02X%02X ...\n",
268 cdev->private->ssid,
263 cdev->private->devno, irb->esw.esw0.erw.scnt, 269 cdev->private->devno, irb->esw.esw0.erw.scnt,
264 irb->ecw[0], irb->ecw[1], 270 irb->ecw[0], irb->ecw[1],
265 irb->ecw[2], irb->ecw[3], 271 irb->ecw[2], irb->ecw[3],
@@ -268,10 +274,10 @@ __ccw_device_check_pgid(struct ccw_device *cdev)
268 return -EAGAIN; 274 return -EAGAIN;
269 } 275 }
270 if (irb->scsw.cc == 3) { 276 if (irb->scsw.cc == 3) {
271 CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel " 277 CIO_MSG_EVENT(2, "SPID - Device %04x on Subchannel 0.%x.%04x,"
272 "%04x, lpm %02X, became 'not operational'\n", 278 " lpm %02X, became 'not operational'\n",
273 cdev->private->devno, sch->irq, 279 cdev->private->devno, sch->schid.ssid,
274 cdev->private->imask); 280 sch->schid.sch_no, cdev->private->imask);
275 return -EACCES; 281 return -EACCES;
276 } 282 }
277 return 0; 283 return 0;
@@ -364,8 +370,22 @@ ccw_device_verify_irq(struct ccw_device *cdev, enum dev_event dev_event)
364void 370void
365ccw_device_verify_start(struct ccw_device *cdev) 371ccw_device_verify_start(struct ccw_device *cdev)
366{ 372{
373 struct subchannel *sch = to_subchannel(cdev->dev.parent);
374
367 cdev->private->flags.pgid_single = 0; 375 cdev->private->flags.pgid_single = 0;
368 cdev->private->iretry = 5; 376 cdev->private->iretry = 5;
377 /*
378 * Update sch->lpm with current values to catch paths becoming
379 * available again.
380 */
381 if (stsch(sch->schid, &sch->schib)) {
382 ccw_device_verify_done(cdev, -ENODEV);
383 return;
384 }
385 sch->lpm = sch->schib.pmcw.pim &
386 sch->schib.pmcw.pam &
387 sch->schib.pmcw.pom &
388 sch->opm;
369 __ccw_device_verify_start(cdev); 389 __ccw_device_verify_start(cdev);
370} 390}
371 391
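The verify path now refreshes the logical path mask from a fresh schib before starting, so paths that came back since the last verification are included. The mask is the bitwise AND of installed (pim), available (pam), operational (pom) and the paths cio is allowed to use (opm); a tiny worked example with invented masks:

    #include <stdio.h>

    int main(void)
    {
            unsigned char pim = 0xf0;   /* installed paths */
            unsigned char pam = 0xd0;   /* available paths */
            unsigned char pom = 0xf0;   /* operational paths */
            unsigned char opm = 0xc0;   /* paths cio may use */
            unsigned char lpm = pim & pam & pom & opm;

            printf("lpm = %02x\n", lpm);        /* 0xc0: two usable paths */
            return 0;
    }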
diff --git a/drivers/s390/cio/device_status.c b/drivers/s390/cio/device_status.c
index 12a24d4331a2..db09c209098b 100644
--- a/drivers/s390/cio/device_status.c
+++ b/drivers/s390/cio/device_status.c
@@ -36,15 +36,16 @@ ccw_device_msg_control_check(struct ccw_device *cdev, struct irb *irb)
36 36
37 CIO_MSG_EVENT(0, "Channel-Check or Interface-Control-Check " 37 CIO_MSG_EVENT(0, "Channel-Check or Interface-Control-Check "
38 "received" 38 "received"
39 " ... device %04X on subchannel %04X, dev_stat " 39 " ... device %04x on subchannel 0.%x.%04x, dev_stat "
40 ": %02X sch_stat : %02X\n", 40 ": %02X sch_stat : %02X\n",
41 cdev->private->devno, cdev->private->irq, 41 cdev->private->devno, cdev->private->ssid,
42 cdev->private->sch_no,
42 irb->scsw.dstat, irb->scsw.cstat); 43 irb->scsw.dstat, irb->scsw.cstat);
43 44
44 if (irb->scsw.cc != 3) { 45 if (irb->scsw.cc != 3) {
45 char dbf_text[15]; 46 char dbf_text[15];
46 47
47 sprintf(dbf_text, "chk%x", cdev->private->irq); 48 sprintf(dbf_text, "chk%x", cdev->private->sch_no);
48 CIO_TRACE_EVENT(0, dbf_text); 49 CIO_TRACE_EVENT(0, dbf_text);
49 CIO_HEX_EVENT(0, irb, sizeof (struct irb)); 50 CIO_HEX_EVENT(0, irb, sizeof (struct irb));
50 } 51 }
@@ -59,10 +60,11 @@ ccw_device_path_notoper(struct ccw_device *cdev)
59 struct subchannel *sch; 60 struct subchannel *sch;
60 61
61 sch = to_subchannel(cdev->dev.parent); 62 sch = to_subchannel(cdev->dev.parent);
62 stsch (sch->irq, &sch->schib); 63 stsch (sch->schid, &sch->schib);
63 64
64 CIO_MSG_EVENT(0, "%s(%04x) - path(s) %02x are " 65 CIO_MSG_EVENT(0, "%s(0.%x.%04x) - path(s) %02x are "
65 "not operational \n", __FUNCTION__, sch->irq, 66 "not operational \n", __FUNCTION__,
67 sch->schid.ssid, sch->schid.sch_no,
66 sch->schib.pmcw.pnom); 68 sch->schib.pmcw.pnom);
67 69
68 sch->lpm &= ~sch->schib.pmcw.pnom; 70 sch->lpm &= ~sch->schib.pmcw.pnom;
diff --git a/drivers/s390/cio/ioasm.h b/drivers/s390/cio/ioasm.h
index 45480a2bc4c0..95a9462f9a91 100644
--- a/drivers/s390/cio/ioasm.h
+++ b/drivers/s390/cio/ioasm.h
@@ -1,12 +1,13 @@
1#ifndef S390_CIO_IOASM_H 1#ifndef S390_CIO_IOASM_H
2#define S390_CIO_IOASM_H 2#define S390_CIO_IOASM_H
3 3
4#include "schid.h"
5
4/* 6/*
5 * TPI info structure 7 * TPI info structure
6 */ 8 */
7struct tpi_info { 9struct tpi_info {
8 __u32 reserved1 : 16; /* reserved 0x00000001 */ 10 struct subchannel_id schid;
9 __u32 irq : 16; /* aka. subchannel number */
10 __u32 intparm; /* interruption parameter */ 11 __u32 intparm; /* interruption parameter */
11 __u32 adapter_IO : 1; 12 __u32 adapter_IO : 1;
12 __u32 reserved2 : 1; 13 __u32 reserved2 : 1;
@@ -21,7 +22,8 @@ struct tpi_info {
21 * Some S390 specific IO instructions as inline 22 * Some S390 specific IO instructions as inline
22 */ 23 */
23 24
24static inline int stsch(int irq, volatile struct schib *addr) 25static inline int stsch(struct subchannel_id schid,
26 volatile struct schib *addr)
25{ 27{
26 int ccode; 28 int ccode;
27 29
@@ -31,12 +33,42 @@ static inline int stsch(int irq, volatile struct schib *addr)
31 " ipm %0\n" 33 " ipm %0\n"
32 " srl %0,28" 34 " srl %0,28"
33 : "=d" (ccode) 35 : "=d" (ccode)
34 : "d" (irq | 0x10000), "a" (addr) 36 : "d" (schid), "a" (addr), "m" (*addr)
37 : "cc", "1" );
38 return ccode;
39}
40
41static inline int stsch_err(struct subchannel_id schid,
42 volatile struct schib *addr)
43{
44 int ccode;
45
46 __asm__ __volatile__(
47 " lhi %0,%3\n"
48 " lr 1,%1\n"
49 " stsch 0(%2)\n"
50 "0: ipm %0\n"
51 " srl %0,28\n"
52 "1:\n"
53#ifdef CONFIG_64BIT
54 ".section __ex_table,\"a\"\n"
55 " .align 8\n"
56 " .quad 0b,1b\n"
57 ".previous"
58#else
59 ".section __ex_table,\"a\"\n"
60 " .align 4\n"
61 " .long 0b,1b\n"
62 ".previous"
63#endif
64 : "=&d" (ccode)
65 : "d" (schid), "a" (addr), "K" (-EIO), "m" (*addr)
35 : "cc", "1" ); 66 : "cc", "1" );
36 return ccode; 67 return ccode;
37} 68}
38 69
39static inline int msch(int irq, volatile struct schib *addr) 70static inline int msch(struct subchannel_id schid,
71 volatile struct schib *addr)
40{ 72{
41 int ccode; 73 int ccode;
42 74
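stsch_err() is stsch() plus an exception-table entry, so probing a subchannel set the machine does not provide returns -EIO instead of taking a program check; the added "m" (*addr) operands also make the schib a visible memory output, so the compiler cannot cache it across the call. The standalone model below shows the probe-loop shape this enables, consistent with the for_each_subchannel() declaration in css.h (the real iteration lives in css.c and is not shown in this diff); stsch_err() is stubbed, and the pretend machine provides only subchannel set 0.

    #include <stdio.h>

    #define __MAX_SUBCHANNEL 65535
    #define __MAX_SSID 3
    #define EIO 5
    #define ENXIO 6

    struct subchannel_id { unsigned int ssid, sch_no; };
    struct schib { int dnv; };

    /* Stub: only set 0 exists; one device sits behind 0.0.0000. */
    static int stsch_err(struct subchannel_id schid, struct schib *schib)
    {
            if (schid.ssid != 0)
                    return -EIO;        /* real instruction would except */
            if (schid.sch_no > 2)
                    return 3;           /* cc 3: subchannel not provided */
            schib->dnv = (schid.sch_no == 0);
            return 0;
    }

    /* Callback with the for_each_subchannel() signature from css.h. */
    static int probe(struct subchannel_id schid, void *data)
    {
            struct schib schib;

            if (stsch_err(schid, &schib) != 0)
                    return -ENXIO;      /* end of this subchannel set */
            if (schib.dnv)
                    printf("device behind 0.%x.%04x\n",
                           schid.ssid, schid.sch_no);
            return 0;
    }

    int main(void)
    {
            struct subchannel_id schid;

            for (schid.ssid = 0; schid.ssid <= __MAX_SSID; schid.ssid++)
                    for (schid.sch_no = 0; ; schid.sch_no++)
                            if (probe(schid, NULL) ||
                                schid.sch_no == __MAX_SUBCHANNEL)
                                    break;
            return 0;
    }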
@@ -46,12 +78,13 @@ static inline int msch(int irq, volatile struct schib *addr)
46 " ipm %0\n" 78 " ipm %0\n"
47 " srl %0,28" 79 " srl %0,28"
48 : "=d" (ccode) 80 : "=d" (ccode)
49 : "d" (irq | 0x10000L), "a" (addr) 81 : "d" (schid), "a" (addr), "m" (*addr)
50 : "cc", "1" ); 82 : "cc", "1" );
51 return ccode; 83 return ccode;
52} 84}
53 85
54static inline int msch_err(int irq, volatile struct schib *addr) 86static inline int msch_err(struct subchannel_id schid,
87 volatile struct schib *addr)
55{ 88{
56 int ccode; 89 int ccode;
57 90
@@ -62,7 +95,7 @@ static inline int msch_err(int irq, volatile struct schib *addr)
62 "0: ipm %0\n" 95 "0: ipm %0\n"
63 " srl %0,28\n" 96 " srl %0,28\n"
64 "1:\n" 97 "1:\n"
65#ifdef CONFIG_ARCH_S390X 98#ifdef CONFIG_64BIT
66 ".section __ex_table,\"a\"\n" 99 ".section __ex_table,\"a\"\n"
67 " .align 8\n" 100 " .align 8\n"
68 " .quad 0b,1b\n" 101 " .quad 0b,1b\n"
@@ -74,12 +107,13 @@ static inline int msch_err(int irq, volatile struct schib *addr)
74 ".previous" 107 ".previous"
75#endif 108#endif
76 : "=&d" (ccode) 109 : "=&d" (ccode)
77 : "d" (irq | 0x10000L), "a" (addr), "K" (-EIO) 110 : "d" (schid), "a" (addr), "K" (-EIO), "m" (*addr)
78 : "cc", "1" ); 111 : "cc", "1" );
79 return ccode; 112 return ccode;
80} 113}
81 114
82static inline int tsch(int irq, volatile struct irb *addr) 115static inline int tsch(struct subchannel_id schid,
116 volatile struct irb *addr)
83{ 117{
84 int ccode; 118 int ccode;
85 119
@@ -89,7 +123,7 @@ static inline int tsch(int irq, volatile struct irb *addr)
89 " ipm %0\n" 123 " ipm %0\n"
90 " srl %0,28" 124 " srl %0,28"
91 : "=d" (ccode) 125 : "=d" (ccode)
92 : "d" (irq | 0x10000L), "a" (addr) 126 : "d" (schid), "a" (addr), "m" (*addr)
93 : "cc", "1" ); 127 : "cc", "1" );
94 return ccode; 128 return ccode;
95} 129}
@@ -103,12 +137,13 @@ static inline int tpi( volatile struct tpi_info *addr)
103 " ipm %0\n" 137 " ipm %0\n"
104 " srl %0,28" 138 " srl %0,28"
105 : "=d" (ccode) 139 : "=d" (ccode)
106 : "a" (addr) 140 : "a" (addr), "m" (*addr)
107 : "cc", "1" ); 141 : "cc", "1" );
108 return ccode; 142 return ccode;
109} 143}
110 144
111static inline int ssch(int irq, volatile struct orb *addr) 145static inline int ssch(struct subchannel_id schid,
146 volatile struct orb *addr)
112{ 147{
113 int ccode; 148 int ccode;
114 149
@@ -118,12 +153,12 @@ static inline int ssch(int irq, volatile struct orb *addr)
118 " ipm %0\n" 153 " ipm %0\n"
119 " srl %0,28" 154 " srl %0,28"
120 : "=d" (ccode) 155 : "=d" (ccode)
121 : "d" (irq | 0x10000L), "a" (addr) 156 : "d" (schid), "a" (addr), "m" (*addr)
122 : "cc", "1" ); 157 : "cc", "1" );
123 return ccode; 158 return ccode;
124} 159}
125 160
126static inline int rsch(int irq) 161static inline int rsch(struct subchannel_id schid)
127{ 162{
128 int ccode; 163 int ccode;
129 164
@@ -133,12 +168,12 @@ static inline int rsch(int irq)
133 " ipm %0\n" 168 " ipm %0\n"
134 " srl %0,28" 169 " srl %0,28"
135 : "=d" (ccode) 170 : "=d" (ccode)
136 : "d" (irq | 0x10000L) 171 : "d" (schid)
137 : "cc", "1" ); 172 : "cc", "1" );
138 return ccode; 173 return ccode;
139} 174}
140 175
141static inline int csch(int irq) 176static inline int csch(struct subchannel_id schid)
142{ 177{
143 int ccode; 178 int ccode;
144 179
@@ -148,12 +183,12 @@ static inline int csch(int irq)
148 " ipm %0\n" 183 " ipm %0\n"
149 " srl %0,28" 184 " srl %0,28"
150 : "=d" (ccode) 185 : "=d" (ccode)
151 : "d" (irq | 0x10000L) 186 : "d" (schid)
152 : "cc", "1" ); 187 : "cc", "1" );
153 return ccode; 188 return ccode;
154} 189}
155 190
156static inline int hsch(int irq) 191static inline int hsch(struct subchannel_id schid)
157{ 192{
158 int ccode; 193 int ccode;
159 194
@@ -163,12 +198,12 @@ static inline int hsch(int irq)
163 " ipm %0\n" 198 " ipm %0\n"
164 " srl %0,28" 199 " srl %0,28"
165 : "=d" (ccode) 200 : "=d" (ccode)
166 : "d" (irq | 0x10000L) 201 : "d" (schid)
167 : "cc", "1" ); 202 : "cc", "1" );
168 return ccode; 203 return ccode;
169} 204}
170 205
171static inline int xsch(int irq) 206static inline int xsch(struct subchannel_id schid)
172{ 207{
173 int ccode; 208 int ccode;
174 209
@@ -178,21 +213,22 @@ static inline int xsch(int irq)
178 " ipm %0\n" 213 " ipm %0\n"
179 " srl %0,28" 214 " srl %0,28"
180 : "=d" (ccode) 215 : "=d" (ccode)
181 : "d" (irq | 0x10000L) 216 : "d" (schid)
182 : "cc", "1" ); 217 : "cc", "1" );
183 return ccode; 218 return ccode;
184} 219}
185 220
186static inline int chsc(void *chsc_area) 221static inline int chsc(void *chsc_area)
187{ 222{
223 typedef struct { char _[4096]; } addr_type;
188 int cc; 224 int cc;
189 225
190 __asm__ __volatile__ ( 226 __asm__ __volatile__ (
191 ".insn rre,0xb25f0000,%1,0 \n\t" 227 ".insn rre,0xb25f0000,%2,0 \n\t"
192 "ipm %0 \n\t" 228 "ipm %0 \n\t"
193 "srl %0,28 \n\t" 229 "srl %0,28 \n\t"
194 : "=d" (cc) 230 : "=d" (cc), "=m" (*(addr_type *) chsc_area)
195 : "d" (chsc_area) 231 : "d" (chsc_area), "m" (*(addr_type *) chsc_area)
196 : "cc" ); 232 : "cc" );
197 233
198 return cc; 234 return cc;
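The chsc() change applies the same discipline to its 4 KiB command block: casting the pointer to a one-page dummy type lets the asm name the whole block as both input and output operand, rather than either hiding the dependency from the compiler or clobbering all of memory. The cast itself can be demonstrated on any host:

    #include <stdio.h>

    typedef struct { char _[4096]; } addr_type;

    int main(void)
    {
            static char chsc_area[4096];

            /* "=m" (*(addr_type *) chsc_area) in the kernel asm names
             * exactly this object: one 4 KiB block at chsc_area. */
            addr_type *block = (addr_type *) chsc_area;

            printf("block size: %zu\n", sizeof(*block));        /* 4096 */
            return 0;
    }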
diff --git a/drivers/s390/cio/qdio.c b/drivers/s390/cio/qdio.c
index eb39218b925e..30a836ffc31f 100644
--- a/drivers/s390/cio/qdio.c
+++ b/drivers/s390/cio/qdio.c
@@ -56,7 +56,7 @@
56#include "ioasm.h" 56#include "ioasm.h"
57#include "chsc.h" 57#include "chsc.h"
58 58
59#define VERSION_QDIO_C "$Revision: 1.108 $" 59#define VERSION_QDIO_C "$Revision: 1.114 $"
60 60
61/****************** MODULE PARAMETER VARIABLES ********************/ 61/****************** MODULE PARAMETER VARIABLES ********************/
62MODULE_AUTHOR("Utz Bacher <utz.bacher@de.ibm.com>"); 62MODULE_AUTHOR("Utz Bacher <utz.bacher@de.ibm.com>");
@@ -76,6 +76,7 @@ static struct qdio_perf_stats perf_stats;
76#endif /* QDIO_PERFORMANCE_STATS */ 76#endif /* QDIO_PERFORMANCE_STATS */
77 77
78static int hydra_thinints; 78static int hydra_thinints;
79static int is_passthrough = 0;
79static int omit_svs; 80static int omit_svs;
80 81
81static int indicator_used[INDICATORS_PER_CACHELINE]; 82static int indicator_used[INDICATORS_PER_CACHELINE];
@@ -136,12 +137,126 @@ qdio_release_q(struct qdio_q *q)
136 atomic_dec(&q->use_count); 137 atomic_dec(&q->use_count);
137} 138}
138 139
139static volatile inline void 140/*check ccq */
140qdio_set_slsb(volatile char *slsb, unsigned char value) 141static inline int
142qdio_check_ccq(struct qdio_q *q, unsigned int ccq)
143{
144 char dbf_text[15];
145
146 if (ccq == 0 || ccq == 32 || ccq == 96)
147 return 0;
148 if (ccq == 97)
149 return 1;
150 /*notify devices immediately*/
151 sprintf(dbf_text,"%d", ccq);
152 QDIO_DBF_TEXT2(1,trace,dbf_text);
153 return -EIO;
154}
155/* EQBS: extract buffer states */
156static inline int
157qdio_do_eqbs(struct qdio_q *q, unsigned char *state,
158 unsigned int *start, unsigned int *cnt)
159{
160 struct qdio_irq *irq;
161 unsigned int tmp_cnt, q_no, ccq;
162 int rc ;
163 char dbf_text[15];
164
165 ccq = 0;
166 tmp_cnt = *cnt;
167 irq = (struct qdio_irq*)q->irq_ptr;
168 q_no = q->q_no;
169 if(!q->is_input_q)
170 q_no += irq->no_input_qs;
171 ccq = do_eqbs(irq->sch_token, state, q_no, start, cnt);
172 rc = qdio_check_ccq(q, ccq);
173 if (rc < 0) {
174 QDIO_DBF_TEXT2(1,trace,"eqberr");
175 sprintf(dbf_text,"%2x,%2x,%d,%d",tmp_cnt, *cnt, ccq, q_no);
176 QDIO_DBF_TEXT2(1,trace,dbf_text);
177 q->handler(q->cdev,QDIO_STATUS_ACTIVATE_CHECK_CONDITION|
178 QDIO_STATUS_LOOK_FOR_ERROR,
179 0, 0, 0, -1, -1, q->int_parm);
180 return 0;
181 }
182 return (tmp_cnt - *cnt);
183}
184
185/* SQBS: set buffer states */
186static inline int
187qdio_do_sqbs(struct qdio_q *q, unsigned char state,
188 unsigned int *start, unsigned int *cnt)
141{ 189{
142 xchg((char*)slsb,value); 190 struct qdio_irq *irq;
191 unsigned int tmp_cnt, q_no, ccq;
192 int rc;
193 char dbf_text[15];
194
195 ccq = 0;
196 tmp_cnt = *cnt;
197 irq = (struct qdio_irq*)q->irq_ptr;
198 q_no = q->q_no;
199 if(!q->is_input_q)
200 q_no += irq->no_input_qs;
201 ccq = do_sqbs(irq->sch_token, state, q_no, start, cnt);
202 rc = qdio_check_ccq(q, ccq);
203 if (rc < 0) {
204 QDIO_DBF_TEXT3(1,trace,"sqberr");
205 sprintf(dbf_text,"%2x,%2x,%d,%d",tmp_cnt,*cnt,ccq,q_no);
206 QDIO_DBF_TEXT3(1,trace,dbf_text);
207 q->handler(q->cdev,QDIO_STATUS_ACTIVATE_CHECK_CONDITION|
208 QDIO_STATUS_LOOK_FOR_ERROR,
209 0, 0, 0, -1, -1, q->int_parm);
210 return 0;
211 }
212 return (tmp_cnt - *cnt);
143} 213}
144 214
215static inline int
216qdio_set_slsb(struct qdio_q *q, unsigned int *bufno,
217 unsigned char state, unsigned int *count)
218{
219 volatile char *slsb;
220 struct qdio_irq *irq;
221
222 irq = (struct qdio_irq*)q->irq_ptr;
223 if (!irq->is_qebsm) {
224 slsb = (char *)&q->slsb.acc.val[(*bufno)];
225 xchg(slsb, state);
226 return 1;
227 }
228 return qdio_do_sqbs(q, state, bufno, count);
229}
230
231#ifdef CONFIG_QDIO_DEBUG
232static inline void
233qdio_trace_slsb(struct qdio_q *q)
234{
235 if (q->queue_type==QDIO_TRACE_QTYPE) {
236 if (q->is_input_q)
237 QDIO_DBF_HEX2(0,slsb_in,&q->slsb,
238 QDIO_MAX_BUFFERS_PER_Q);
239 else
240 QDIO_DBF_HEX2(0,slsb_out,&q->slsb,
241 QDIO_MAX_BUFFERS_PER_Q);
242 }
243}
244#endif
245
246static inline int
247set_slsb(struct qdio_q *q, unsigned int *bufno,
248 unsigned char state, unsigned int *count)
249{
250 int rc;
251#ifdef CONFIG_QDIO_DEBUG
252 qdio_trace_slsb(q);
253#endif
254 rc = qdio_set_slsb(q, bufno, state, count);
255#ifdef CONFIG_QDIO_DEBUG
256 qdio_trace_slsb(q);
257#endif
258 return rc;
259}
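Under QEBSM the SLSB lives with the hypervisor/hardware and is updated in batches via SQBS, so set_slsb() gains a count and dispatches: a plain exchange of one state byte without the facility, qdio_do_sqbs() with it. A standalone model of that dispatch; the SQBS instruction is replaced by a loop and the ccq handling is omitted.

    #include <stdio.h>

    #define QDIO_MAX_BUFFERS_PER_Q 128

    struct qdio_q {
            int is_qebsm;
            unsigned char slsb[QDIO_MAX_BUFFERS_PER_Q];
    };

    /* Stand-in for do_sqbs(): set 'count' states starting at *bufno,
     * advancing modulo the ring size and consuming the count. */
    static void sqbs_stub(struct qdio_q *q, unsigned char state,
                          unsigned int *bufno, unsigned int *count)
    {
            while (*count) {
                    q->slsb[*bufno] = state;
                    *bufno = (*bufno + 1) & (QDIO_MAX_BUFFERS_PER_Q - 1);
                    (*count)--;
            }
    }

    /* Mirrors the qdio_set_slsb() dispatch above. */
    static int set_slsb(struct qdio_q *q, unsigned int *bufno,
                        unsigned char state, unsigned int *count)
    {
            unsigned int tmp = *count;

            if (!q->is_qebsm) {
                    q->slsb[*bufno] = state;    /* xchg() in the kernel */
                    return 1;
            }
            sqbs_stub(q, state, bufno, count);
            return tmp - *count;        /* buffers actually processed */
    }

    int main(void)
    {
            struct qdio_q q = { .is_qebsm = 1 };
            unsigned int bufno = 126, count = 4;        /* wraps the ring */
            int done = set_slsb(&q, &bufno, 0x42, &count);

            printf("done=%d next=%u slsb[0]=%02x\n", done, bufno, q.slsb[0]);
            return 0;
    }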
145static inline int 260static inline int
146qdio_siga_sync(struct qdio_q *q, unsigned int gpr2, 261qdio_siga_sync(struct qdio_q *q, unsigned int gpr2,
147 unsigned int gpr3) 262 unsigned int gpr3)
@@ -155,7 +270,7 @@ qdio_siga_sync(struct qdio_q *q, unsigned int gpr2,
155 perf_stats.siga_syncs++; 270 perf_stats.siga_syncs++;
156#endif /* QDIO_PERFORMANCE_STATS */ 271#endif /* QDIO_PERFORMANCE_STATS */
157 272
158 cc = do_siga_sync(q->irq, gpr2, gpr3); 273 cc = do_siga_sync(q->schid, gpr2, gpr3);
159 if (cc) 274 if (cc)
160 QDIO_DBF_HEX3(0,trace,&cc,sizeof(int*)); 275 QDIO_DBF_HEX3(0,trace,&cc,sizeof(int*));
161 276
@@ -170,6 +285,23 @@ qdio_siga_sync_q(struct qdio_q *q)
170 return qdio_siga_sync(q, q->mask, 0); 285 return qdio_siga_sync(q, q->mask, 0);
171} 286}
172 287
288static int
289__do_siga_output(struct qdio_q *q, unsigned int *busy_bit)
290{
291 struct qdio_irq *irq;
292 unsigned int fc = 0;
293 unsigned long schid;
294
295 irq = (struct qdio_irq *) q->irq_ptr;
296 if (!irq->is_qebsm)
297 schid = *((u32 *)&q->schid);
298 else {
299 schid = irq->sch_token;
300 fc |= 0x80;
301 }
302 return do_siga_output(schid, q->mask, busy_bit, fc);
303}
304
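With QEBSM, SIGA no longer addresses the queue by subchannel number: setting function-code bit 0x80 switches it to the subchannel token obtained at establishment, which is what __do_siga_output() selects above. A standalone model of the operand selection; the token value is invented, and the bitfield cast mirrors the kernel's *((u32 *)&q->schid).

    #include <stdio.h>

    struct subchannel_id { unsigned int ssid : 2, sch_no : 16; };

    struct qdio_irq {
            int is_qebsm;
            unsigned long sch_token;
    };

    /* Mirrors __do_siga_output(): 32-bit schid word normally,
     * subchannel token plus fc bit 0x80 under QEBSM. */
    static void siga_operands(const struct qdio_irq *irq,
                              const struct subchannel_id *schid_p,
                              unsigned long *schid, unsigned int *fc)
    {
            *fc = 0;
            if (!irq->is_qebsm) {
                    *schid = *(const unsigned int *) schid_p;
            } else {
                    *schid = irq->sch_token;
                    *fc |= 0x80;
            }
    }

    int main(void)
    {
            struct subchannel_id id = { 1, 0x0042 };
            struct qdio_irq irq = { 1, 0xdeadbeefUL };
            unsigned long schid;
            unsigned int fc;

            siga_operands(&irq, &id, &schid, &fc);
            printf("schid=%#lx fc=%#x\n", schid, fc);   /* token, 0x80 */
            return 0;
    }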
173/* 305/*
174 * returns QDIO_SIGA_ERROR_ACCESS_EXCEPTION as cc, when SIGA returns 306 * returns QDIO_SIGA_ERROR_ACCESS_EXCEPTION as cc, when SIGA returns
175 * an access exception 307 * an access exception
@@ -189,7 +321,7 @@ qdio_siga_output(struct qdio_q *q)
189 QDIO_DBF_HEX4(0,trace,&q,sizeof(void*)); 321 QDIO_DBF_HEX4(0,trace,&q,sizeof(void*));
190 322
191 for (;;) { 323 for (;;) {
192 cc = do_siga_output(q->irq, q->mask, &busy_bit); 324 cc = __do_siga_output(q, &busy_bit);
193//QDIO_PRINT_ERR("cc=%x, busy=%x\n",cc,busy_bit); 325//QDIO_PRINT_ERR("cc=%x, busy=%x\n",cc,busy_bit);
194 if ((cc==2) && (busy_bit) && (q->is_iqdio_q)) { 326 if ((cc==2) && (busy_bit) && (q->is_iqdio_q)) {
195 if (!start_time) 327 if (!start_time)
@@ -221,7 +353,7 @@ qdio_siga_input(struct qdio_q *q)
221 perf_stats.siga_ins++; 353 perf_stats.siga_ins++;
222#endif /* QDIO_PERFORMANCE_STATS */ 354#endif /* QDIO_PERFORMANCE_STATS */
223 355
224 cc = do_siga_input(q->irq, q->mask); 356 cc = do_siga_input(q->schid, q->mask);
225 357
226 if (cc) 358 if (cc)
227 QDIO_DBF_HEX3(0,trace,&cc,sizeof(int*)); 359 QDIO_DBF_HEX3(0,trace,&cc,sizeof(int*));
@@ -230,7 +362,7 @@ qdio_siga_input(struct qdio_q *q)
230} 362}
231 363
232/* locked by the locks in qdio_activate and qdio_cleanup */ 364/* locked by the locks in qdio_activate and qdio_cleanup */
233static __u32 volatile * 365static __u32 *
234qdio_get_indicator(void) 366qdio_get_indicator(void)
235{ 367{
236 int i; 368 int i;
@@ -258,7 +390,7 @@ qdio_put_indicator(__u32 *addr)
258 atomic_dec(&spare_indicator_usecount); 390 atomic_dec(&spare_indicator_usecount);
259} 391}
260 392
261static inline volatile void 393static inline void
262tiqdio_clear_summary_bit(__u32 *location) 394tiqdio_clear_summary_bit(__u32 *location)
263{ 395{
264 QDIO_DBF_TEXT5(0,trace,"clrsummb"); 396 QDIO_DBF_TEXT5(0,trace,"clrsummb");
@@ -267,7 +399,7 @@ tiqdio_clear_summary_bit(__u32 *location)
267 xchg(location,0); 399 xchg(location,0);
268} 400}
269 401
270static inline volatile void 402static inline void
271tiqdio_set_summary_bit(__u32 *location) 403tiqdio_set_summary_bit(__u32 *location)
272{ 404{
273 QDIO_DBF_TEXT5(0,trace,"setsummb"); 405 QDIO_DBF_TEXT5(0,trace,"setsummb");
@@ -336,7 +468,9 @@ static inline int
336qdio_stop_polling(struct qdio_q *q) 468qdio_stop_polling(struct qdio_q *q)
337{ 469{
338#ifdef QDIO_USE_PROCESSING_STATE 470#ifdef QDIO_USE_PROCESSING_STATE
339 int gsf; 471 unsigned int tmp, gsf, count = 1;
472 unsigned char state = 0;
473 struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
340 474
341 if (!atomic_swap(&q->polling,0)) 475 if (!atomic_swap(&q->polling,0))
342 return 1; 476 return 1;
@@ -348,17 +482,22 @@ qdio_stop_polling(struct qdio_q *q)
348 if (!q->is_input_q) 482 if (!q->is_input_q)
349 return 1; 483 return 1;
350 484
351 gsf=GET_SAVED_FRONTIER(q); 485 tmp = gsf = GET_SAVED_FRONTIER(q);
352 set_slsb(&q->slsb.acc.val[(gsf+QDIO_MAX_BUFFERS_PER_Q-1)& 486 tmp = ((tmp + QDIO_MAX_BUFFERS_PER_Q-1) & (QDIO_MAX_BUFFERS_PER_Q-1) );
353 (QDIO_MAX_BUFFERS_PER_Q-1)], 487 set_slsb(q, &tmp, SLSB_P_INPUT_NOT_INIT, &count);
354 SLSB_P_INPUT_NOT_INIT); 488
355 /* 489 /*
356 * we don't issue this SYNC_MEMORY, as we trust Rick T and 490 * we don't issue this SYNC_MEMORY, as we trust Rick T and
357 * moreover will not use the PROCESSING state under VM, so 491 * moreover will not use the PROCESSING state under VM, so
358 * q->polling was 0 anyway 492 * q->polling was 0 anyway
359 */ 493 */
360 /*SYNC_MEMORY;*/ 494 /*SYNC_MEMORY;*/
361 if (q->slsb.acc.val[gsf]!=SLSB_P_INPUT_PRIMED) 495 if (irq->is_qebsm) {
496 count = 1;
497 qdio_do_eqbs(q, &state, &gsf, &count);
498 } else
499 state = q->slsb.acc.val[gsf];
500 if (state != SLSB_P_INPUT_PRIMED)
362 return 1; 501 return 1;
363 /* 502 /*
364 * set our summary bit again, as otherwise there is a 503 * set our summary bit again, as otherwise there is a
@@ -431,18 +570,136 @@ tiqdio_clear_global_summary(void)
431 570
432 571
433/************************* OUTBOUND ROUTINES *******************************/ 572/************************* OUTBOUND ROUTINES *******************************/
573static int
574qdio_qebsm_get_outbound_buffer_frontier(struct qdio_q *q)
575{
576 struct qdio_irq *irq;
577 unsigned char state;
578 unsigned int cnt, count, ftc;
579
580 irq = (struct qdio_irq *) q->irq_ptr;
581 if ((!q->is_iqdio_q) && (!q->hydra_gives_outbound_pcis))
582 SYNC_MEMORY;
583
584 ftc = q->first_to_check;
585 count = qdio_min(atomic_read(&q->number_of_buffers_used),
586 (QDIO_MAX_BUFFERS_PER_Q-1));
587 if (count == 0)
588 return q->first_to_check;
589 cnt = qdio_do_eqbs(q, &state, &ftc, &count);
590 if (cnt == 0)
591 return q->first_to_check;
592 switch (state) {
593 case SLSB_P_OUTPUT_ERROR:
594 QDIO_DBF_TEXT3(0,trace,"outperr");
595 atomic_sub(cnt , &q->number_of_buffers_used);
596 if (q->qdio_error)
597 q->error_status_flags |=
598 QDIO_STATUS_MORE_THAN_ONE_QDIO_ERROR;
599 q->qdio_error = SLSB_P_OUTPUT_ERROR;
600 q->error_status_flags |= QDIO_STATUS_LOOK_FOR_ERROR;
601 q->first_to_check = ftc;
602 break;
603 case SLSB_P_OUTPUT_EMPTY:
604 QDIO_DBF_TEXT5(0,trace,"outpempt");
605 atomic_sub(cnt, &q->number_of_buffers_used);
606 q->first_to_check = ftc;
607 break;
608 case SLSB_CU_OUTPUT_PRIMED:
609 /* all buffers primed */
610 QDIO_DBF_TEXT5(0,trace,"outpprim");
611 break;
612 default:
613 break;
614 }
615 QDIO_DBF_HEX4(0,trace,&q->first_to_check,sizeof(int));
616 return q->first_to_check;
617}
618
619static int
620qdio_qebsm_get_inbound_buffer_frontier(struct qdio_q *q)
621{
622 struct qdio_irq *irq;
623 unsigned char state;
624 int tmp, ftc, count, cnt;
625 char dbf_text[15];
626
627
628 irq = (struct qdio_irq *) q->irq_ptr;
629 ftc = q->first_to_check;
630 count = qdio_min(atomic_read(&q->number_of_buffers_used),
631 (QDIO_MAX_BUFFERS_PER_Q-1));
632 if (count == 0)
633 return q->first_to_check;
634 cnt = qdio_do_eqbs(q, &state, &ftc, &count);
635 if (cnt == 0)
636 return q->first_to_check;
637 switch (state) {
638 case SLSB_P_INPUT_ERROR :
639#ifdef CONFIG_QDIO_DEBUG
640 QDIO_DBF_TEXT3(1,trace,"inperr");
641 sprintf(dbf_text,"%2x,%2x",ftc,count);
642 QDIO_DBF_TEXT3(1,trace,dbf_text);
643#endif /* CONFIG_QDIO_DEBUG */
644 if (q->qdio_error)
645 q->error_status_flags |=
646 QDIO_STATUS_MORE_THAN_ONE_QDIO_ERROR;
647 q->qdio_error = SLSB_P_INPUT_ERROR;
648 q->error_status_flags |= QDIO_STATUS_LOOK_FOR_ERROR;
649 atomic_sub(cnt, &q->number_of_buffers_used);
650 q->first_to_check = ftc;
651 break;
652 case SLSB_P_INPUT_PRIMED :
653 QDIO_DBF_TEXT3(0,trace,"inptprim");
654 sprintf(dbf_text,"%2x,%2x",ftc,count);
655 QDIO_DBF_TEXT3(1,trace,dbf_text);
656 tmp = 0;
657 ftc = q->first_to_check;
658#ifdef QDIO_USE_PROCESSING_STATE
659 if (cnt > 1) {
660 cnt -= 1;
661 tmp = set_slsb(q, &ftc, SLSB_P_INPUT_NOT_INIT, &cnt);
662 if (!tmp)
663 break;
664 }
665 cnt = 1;
666 tmp += set_slsb(q, &ftc,
667 SLSB_P_INPUT_PROCESSING, &cnt);
668 atomic_set(&q->polling, 1);
669#else
670 tmp = set_slsb(q, &ftc, SLSB_P_INPUT_NOT_INIT, &cnt);
671#endif
672 atomic_sub(tmp, &q->number_of_buffers_used);
673 q->first_to_check = ftc;
674 break;
675 case SLSB_CU_INPUT_EMPTY:
676 case SLSB_P_INPUT_NOT_INIT:
677 case SLSB_P_INPUT_PROCESSING:
678 QDIO_DBF_TEXT5(0,trace,"inpnipro");
679 break;
680 default:
681 break;
682 }
683 QDIO_DBF_HEX4(0,trace,&q->first_to_check,sizeof(int));
684 return q->first_to_check;
685}
434 686
435static inline int 687static inline int
436qdio_get_outbound_buffer_frontier(struct qdio_q *q) 688qdio_get_outbound_buffer_frontier(struct qdio_q *q)
437{ 689{
438 int f,f_mod_no; 690 struct qdio_irq *irq;
439 volatile char *slsb; 691 volatile char *slsb;
440 int first_not_to_check; 692 unsigned int count = 1;
693 int first_not_to_check, f, f_mod_no;
441 char dbf_text[15]; 694 char dbf_text[15];
442 695
443 QDIO_DBF_TEXT4(0,trace,"getobfro"); 696 QDIO_DBF_TEXT4(0,trace,"getobfro");
444 QDIO_DBF_HEX4(0,trace,&q,sizeof(void*)); 697 QDIO_DBF_HEX4(0,trace,&q,sizeof(void*));
445 698
699 irq = (struct qdio_irq *) q->irq_ptr;
700 if (irq->is_qebsm)
701 return qdio_qebsm_get_outbound_buffer_frontier(q);
702
446 slsb=&q->slsb.acc.val[0]; 703 slsb=&q->slsb.acc.val[0];
447 f_mod_no=f=q->first_to_check; 704 f_mod_no=f=q->first_to_check;
448 /* 705 /*
@@ -484,7 +741,7 @@ check_next:
484 QDIO_DBF_HEX2(1,sbal,q->sbal[f_mod_no],256); 741 QDIO_DBF_HEX2(1,sbal,q->sbal[f_mod_no],256);
485 742
486 /* kind of process the buffer */ 743 /* kind of process the buffer */
487 set_slsb(&q->slsb.acc.val[f_mod_no], SLSB_P_OUTPUT_NOT_INIT); 744 set_slsb(q, &f_mod_no, SLSB_P_OUTPUT_NOT_INIT, &count);
488 745
489 /* 746 /*
490 * we increment the frontier, as this buffer 747 * we increment the frontier, as this buffer
@@ -597,48 +854,48 @@ qdio_kick_outbound_q(struct qdio_q *q)
597 854
598 result=qdio_siga_output(q); 855 result=qdio_siga_output(q);
599 856
600 switch (result) { 857 switch (result) {
601 case 0: 858 case 0:
602 /* went smooth this time, reset timestamp */ 859 /* went smooth this time, reset timestamp */
603#ifdef CONFIG_QDIO_DEBUG 860#ifdef CONFIG_QDIO_DEBUG
604 QDIO_DBF_TEXT3(0,trace,"cc2reslv"); 861 QDIO_DBF_TEXT3(0,trace,"cc2reslv");
605 sprintf(dbf_text,"%4x%2x%2x",q->irq,q->q_no, 862 sprintf(dbf_text,"%4x%2x%2x",q->schid.sch_no,q->q_no,
606 atomic_read(&q->busy_siga_counter)); 863 atomic_read(&q->busy_siga_counter));
607 QDIO_DBF_TEXT3(0,trace,dbf_text); 864 QDIO_DBF_TEXT3(0,trace,dbf_text);
608#endif /* CONFIG_QDIO_DEBUG */ 865#endif /* CONFIG_QDIO_DEBUG */
609 q->timing.busy_start=0; 866 q->timing.busy_start=0;
867 break;
868 case (2|QDIO_SIGA_ERROR_B_BIT_SET):
869 /* cc=2 and busy bit: */
870 atomic_inc(&q->busy_siga_counter);
871
872 /* if the last siga was successful, save
873 * timestamp here */
874 if (!q->timing.busy_start)
875 q->timing.busy_start=NOW;
876
877 /* if we're in time, don't touch error_status_flags
878 * and siga_error */
879 if (NOW-q->timing.busy_start<QDIO_BUSY_BIT_GIVE_UP) {
880 qdio_mark_q(q);
610 break; 881 break;
611 case (2|QDIO_SIGA_ERROR_B_BIT_SET): 882 }
612 /* cc=2 and busy bit: */ 883 QDIO_DBF_TEXT2(0,trace,"cc2REPRT");
613 atomic_inc(&q->busy_siga_counter);
614
615 /* if the last siga was successful, save
616 * timestamp here */
617 if (!q->timing.busy_start)
618 q->timing.busy_start=NOW;
619
620 /* if we're in time, don't touch error_status_flags
621 * and siga_error */
622 if (NOW-q->timing.busy_start<QDIO_BUSY_BIT_GIVE_UP) {
623 qdio_mark_q(q);
624 break;
625 }
626 QDIO_DBF_TEXT2(0,trace,"cc2REPRT");
627#ifdef CONFIG_QDIO_DEBUG 884#ifdef CONFIG_QDIO_DEBUG
628 sprintf(dbf_text,"%4x%2x%2x",q->irq,q->q_no, 885 sprintf(dbf_text,"%4x%2x%2x",q->schid.sch_no,q->q_no,
629 atomic_read(&q->busy_siga_counter)); 886 atomic_read(&q->busy_siga_counter));
630 QDIO_DBF_TEXT3(0,trace,dbf_text); 887 QDIO_DBF_TEXT3(0,trace,dbf_text);
631#endif /* CONFIG_QDIO_DEBUG */ 888#endif /* CONFIG_QDIO_DEBUG */
632 /* else fallthrough and report error */ 889 /* else fallthrough and report error */
633 default: 890 default:
634 /* for plain cc=1, 2 or 3: */ 891 /* for plain cc=1, 2 or 3: */
635 if (q->siga_error) 892 if (q->siga_error)
636 q->error_status_flags|=
637 QDIO_STATUS_MORE_THAN_ONE_SIGA_ERROR;
638 q->error_status_flags|= 893 q->error_status_flags|=
639 QDIO_STATUS_LOOK_FOR_ERROR; 894 QDIO_STATUS_MORE_THAN_ONE_SIGA_ERROR;
640 q->siga_error=result; 895 q->error_status_flags|=
641 } 896 QDIO_STATUS_LOOK_FOR_ERROR;
897 q->siga_error=result;
898 }
642} 899}
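
The busy-bit arm above implements a bounded retry rather than an immediate failure: the first busy SIGA stamps timing.busy_start, later ones merely re-mark the queue, and only after QDIO_BUSY_BIT_GIVE_UP has elapsed is the error recorded. Its skeleton, with names as in the patch:

        /* Sketch: give-up window for a busy SIGA-write (cc=2 + busy bit),
         * shaped like the switch arm above. */
        atomic_inc(&q->busy_siga_counter);
        if (!q->timing.busy_start)
                q->timing.busy_start = NOW;        /* first busy hit */
        if (NOW - q->timing.busy_start < QDIO_BUSY_BIT_GIVE_UP) {
                qdio_mark_q(q);                    /* just retry later */
                return;
        }
        /* window expired: record the error, as the default arm does */
        q->error_status_flags |= QDIO_STATUS_LOOK_FOR_ERROR;
        q->siga_error = result;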
643 900
644static inline void 901static inline void
@@ -743,8 +1000,10 @@ qdio_outbound_processing(struct qdio_q *q)
743static inline int 1000static inline int
744qdio_get_inbound_buffer_frontier(struct qdio_q *q) 1001qdio_get_inbound_buffer_frontier(struct qdio_q *q)
745{ 1002{
1003 struct qdio_irq *irq;
746 int f,f_mod_no; 1004 int f,f_mod_no;
747 volatile char *slsb; 1005 volatile char *slsb;
1006 unsigned int count = 1;
748 int first_not_to_check; 1007 int first_not_to_check;
749#ifdef CONFIG_QDIO_DEBUG 1008#ifdef CONFIG_QDIO_DEBUG
750 char dbf_text[15]; 1009 char dbf_text[15];
@@ -756,6 +1015,10 @@ qdio_get_inbound_buffer_frontier(struct qdio_q *q)
756 QDIO_DBF_TEXT4(0,trace,"getibfro"); 1015 QDIO_DBF_TEXT4(0,trace,"getibfro");
757 QDIO_DBF_HEX4(0,trace,&q,sizeof(void*)); 1016 QDIO_DBF_HEX4(0,trace,&q,sizeof(void*));
758 1017
1018 irq = (struct qdio_irq *) q->irq_ptr;
1019 if (irq->is_qebsm)
1020 return qdio_qebsm_get_inbound_buffer_frontier(q);
1021
759 slsb=&q->slsb.acc.val[0]; 1022 slsb=&q->slsb.acc.val[0];
760 f_mod_no=f=q->first_to_check; 1023 f_mod_no=f=q->first_to_check;
761 /* 1024 /*
@@ -792,19 +1055,19 @@ check_next:
792 * kill VM in terms of CP overhead 1055 * kill VM in terms of CP overhead
793 */ 1056 */
794 if (q->siga_sync) { 1057 if (q->siga_sync) {
795 set_slsb(&slsb[f_mod_no],SLSB_P_INPUT_NOT_INIT); 1058 set_slsb(q, &f_mod_no, SLSB_P_INPUT_NOT_INIT, &count);
796 } else { 1059 } else {
797 /* set the previous buffer to NOT_INIT. The current 1060 /* set the previous buffer to NOT_INIT. The current
798 * buffer will be set to PROCESSING at the end of 1061 * buffer will be set to PROCESSING at the end of
799 * this function to avoid further interrupts. */ 1062 * this function to avoid further interrupts. */
800 if (last_position>=0) 1063 if (last_position>=0)
801 set_slsb(&slsb[last_position], 1064 set_slsb(q, &last_position,
802 SLSB_P_INPUT_NOT_INIT); 1065 SLSB_P_INPUT_NOT_INIT, &count);
803 atomic_set(&q->polling,1); 1066 atomic_set(&q->polling,1);
804 last_position=f_mod_no; 1067 last_position=f_mod_no;
805 } 1068 }
806#else /* QDIO_USE_PROCESSING_STATE */ 1069#else /* QDIO_USE_PROCESSING_STATE */
807 set_slsb(&slsb[f_mod_no],SLSB_P_INPUT_NOT_INIT); 1070 set_slsb(q, &f_mod_no, SLSB_P_INPUT_NOT_INIT, &count);
808#endif /* QDIO_USE_PROCESSING_STATE */ 1071#endif /* QDIO_USE_PROCESSING_STATE */
809 /* 1072 /*
810 * not needed, as the inbound queue will be synced on the next 1073 * not needed, as the inbound queue will be synced on the next
@@ -829,7 +1092,7 @@ check_next:
829 QDIO_DBF_HEX2(1,sbal,q->sbal[f_mod_no],256); 1092 QDIO_DBF_HEX2(1,sbal,q->sbal[f_mod_no],256);
830 1093
831 /* kind of process the buffer */ 1094 /* kind of process the buffer */
832 set_slsb(&slsb[f_mod_no],SLSB_P_INPUT_NOT_INIT); 1095 set_slsb(q, &f_mod_no, SLSB_P_INPUT_NOT_INIT, &count);
833 1096
834 if (q->qdio_error) 1097 if (q->qdio_error)
835 q->error_status_flags|= 1098 q->error_status_flags|=
@@ -857,7 +1120,7 @@ out:
857 1120
858#ifdef QDIO_USE_PROCESSING_STATE 1121#ifdef QDIO_USE_PROCESSING_STATE
859 if (last_position>=0) 1122 if (last_position>=0)
860 set_slsb(&slsb[last_position],SLSB_P_INPUT_PROCESSING); 1123 set_slsb(q, &last_position, SLSB_P_INPUT_NOT_INIT, &count);
861#endif /* QDIO_USE_PROCESSING_STATE */ 1124#endif /* QDIO_USE_PROCESSING_STATE */
862 1125
863 QDIO_DBF_HEX4(0,trace,&q->first_to_check,sizeof(int)); 1126 QDIO_DBF_HEX4(0,trace,&q->first_to_check,sizeof(int));
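
Every set_slsb call in this patch moves from the old per-entry form set_slsb(&slsb[i], state) to set_slsb(q, &index, state, &count): passing the queue lets the helper route through SQBS when QEBSM is active, and the in/out index and count let one call cover a whole run of buffers. The patch shows only call sites, so here is a hypothetical body for the non-QEBSM side that pins down the inferred contract:

        /* Hypothetical sketch of the reworked helper's contract: set
         * '*count' consecutive SLSB entries starting at '*start' to
         * 'state', advance *start past them, return the number set. */
        static inline int
        set_slsb_sketch(struct qdio_q *q, unsigned int *start,
                        unsigned char state, unsigned int *count)
        {
                unsigned int i, n = *count;

                for (i = 0; i < n; i++) {
                        q->slsb.acc.val[*start] = state;
                        *start = (*start + 1) & (QDIO_MAX_BUFFERS_PER_Q - 1);
                }
                return n;   /* the SQBS path may set fewer per call */
        }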
@@ -902,6 +1165,10 @@ static inline int
902tiqdio_is_inbound_q_done(struct qdio_q *q) 1165tiqdio_is_inbound_q_done(struct qdio_q *q)
903{ 1166{
904 int no_used; 1167 int no_used;
1168 unsigned int start_buf, count;
1169 unsigned char state = 0;
1170 struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
1171
905#ifdef CONFIG_QDIO_DEBUG 1172#ifdef CONFIG_QDIO_DEBUG
906 char dbf_text[15]; 1173 char dbf_text[15];
907#endif 1174#endif
@@ -927,8 +1194,13 @@ tiqdio_is_inbound_q_done(struct qdio_q *q)
927 if (!q->siga_sync) 1194 if (!q->siga_sync)
928 /* we'll check for more primed buffers in qeth_stop_polling */ 1195 /* we'll check for more primed buffers in qeth_stop_polling */
929 return 0; 1196 return 0;
930 1197 if (irq->is_qebsm) {
931 if (q->slsb.acc.val[q->first_to_check]!=SLSB_P_INPUT_PRIMED) 1198 count = 1;
1199 start_buf = q->first_to_check;
1200 qdio_do_eqbs(q, &state, &start_buf, &count);
1201 } else
1202 state = q->slsb.acc.val[q->first_to_check];
1203 if (state != SLSB_P_INPUT_PRIMED)
932 /* 1204 /*
933 * nothing more to do, if next buffer is not PRIMED. 1205 * nothing more to do, if next buffer is not PRIMED.
934 * note that we did a SYNC_MEMORY before, that there 1206 * note that we did a SYNC_MEMORY before, that there
@@ -955,6 +1227,10 @@ static inline int
955qdio_is_inbound_q_done(struct qdio_q *q) 1227qdio_is_inbound_q_done(struct qdio_q *q)
956{ 1228{
957 int no_used; 1229 int no_used;
1230 unsigned int start_buf, count;
1231 unsigned char state = 0;
1232 struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
1233
958#ifdef CONFIG_QDIO_DEBUG 1234#ifdef CONFIG_QDIO_DEBUG
959 char dbf_text[15]; 1235 char dbf_text[15];
960#endif 1236#endif
@@ -973,8 +1249,13 @@ qdio_is_inbound_q_done(struct qdio_q *q)
973 QDIO_DBF_TEXT4(0,trace,dbf_text); 1249 QDIO_DBF_TEXT4(0,trace,dbf_text);
974 return 1; 1250 return 1;
975 } 1251 }
976 1252 if (irq->is_qebsm) {
977 if (q->slsb.acc.val[q->first_to_check]==SLSB_P_INPUT_PRIMED) { 1253 count = 1;
1254 start_buf = q->first_to_check;
1255 qdio_do_eqbs(q, &state, &start_buf, &count);
1256 } else
1257 state = q->slsb.acc.val[q->first_to_check];
1258 if (state == SLSB_P_INPUT_PRIMED) {
978 /* we got something to do */ 1259 /* we got something to do */
979 QDIO_DBF_TEXT4(0,trace,"inqisntA"); 1260 QDIO_DBF_TEXT4(0,trace,"inqisntA");
980 QDIO_DBF_HEX4(0,trace,&q,sizeof(void*)); 1261 QDIO_DBF_HEX4(0,trace,&q,sizeof(void*));
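
Both *_is_inbound_q_done variants now read one buffer's state through the same fork: EQBS when the subchannel runs in QEBSM mode, a plain SLSB load otherwise. The repeated pattern, factored into a hypothetical helper:

        /* Sketch: read the state of the buffer at first_to_check, via
         * EQBS under QEBSM, via the in-memory SLSB otherwise. */
        static unsigned char peek_first_state(struct qdio_q *q)
        {
                struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
                unsigned int start = q->first_to_check, count = 1;
                unsigned char state = 0;

                if (irq->is_qebsm)
                        qdio_do_eqbs(q, &state, &start, &count);
                else
                        state = q->slsb.acc.val[q->first_to_check];
                return state;
        }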
@@ -1456,7 +1737,7 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
1456 void *ptr; 1737 void *ptr;
1457 int available; 1738 int available;
1458 1739
1459 sprintf(dbf_text,"qfqs%4x",cdev->private->irq); 1740 sprintf(dbf_text,"qfqs%4x",cdev->private->sch_no);
1460 QDIO_DBF_TEXT0(0,setup,dbf_text); 1741 QDIO_DBF_TEXT0(0,setup,dbf_text);
1461 for (i=0;i<no_input_qs;i++) { 1742 for (i=0;i<no_input_qs;i++) {
1462 q=irq_ptr->input_qs[i]; 1743 q=irq_ptr->input_qs[i];
@@ -1476,7 +1757,7 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
1476 1757
1477 q->queue_type=q_format; 1758 q->queue_type=q_format;
1478 q->int_parm=int_parm; 1759 q->int_parm=int_parm;
1479 q->irq=irq_ptr->irq; 1760 q->schid = irq_ptr->schid;
1480 q->irq_ptr = irq_ptr; 1761 q->irq_ptr = irq_ptr;
1481 q->cdev = cdev; 1762 q->cdev = cdev;
1482 q->mask=1<<(31-i); 1763 q->mask=1<<(31-i);
@@ -1523,11 +1804,11 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
1523 QDIO_DBF_HEX2(0,setup,&ptr,sizeof(void*)); 1804 QDIO_DBF_HEX2(0,setup,&ptr,sizeof(void*));
1524 1805
1525 /* fill in slsb */ 1806 /* fill in slsb */
1526 for (j=0;j<QDIO_MAX_BUFFERS_PER_Q;j++) { 1807 if (!irq_ptr->is_qebsm) {
1527 set_slsb(&q->slsb.acc.val[j], 1808 unsigned int count = 1;
1528 SLSB_P_INPUT_NOT_INIT); 1809 for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++)
1529/* q->sbal[j]->element[1].sbalf.i1.key=QDIO_STORAGE_KEY;*/ 1810 set_slsb(q, &j, SLSB_P_INPUT_NOT_INIT, &count);
1530 } 1811 }
1531 } 1812 }
1532 1813
1533 for (i=0;i<no_output_qs;i++) { 1814 for (i=0;i<no_output_qs;i++) {
@@ -1549,7 +1830,7 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
1549 q->queue_type=q_format; 1830 q->queue_type=q_format;
1550 q->int_parm=int_parm; 1831 q->int_parm=int_parm;
1551 q->is_input_q=0; 1832 q->is_input_q=0;
1552 q->irq=irq_ptr->irq; 1833 q->schid = irq_ptr->schid;
1553 q->cdev = cdev; 1834 q->cdev = cdev;
1554 q->irq_ptr = irq_ptr; 1835 q->irq_ptr = irq_ptr;
1555 q->mask=1<<(31-i); 1836 q->mask=1<<(31-i);
@@ -1584,11 +1865,11 @@ qdio_fill_qs(struct qdio_irq *irq_ptr, struct ccw_device *cdev,
1584 QDIO_DBF_HEX2(0,setup,&ptr,sizeof(void*)); 1865 QDIO_DBF_HEX2(0,setup,&ptr,sizeof(void*));
1585 1866
1586 /* fill in slsb */ 1867 /* fill in slsb */
1587 for (j=0;j<QDIO_MAX_BUFFERS_PER_Q;j++) { 1868 if (!irq_ptr->is_qebsm) {
1588 set_slsb(&q->slsb.acc.val[j], 1869 unsigned int count = 1;
1589 SLSB_P_OUTPUT_NOT_INIT); 1870 for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++)
1590/* q->sbal[j]->element[1].sbalf.i1.key=QDIO_STORAGE_KEY;*/ 1871 set_slsb(q, &j, SLSB_P_OUTPUT_NOT_INIT, &count);
1591 } 1872 }
1592 } 1873 }
1593} 1874}
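
The SLSB prefill loops are now guarded by !irq_ptr->is_qebsm: under QEBSM the buffer states are tracked via the subchannel token rather than the in-memory SLSB, and qdio_check_subchannel_qebsm (later in this patch) seeds them through SQBS instead, one bulk call per queue:

        /* Sketch: seed all buffer states of one queue in a single
         * set_slsb call, as qdio_check_subchannel_qebsm does;
         * q is assumed to point at the queue being initialized. */
        unsigned int start_buf = 0;
        unsigned int count = QDIO_MAX_BUFFERS_PER_Q;

        set_slsb(q, &start_buf, SLSB_P_INPUT_NOT_INIT, &count);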
1594 1875
@@ -1656,7 +1937,7 @@ qdio_set_state(struct qdio_irq *irq_ptr, enum qdio_irq_states state)
1656 char dbf_text[15]; 1937 char dbf_text[15];
1657 1938
1658 QDIO_DBF_TEXT5(0,trace,"newstate"); 1939 QDIO_DBF_TEXT5(0,trace,"newstate");
1659 sprintf(dbf_text,"%4x%4x",irq_ptr->irq,state); 1940 sprintf(dbf_text,"%4x%4x",irq_ptr->schid.sch_no,state);
1660 QDIO_DBF_TEXT5(0,trace,dbf_text); 1941 QDIO_DBF_TEXT5(0,trace,dbf_text);
1661#endif /* CONFIG_QDIO_DEBUG */ 1942#endif /* CONFIG_QDIO_DEBUG */
1662 1943
@@ -1669,12 +1950,12 @@ qdio_set_state(struct qdio_irq *irq_ptr, enum qdio_irq_states state)
1669} 1950}
1670 1951
1671static inline void 1952static inline void
1672qdio_irq_check_sense(int irq, struct irb *irb) 1953qdio_irq_check_sense(struct subchannel_id schid, struct irb *irb)
1673{ 1954{
1674 char dbf_text[15]; 1955 char dbf_text[15];
1675 1956
1676 if (irb->esw.esw0.erw.cons) { 1957 if (irb->esw.esw0.erw.cons) {
1677 sprintf(dbf_text,"sens%4x",irq); 1958 sprintf(dbf_text,"sens%4x",schid.sch_no);
1678 QDIO_DBF_TEXT2(1,trace,dbf_text); 1959 QDIO_DBF_TEXT2(1,trace,dbf_text);
1679 QDIO_DBF_HEX0(0,sense,irb,QDIO_DBF_SENSE_LEN); 1960 QDIO_DBF_HEX0(0,sense,irb,QDIO_DBF_SENSE_LEN);
1680 1961
@@ -1785,21 +2066,22 @@ qdio_timeout_handler(struct ccw_device *cdev)
1785 2066
1786 switch (irq_ptr->state) { 2067 switch (irq_ptr->state) {
1787 case QDIO_IRQ_STATE_INACTIVE: 2068 case QDIO_IRQ_STATE_INACTIVE:
1788 QDIO_PRINT_ERR("establish queues on irq %04x: timed out\n", 2069 QDIO_PRINT_ERR("establish queues on irq 0.%x.%04x: timed out\n",
1789 irq_ptr->irq); 2070 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
1790 QDIO_DBF_TEXT2(1,setup,"eq:timeo"); 2071 QDIO_DBF_TEXT2(1,setup,"eq:timeo");
1791 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR); 2072 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
1792 break; 2073 break;
1793 case QDIO_IRQ_STATE_CLEANUP: 2074 case QDIO_IRQ_STATE_CLEANUP:
1794 QDIO_PRINT_INFO("Did not get interrupt on cleanup, irq=0x%x.\n", 2075 QDIO_PRINT_INFO("Did not get interrupt on cleanup, "
1795 irq_ptr->irq); 2076 "irq=0.%x.%x.\n",
2077 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
1796 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR); 2078 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
1797 break; 2079 break;
1798 case QDIO_IRQ_STATE_ESTABLISHED: 2080 case QDIO_IRQ_STATE_ESTABLISHED:
1799 case QDIO_IRQ_STATE_ACTIVE: 2081 case QDIO_IRQ_STATE_ACTIVE:
1800 /* I/O has been terminated by common I/O layer. */ 2082 /* I/O has been terminated by common I/O layer. */
1801 QDIO_PRINT_INFO("Queues on irq %04x killed by cio.\n", 2083 QDIO_PRINT_INFO("Queues on irq 0.%x.%04x killed by cio.\n",
1802 irq_ptr->irq); 2084 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
1803 QDIO_DBF_TEXT2(1, trace, "cio:term"); 2085 QDIO_DBF_TEXT2(1, trace, "cio:term");
1804 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_STOPPED); 2086 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_STOPPED);
1805 if (get_device(&cdev->dev)) { 2087 if (get_device(&cdev->dev)) {
@@ -1862,7 +2144,7 @@ qdio_handler(struct ccw_device *cdev, unsigned long intparm, struct irb *irb)
1862 } 2144 }
1863 } 2145 }
1864 2146
1865 qdio_irq_check_sense(irq_ptr->irq, irb); 2147 qdio_irq_check_sense(irq_ptr->schid, irb);
1866 2148
1867#ifdef CONFIG_QDIO_DEBUG 2149#ifdef CONFIG_QDIO_DEBUG
1868 sprintf(dbf_text, "state:%d", irq_ptr->state); 2150 sprintf(dbf_text, "state:%d", irq_ptr->state);
@@ -1905,7 +2187,7 @@ int
1905qdio_synchronize(struct ccw_device *cdev, unsigned int flags, 2187qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
1906 unsigned int queue_number) 2188 unsigned int queue_number)
1907{ 2189{
1908 int cc; 2190 int cc = 0;
1909 struct qdio_q *q; 2191 struct qdio_q *q;
1910 struct qdio_irq *irq_ptr; 2192 struct qdio_irq *irq_ptr;
1911 void *ptr; 2193 void *ptr;
@@ -1918,7 +2200,7 @@ qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
1918 return -ENODEV; 2200 return -ENODEV;
1919 2201
1920#ifdef CONFIG_QDIO_DEBUG 2202#ifdef CONFIG_QDIO_DEBUG
1921 *((int*)(&dbf_text[4])) = irq_ptr->irq; 2203 *((int*)(&dbf_text[4])) = irq_ptr->schid.sch_no;
1922 QDIO_DBF_HEX4(0,trace,dbf_text,QDIO_DBF_TRACE_LEN); 2204 QDIO_DBF_HEX4(0,trace,dbf_text,QDIO_DBF_TRACE_LEN);
1923 *((int*)(&dbf_text[0]))=flags; 2205 *((int*)(&dbf_text[0]))=flags;
1924 *((int*)(&dbf_text[4]))=queue_number; 2206 *((int*)(&dbf_text[4]))=queue_number;
@@ -1929,12 +2211,14 @@ qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
1929 q=irq_ptr->input_qs[queue_number]; 2211 q=irq_ptr->input_qs[queue_number];
1930 if (!q) 2212 if (!q)
1931 return -EINVAL; 2213 return -EINVAL;
1932 cc = do_siga_sync(q->irq, 0, q->mask); 2214 if (!(irq_ptr->is_qebsm))
2215 cc = do_siga_sync(q->schid, 0, q->mask);
1933 } else if (flags&QDIO_FLAG_SYNC_OUTPUT) { 2216 } else if (flags&QDIO_FLAG_SYNC_OUTPUT) {
1934 q=irq_ptr->output_qs[queue_number]; 2217 q=irq_ptr->output_qs[queue_number];
1935 if (!q) 2218 if (!q)
1936 return -EINVAL; 2219 return -EINVAL;
1937 cc = do_siga_sync(q->irq, q->mask, 0); 2220 if (!(irq_ptr->is_qebsm))
2221 cc = do_siga_sync(q->schid, q->mask, 0);
1938 } else 2222 } else
1939 return -EINVAL; 2223 return -EINVAL;
1940 2224
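
Note why cc now starts at 0 in qdio_synchronize: under QEBSM the SIGA-sync is skipped entirely, since the hardware keeps queue state coherent on its own, and without the initializer the skipped path would return an uninitialized cc. In outline:

        /* Sketch: SIGA-sync degenerates to a no-op under QEBSM. */
        int cc = 0;

        if (!irq_ptr->is_qebsm)
                cc = do_siga_sync(q->schid, 0, q->mask);  /* input queue */
        return cc;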
@@ -1945,15 +2229,54 @@ qdio_synchronize(struct ccw_device *cdev, unsigned int flags,
1945 return cc; 2229 return cc;
1946} 2230}
1947 2231
1948static unsigned char 2232static inline void
1949qdio_check_siga_needs(int sch) 2233qdio_check_subchannel_qebsm(struct qdio_irq *irq_ptr, unsigned char qdioac,
2234 unsigned long token)
2235{
2236 struct qdio_q *q;
2237 int i;
2238 unsigned int count, start_buf;
2239 char dbf_text[15];
2240
 2241 /* check if QEBSM is disabled */
2242 if (!(irq_ptr->is_qebsm) || !(qdioac & 0x01)) {
2243 irq_ptr->is_qebsm = 0;
2244 irq_ptr->sch_token = 0;
2245 irq_ptr->qib.rflags &= ~QIB_RFLAGS_ENABLE_QEBSM;
2246 QDIO_DBF_TEXT0(0,setup,"noV=V");
2247 return;
2248 }
2249 irq_ptr->sch_token = token;
2250 /*input queue*/
2251 for (i = 0; i < irq_ptr->no_input_qs;i++) {
2252 q = irq_ptr->input_qs[i];
2253 count = QDIO_MAX_BUFFERS_PER_Q;
2254 start_buf = 0;
2255 set_slsb(q, &start_buf, SLSB_P_INPUT_NOT_INIT, &count);
2256 }
2257 sprintf(dbf_text,"V=V:%2x",irq_ptr->is_qebsm);
2258 QDIO_DBF_TEXT0(0,setup,dbf_text);
2259 sprintf(dbf_text,"%8lx",irq_ptr->sch_token);
2260 QDIO_DBF_TEXT0(0,setup,dbf_text);
 2261 /* output queue */
2262 for (i = 0; i < irq_ptr->no_output_qs; i++) {
2263 q = irq_ptr->output_qs[i];
2264 count = QDIO_MAX_BUFFERS_PER_Q;
2265 start_buf = 0;
2266 set_slsb(q, &start_buf, SLSB_P_OUTPUT_NOT_INIT, &count);
2267 }
2268}
2269
2270static void
2271qdio_get_ssqd_information(struct qdio_irq *irq_ptr)
1950{ 2272{
1951 int result; 2273 int result;
1952 unsigned char qdioac; 2274 unsigned char qdioac;
1953
1954 struct { 2275 struct {
1955 struct chsc_header request; 2276 struct chsc_header request;
1956 u16 reserved1; 2277 u16 reserved1:10;
2278 u16 ssid:2;
2279 u16 fmt:4;
1957 u16 first_sch; 2280 u16 first_sch;
1958 u16 reserved2; 2281 u16 reserved2;
1959 u16 last_sch; 2282 u16 last_sch;
@@ -1964,67 +2287,83 @@ qdio_check_siga_needs(int sch)
1964 u8 reserved5; 2287 u8 reserved5;
1965 u16 sch; 2288 u16 sch;
1966 u8 qfmt; 2289 u8 qfmt;
1967 u8 reserved6; 2290 u8 parm;
1968 u8 qdioac; 2291 u8 qdioac1;
1969 u8 sch_class; 2292 u8 sch_class;
1970 u8 reserved7; 2293 u8 reserved7;
1971 u8 icnt; 2294 u8 icnt;
1972 u8 reserved8; 2295 u8 reserved8;
1973 u8 ocnt; 2296 u8 ocnt;
2297 u8 reserved9;
2298 u8 mbccnt;
2299 u16 qdioac2;
2300 u64 sch_token;
1974 } *ssqd_area; 2301 } *ssqd_area;
1975 2302
2303 QDIO_DBF_TEXT0(0,setup,"getssqd");
2304 qdioac = 0;
1976 ssqd_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); 2305 ssqd_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
1977 if (!ssqd_area) { 2306 if (!ssqd_area) {
1978 QDIO_PRINT_WARN("Could not get memory for chsc. Using all " \ 2307 QDIO_PRINT_WARN("Could not get memory for chsc. Using all " \
1979 "SIGAs for sch x%x.\n", sch); 2308 "SIGAs for sch x%x.\n", irq_ptr->schid.sch_no);
1980 return CHSC_FLAG_SIGA_INPUT_NECESSARY || 2309 irq_ptr->qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
1981 CHSC_FLAG_SIGA_OUTPUT_NECESSARY || 2310 CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
1982 CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */ 2311 CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
2312 irq_ptr->is_qebsm = 0;
2313 irq_ptr->sch_token = 0;
2314 irq_ptr->qib.rflags &= ~QIB_RFLAGS_ENABLE_QEBSM;
2315 return;
1983 } 2316 }
2317
1984 ssqd_area->request = (struct chsc_header) { 2318 ssqd_area->request = (struct chsc_header) {
1985 .length = 0x0010, 2319 .length = 0x0010,
1986 .code = 0x0024, 2320 .code = 0x0024,
1987 }; 2321 };
1988 2322 ssqd_area->first_sch = irq_ptr->schid.sch_no;
1989 ssqd_area->first_sch = sch; 2323 ssqd_area->last_sch = irq_ptr->schid.sch_no;
1990 ssqd_area->last_sch = sch; 2324 ssqd_area->ssid = irq_ptr->schid.ssid;
1991 2325 result = chsc(ssqd_area);
1992 result=chsc(ssqd_area);
1993 2326
1994 if (result) { 2327 if (result) {
1995 QDIO_PRINT_WARN("CHSC returned cc %i. Using all " \ 2328 QDIO_PRINT_WARN("CHSC returned cc %i. Using all " \
1996 "SIGAs for sch x%x.\n", 2329 "SIGAs for sch 0.%x.%x.\n", result,
1997 result,sch); 2330 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
1998 qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY || 2331 qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
1999 CHSC_FLAG_SIGA_OUTPUT_NECESSARY || 2332 CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
2000 CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */ 2333 CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
2334 irq_ptr->is_qebsm = 0;
2001 goto out; 2335 goto out;
2002 } 2336 }
2003 2337
2004 if (ssqd_area->response.code != QDIO_CHSC_RESPONSE_CODE_OK) { 2338 if (ssqd_area->response.code != QDIO_CHSC_RESPONSE_CODE_OK) {
2005 QDIO_PRINT_WARN("response upon checking SIGA needs " \ 2339 QDIO_PRINT_WARN("response upon checking SIGA needs " \
2006 "is 0x%x. Using all SIGAs for sch x%x.\n", 2340 "is 0x%x. Using all SIGAs for sch 0.%x.%x.\n",
2007 ssqd_area->response.code, sch); 2341 ssqd_area->response.code,
2342 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
2008 qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY || 2343 qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY ||
2009 CHSC_FLAG_SIGA_OUTPUT_NECESSARY || 2344 CHSC_FLAG_SIGA_OUTPUT_NECESSARY ||
2010 CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */ 2345 CHSC_FLAG_SIGA_SYNC_NECESSARY; /* all flags set */
2346 irq_ptr->is_qebsm = 0;
2011 goto out; 2347 goto out;
2012 } 2348 }
2013 if (!(ssqd_area->flags & CHSC_FLAG_QDIO_CAPABILITY) || 2349 if (!(ssqd_area->flags & CHSC_FLAG_QDIO_CAPABILITY) ||
2014 !(ssqd_area->flags & CHSC_FLAG_VALIDITY) || 2350 !(ssqd_area->flags & CHSC_FLAG_VALIDITY) ||
2015 (ssqd_area->sch != sch)) { 2351 (ssqd_area->sch != irq_ptr->schid.sch_no)) {
2016 QDIO_PRINT_WARN("huh? problems checking out sch x%x... " \ 2352 QDIO_PRINT_WARN("huh? problems checking out sch 0.%x.%x... " \
2017 "using all SIGAs.\n",sch); 2353 "using all SIGAs.\n",
2354 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
2018 qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY | 2355 qdioac = CHSC_FLAG_SIGA_INPUT_NECESSARY |
2019 CHSC_FLAG_SIGA_OUTPUT_NECESSARY | 2356 CHSC_FLAG_SIGA_OUTPUT_NECESSARY |
2020 CHSC_FLAG_SIGA_SYNC_NECESSARY; /* worst case */ 2357 CHSC_FLAG_SIGA_SYNC_NECESSARY; /* worst case */
2358 irq_ptr->is_qebsm = 0;
2021 goto out; 2359 goto out;
2022 } 2360 }
2023 2361 qdioac = ssqd_area->qdioac1;
2024 qdioac = ssqd_area->qdioac;
2025out: 2362out:
2363 qdio_check_subchannel_qebsm(irq_ptr, qdioac,
2364 ssqd_area->sch_token);
2026 free_page ((unsigned long) ssqd_area); 2365 free_page ((unsigned long) ssqd_area);
2027 return qdioac; 2366 irq_ptr->qdioac = qdioac;
2028} 2367}
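
qdio_get_ssqd_information replaces qdio_check_siga_needs: the CHSC request (command 0x0024) now carries the ssid alongside a one-subchannel range, and the response block gains qdioac2 and the sch_token that QEBSM addressing needs. One aside: three of the fallback paths combine the CHSC_FLAG_SIGA_* constants with logical ||, which yields 1 rather than the "all flags set" mask the comments claim; the bitwise | in the final branch matches the stated intent. The essential handshake, condensed:

        /* Sketch: fetch the SSQD block for exactly one subchannel. */
        ssqd_area->request = (struct chsc_header) {
                .length = 0x0010,
                .code   = 0x0024,           /* store subchannel QDIO data */
        };
        ssqd_area->ssid      = irq_ptr->schid.ssid;
        ssqd_area->first_sch = irq_ptr->schid.sch_no;
        ssqd_area->last_sch  = irq_ptr->schid.sch_no;   /* range of one */

        if (!chsc(ssqd_area) &&
            ssqd_area->response.code == QDIO_CHSC_RESPONSE_CODE_OK)
                qdioac = ssqd_area->qdioac1;    /* which SIGAs are needed */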
2029 2368
2030static unsigned int 2369static unsigned int
@@ -2055,6 +2394,13 @@ tiqdio_check_chsc_availability(void)
2055 sprintf(dbf_text,"hydrati%1x", hydra_thinints); 2394 sprintf(dbf_text,"hydrati%1x", hydra_thinints);
2056 QDIO_DBF_TEXT0(0,setup,dbf_text); 2395 QDIO_DBF_TEXT0(0,setup,dbf_text);
2057 2396
2397#ifdef CONFIG_64BIT
2398 /* Check for QEBSM support in general (bit 58). */
2399 is_passthrough = css_general_characteristics.qebsm;
2400#endif
2401 sprintf(dbf_text,"cssQBS:%1x", is_passthrough);
2402 QDIO_DBF_TEXT0(0,setup,dbf_text);
2403
2058 /* Check for aif time delay disablement fac (bit 56). If installed, 2404 /* Check for aif time delay disablement fac (bit 56). If installed,
2059 * omit svs even under lpar (good point by rick again) */ 2405 * omit svs even under lpar (good point by rick again) */
2060 omit_svs = css_general_characteristics.aif_tdd; 2406 omit_svs = css_general_characteristics.aif_tdd;
@@ -2091,7 +2437,7 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
2091 /* set to 0x10000000 to enable 2437 /* set to 0x10000000 to enable
2092 * time delay disablement facility */ 2438 * time delay disablement facility */
2093 u32 reserved5; 2439 u32 reserved5;
2094 u32 subsystem_id; 2440 struct subchannel_id schid;
2095 u32 reserved6[1004]; 2441 u32 reserved6[1004];
2096 struct chsc_header response; 2442 struct chsc_header response;
2097 u32 reserved7; 2443 u32 reserved7;
@@ -2113,7 +2459,8 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
2113 scssc_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); 2459 scssc_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
2114 if (!scssc_area) { 2460 if (!scssc_area) {
2115 QDIO_PRINT_WARN("No memory for setting indicators on " \ 2461 QDIO_PRINT_WARN("No memory for setting indicators on " \
2116 "subchannel x%x.\n", irq_ptr->irq); 2462 "subchannel 0.%x.%x.\n",
2463 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
2117 return -ENOMEM; 2464 return -ENOMEM;
2118 } 2465 }
2119 scssc_area->request = (struct chsc_header) { 2466 scssc_area->request = (struct chsc_header) {
@@ -2127,7 +2474,7 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
2127 scssc_area->ks = QDIO_STORAGE_KEY; 2474 scssc_area->ks = QDIO_STORAGE_KEY;
2128 scssc_area->kc = QDIO_STORAGE_KEY; 2475 scssc_area->kc = QDIO_STORAGE_KEY;
2129 scssc_area->isc = TIQDIO_THININT_ISC; 2476 scssc_area->isc = TIQDIO_THININT_ISC;
2130 scssc_area->subsystem_id = (1<<16) + irq_ptr->irq; 2477 scssc_area->schid = irq_ptr->schid;
2131 /* enables the time delay disablement facility. Don't care 2478 /* enables the time delay disablement facility. Don't care
2132 * whether it is really there (i.e. we haven't checked for 2479 * whether it is really there (i.e. we haven't checked for
2133 * it) */ 2480 * it) */
@@ -2137,12 +2484,11 @@ tiqdio_set_subchannel_ind(struct qdio_irq *irq_ptr, int reset_to_zero)
2137 QDIO_PRINT_WARN("Time delay disablement facility " \ 2484 QDIO_PRINT_WARN("Time delay disablement facility " \
2138 "not available\n"); 2485 "not available\n");
2139 2486
2140
2141
2142 result = chsc(scssc_area); 2487 result = chsc(scssc_area);
2143 if (result) { 2488 if (result) {
2144 QDIO_PRINT_WARN("could not set indicators on irq x%x, " \ 2489 QDIO_PRINT_WARN("could not set indicators on irq 0.%x.%x, " \
2145 "cc=%i.\n",irq_ptr->irq,result); 2490 "cc=%i.\n",
2491 irq_ptr->schid.ssid, irq_ptr->schid.sch_no,result);
2146 result = -EIO; 2492 result = -EIO;
2147 goto out; 2493 goto out;
2148 } 2494 }
@@ -2198,7 +2544,8 @@ tiqdio_set_delay_target(struct qdio_irq *irq_ptr, unsigned long delay_target)
2198 scsscf_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA); 2544 scsscf_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
2199 if (!scsscf_area) { 2545 if (!scsscf_area) {
2200 QDIO_PRINT_WARN("No memory for setting delay target on " \ 2546 QDIO_PRINT_WARN("No memory for setting delay target on " \
2201 "subchannel x%x.\n", irq_ptr->irq); 2547 "subchannel 0.%x.%x.\n",
2548 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
2202 return -ENOMEM; 2549 return -ENOMEM;
2203 } 2550 }
2204 scsscf_area->request = (struct chsc_header) { 2551 scsscf_area->request = (struct chsc_header) {
@@ -2210,8 +2557,10 @@ tiqdio_set_delay_target(struct qdio_irq *irq_ptr, unsigned long delay_target)
2210 2557
2211 result=chsc(scsscf_area); 2558 result=chsc(scsscf_area);
2212 if (result) { 2559 if (result) {
2213 QDIO_PRINT_WARN("could not set delay target on irq x%x, " \ 2560 QDIO_PRINT_WARN("could not set delay target on irq 0.%x.%x, " \
2214 "cc=%i. Continuing.\n",irq_ptr->irq,result); 2561 "cc=%i. Continuing.\n",
2562 irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
2563 result);
2215 result = -EIO; 2564 result = -EIO;
2216 goto out; 2565 goto out;
2217 } 2566 }
@@ -2245,7 +2594,7 @@ qdio_cleanup(struct ccw_device *cdev, int how)
2245 if (!irq_ptr) 2594 if (!irq_ptr)
2246 return -ENODEV; 2595 return -ENODEV;
2247 2596
2248 sprintf(dbf_text,"qcln%4x",irq_ptr->irq); 2597 sprintf(dbf_text,"qcln%4x",irq_ptr->schid.sch_no);
2249 QDIO_DBF_TEXT1(0,trace,dbf_text); 2598 QDIO_DBF_TEXT1(0,trace,dbf_text);
2250 QDIO_DBF_TEXT0(0,setup,dbf_text); 2599 QDIO_DBF_TEXT0(0,setup,dbf_text);
2251 2600
@@ -2272,7 +2621,7 @@ qdio_shutdown(struct ccw_device *cdev, int how)
2272 2621
2273 down(&irq_ptr->setting_up_sema); 2622 down(&irq_ptr->setting_up_sema);
2274 2623
2275 sprintf(dbf_text,"qsqs%4x",irq_ptr->irq); 2624 sprintf(dbf_text,"qsqs%4x",irq_ptr->schid.sch_no);
2276 QDIO_DBF_TEXT1(0,trace,dbf_text); 2625 QDIO_DBF_TEXT1(0,trace,dbf_text);
2277 QDIO_DBF_TEXT0(0,setup,dbf_text); 2626 QDIO_DBF_TEXT0(0,setup,dbf_text);
2278 2627
@@ -2378,7 +2727,7 @@ qdio_free(struct ccw_device *cdev)
2378 2727
2379 down(&irq_ptr->setting_up_sema); 2728 down(&irq_ptr->setting_up_sema);
2380 2729
2381 sprintf(dbf_text,"qfqs%4x",irq_ptr->irq); 2730 sprintf(dbf_text,"qfqs%4x",irq_ptr->schid.sch_no);
2382 QDIO_DBF_TEXT1(0,trace,dbf_text); 2731 QDIO_DBF_TEXT1(0,trace,dbf_text);
2383 QDIO_DBF_TEXT0(0,setup,dbf_text); 2732 QDIO_DBF_TEXT0(0,setup,dbf_text);
2384 2733
@@ -2526,13 +2875,14 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
2526 irq_ptr = cdev->private->qdio_data; 2875 irq_ptr = cdev->private->qdio_data;
2527 2876
2528 if (cstat || (dstat & ~(DEV_STAT_CHN_END|DEV_STAT_DEV_END))) { 2877 if (cstat || (dstat & ~(DEV_STAT_CHN_END|DEV_STAT_DEV_END))) {
2529 sprintf(dbf_text,"ick1%4x",irq_ptr->irq); 2878 sprintf(dbf_text,"ick1%4x",irq_ptr->schid.sch_no);
2530 QDIO_DBF_TEXT2(1,trace,dbf_text); 2879 QDIO_DBF_TEXT2(1,trace,dbf_text);
2531 QDIO_DBF_HEX2(0,trace,&dstat,sizeof(int)); 2880 QDIO_DBF_HEX2(0,trace,&dstat,sizeof(int));
2532 QDIO_DBF_HEX2(0,trace,&cstat,sizeof(int)); 2881 QDIO_DBF_HEX2(0,trace,&cstat,sizeof(int));
2533 QDIO_PRINT_ERR("received check condition on establish " \ 2882 QDIO_PRINT_ERR("received check condition on establish " \
2534 "queues on irq 0x%x (cs=x%x, ds=x%x).\n", 2883 "queues on irq 0.%x.%x (cs=x%x, ds=x%x).\n",
2535 irq_ptr->irq,cstat,dstat); 2884 irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
2885 cstat,dstat);
2536 qdio_set_state(irq_ptr,QDIO_IRQ_STATE_ERR); 2886 qdio_set_state(irq_ptr,QDIO_IRQ_STATE_ERR);
2537 } 2887 }
2538 2888
@@ -2540,9 +2890,10 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
2540 QDIO_DBF_TEXT2(1,setup,"eq:no de"); 2890 QDIO_DBF_TEXT2(1,setup,"eq:no de");
2541 QDIO_DBF_HEX2(0,setup,&dstat, sizeof(dstat)); 2891 QDIO_DBF_HEX2(0,setup,&dstat, sizeof(dstat));
2542 QDIO_DBF_HEX2(0,setup,&cstat, sizeof(cstat)); 2892 QDIO_DBF_HEX2(0,setup,&cstat, sizeof(cstat));
2543 QDIO_PRINT_ERR("establish queues on irq %04x: didn't get " 2893 QDIO_PRINT_ERR("establish queues on irq 0.%x.%04x: didn't get "
2544 "device end: dstat=%02x, cstat=%02x\n", 2894 "device end: dstat=%02x, cstat=%02x\n",
2545 irq_ptr->irq, dstat, cstat); 2895 irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
2896 dstat, cstat);
2546 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR); 2897 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
2547 return 1; 2898 return 1;
2548 } 2899 }
@@ -2551,10 +2902,10 @@ qdio_establish_irq_check_for_errors(struct ccw_device *cdev, int cstat,
2551 QDIO_DBF_TEXT2(1,setup,"eq:badio"); 2902 QDIO_DBF_TEXT2(1,setup,"eq:badio");
2552 QDIO_DBF_HEX2(0,setup,&dstat, sizeof(dstat)); 2903 QDIO_DBF_HEX2(0,setup,&dstat, sizeof(dstat));
2553 QDIO_DBF_HEX2(0,setup,&cstat, sizeof(cstat)); 2904 QDIO_DBF_HEX2(0,setup,&cstat, sizeof(cstat));
2554 QDIO_PRINT_ERR("establish queues on irq %04x: got " 2905 QDIO_PRINT_ERR("establish queues on irq 0.%x.%04x: got "
2555 "the following devstat: dstat=%02x, " 2906 "the following devstat: dstat=%02x, "
2556 "cstat=%02x\n", 2907 "cstat=%02x\n", irq_ptr->schid.ssid,
2557 irq_ptr->irq, dstat, cstat); 2908 irq_ptr->schid.sch_no, dstat, cstat);
2558 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR); 2909 qdio_set_state(irq_ptr, QDIO_IRQ_STATE_ERR);
2559 return 1; 2910 return 1;
2560 } 2911 }
@@ -2569,7 +2920,7 @@ qdio_establish_handle_irq(struct ccw_device *cdev, int cstat, int dstat)
2569 2920
2570 irq_ptr = cdev->private->qdio_data; 2921 irq_ptr = cdev->private->qdio_data;
2571 2922
2572 sprintf(dbf_text,"qehi%4x",cdev->private->irq); 2923 sprintf(dbf_text,"qehi%4x",cdev->private->sch_no);
2573 QDIO_DBF_TEXT0(0,setup,dbf_text); 2924 QDIO_DBF_TEXT0(0,setup,dbf_text);
2574 QDIO_DBF_TEXT0(0,trace,dbf_text); 2925 QDIO_DBF_TEXT0(0,trace,dbf_text);
2575 2926
@@ -2588,7 +2939,7 @@ qdio_initialize(struct qdio_initialize *init_data)
2588 int rc; 2939 int rc;
2589 char dbf_text[15]; 2940 char dbf_text[15];
2590 2941
2591 sprintf(dbf_text,"qini%4x",init_data->cdev->private->irq); 2942 sprintf(dbf_text,"qini%4x",init_data->cdev->private->sch_no);
2592 QDIO_DBF_TEXT0(0,setup,dbf_text); 2943 QDIO_DBF_TEXT0(0,setup,dbf_text);
2593 QDIO_DBF_TEXT0(0,trace,dbf_text); 2944 QDIO_DBF_TEXT0(0,trace,dbf_text);
2594 2945
@@ -2609,7 +2960,7 @@ qdio_allocate(struct qdio_initialize *init_data)
2609 struct qdio_irq *irq_ptr; 2960 struct qdio_irq *irq_ptr;
2610 char dbf_text[15]; 2961 char dbf_text[15];
2611 2962
2612 sprintf(dbf_text,"qalc%4x",init_data->cdev->private->irq); 2963 sprintf(dbf_text,"qalc%4x",init_data->cdev->private->sch_no);
2613 QDIO_DBF_TEXT0(0,setup,dbf_text); 2964 QDIO_DBF_TEXT0(0,setup,dbf_text);
2614 QDIO_DBF_TEXT0(0,trace,dbf_text); 2965 QDIO_DBF_TEXT0(0,trace,dbf_text);
2615 if ( (init_data->no_input_qs>QDIO_MAX_QUEUES_PER_IRQ) || 2966 if ( (init_data->no_input_qs>QDIO_MAX_QUEUES_PER_IRQ) ||
@@ -2682,7 +3033,7 @@ int qdio_fill_irq(struct qdio_initialize *init_data)
2682 3033
2683 irq_ptr->int_parm=init_data->int_parm; 3034 irq_ptr->int_parm=init_data->int_parm;
2684 3035
2685 irq_ptr->irq = init_data->cdev->private->irq; 3036 irq_ptr->schid = ccw_device_get_subchannel_id(init_data->cdev);
2686 irq_ptr->no_input_qs=init_data->no_input_qs; 3037 irq_ptr->no_input_qs=init_data->no_input_qs;
2687 irq_ptr->no_output_qs=init_data->no_output_qs; 3038 irq_ptr->no_output_qs=init_data->no_output_qs;
2688 3039
@@ -2698,11 +3049,12 @@ int qdio_fill_irq(struct qdio_initialize *init_data)
2698 QDIO_DBF_TEXT2(0,setup,dbf_text); 3049 QDIO_DBF_TEXT2(0,setup,dbf_text);
2699 3050
2700 if (irq_ptr->is_thinint_irq) { 3051 if (irq_ptr->is_thinint_irq) {
2701 irq_ptr->dev_st_chg_ind=qdio_get_indicator(); 3052 irq_ptr->dev_st_chg_ind = qdio_get_indicator();
2702 QDIO_DBF_HEX1(0,setup,&irq_ptr->dev_st_chg_ind,sizeof(void*)); 3053 QDIO_DBF_HEX1(0,setup,&irq_ptr->dev_st_chg_ind,sizeof(void*));
2703 if (!irq_ptr->dev_st_chg_ind) { 3054 if (!irq_ptr->dev_st_chg_ind) {
2704 QDIO_PRINT_WARN("no indicator location available " \ 3055 QDIO_PRINT_WARN("no indicator location available " \
2705 "for irq 0x%x\n",irq_ptr->irq); 3056 "for irq 0.%x.%x\n",
3057 irq_ptr->schid.ssid, irq_ptr->schid.sch_no);
2706 qdio_release_irq_memory(irq_ptr); 3058 qdio_release_irq_memory(irq_ptr);
2707 return -ENOBUFS; 3059 return -ENOBUFS;
2708 } 3060 }
@@ -2747,6 +3099,10 @@ int qdio_fill_irq(struct qdio_initialize *init_data)
2747 irq_ptr->qdr->qkey=QDIO_STORAGE_KEY; 3099 irq_ptr->qdr->qkey=QDIO_STORAGE_KEY;
2748 3100
2749 /* fill in qib */ 3101 /* fill in qib */
3102 irq_ptr->is_qebsm = is_passthrough;
3103 if (irq_ptr->is_qebsm)
3104 irq_ptr->qib.rflags |= QIB_RFLAGS_ENABLE_QEBSM;
3105
2750 irq_ptr->qib.qfmt=init_data->q_format; 3106 irq_ptr->qib.qfmt=init_data->q_format;
2751 if (init_data->no_input_qs) 3107 if (init_data->no_input_qs)
2752 irq_ptr->qib.isliba=(unsigned long)(irq_ptr->input_qs[0]->slib); 3108 irq_ptr->qib.isliba=(unsigned long)(irq_ptr->input_qs[0]->slib);
@@ -2829,7 +3185,7 @@ qdio_establish(struct qdio_initialize *init_data)
2829 tiqdio_set_delay_target(irq_ptr,TIQDIO_DELAY_TARGET); 3185 tiqdio_set_delay_target(irq_ptr,TIQDIO_DELAY_TARGET);
2830 } 3186 }
2831 3187
2832 sprintf(dbf_text,"qest%4x",cdev->private->irq); 3188 sprintf(dbf_text,"qest%4x",cdev->private->sch_no);
2833 QDIO_DBF_TEXT0(0,setup,dbf_text); 3189 QDIO_DBF_TEXT0(0,setup,dbf_text);
2834 QDIO_DBF_TEXT0(0,trace,dbf_text); 3190 QDIO_DBF_TEXT0(0,trace,dbf_text);
2835 3191
@@ -2855,9 +3211,10 @@ qdio_establish(struct qdio_initialize *init_data)
2855 sprintf(dbf_text,"eq:io%4x",result); 3211 sprintf(dbf_text,"eq:io%4x",result);
2856 QDIO_DBF_TEXT2(1,setup,dbf_text); 3212 QDIO_DBF_TEXT2(1,setup,dbf_text);
2857 } 3213 }
2858 QDIO_PRINT_WARN("establish queues on irq %04x: do_IO " \ 3214 QDIO_PRINT_WARN("establish queues on irq 0.%x.%04x: do_IO " \
2859 "returned %i, next try returned %i\n", 3215 "returned %i, next try returned %i\n",
2860 irq_ptr->irq,result,result2); 3216 irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
3217 result, result2);
2861 result=result2; 3218 result=result2;
2862 if (result) 3219 if (result)
2863 ccw_device_set_timeout(cdev, 0); 3220 ccw_device_set_timeout(cdev, 0);
@@ -2884,7 +3241,7 @@ qdio_establish(struct qdio_initialize *init_data)
2884 return -EIO; 3241 return -EIO;
2885 } 3242 }
2886 3243
2887 irq_ptr->qdioac=qdio_check_siga_needs(irq_ptr->irq); 3244 qdio_get_ssqd_information(irq_ptr);
2888 /* if this gets set once, we're running under VM and can omit SVSes */ 3245 /* if this gets set once, we're running under VM and can omit SVSes */
2889 if (irq_ptr->qdioac&CHSC_FLAG_SIGA_SYNC_NECESSARY) 3246 if (irq_ptr->qdioac&CHSC_FLAG_SIGA_SYNC_NECESSARY)
2890 omit_svs=1; 3247 omit_svs=1;
@@ -2930,7 +3287,7 @@ qdio_activate(struct ccw_device *cdev, int flags)
2930 goto out; 3287 goto out;
2931 } 3288 }
2932 3289
2933 sprintf(dbf_text,"qact%4x", irq_ptr->irq); 3290 sprintf(dbf_text,"qact%4x", irq_ptr->schid.sch_no);
2934 QDIO_DBF_TEXT2(0,setup,dbf_text); 3291 QDIO_DBF_TEXT2(0,setup,dbf_text);
2935 QDIO_DBF_TEXT2(0,trace,dbf_text); 3292 QDIO_DBF_TEXT2(0,trace,dbf_text);
2936 3293
@@ -2955,9 +3312,10 @@ qdio_activate(struct ccw_device *cdev, int flags)
2955 sprintf(dbf_text,"aq:io%4x",result); 3312 sprintf(dbf_text,"aq:io%4x",result);
2956 QDIO_DBF_TEXT2(1,setup,dbf_text); 3313 QDIO_DBF_TEXT2(1,setup,dbf_text);
2957 } 3314 }
2958 QDIO_PRINT_WARN("activate queues on irq %04x: do_IO " \ 3315 QDIO_PRINT_WARN("activate queues on irq 0.%x.%04x: do_IO " \
2959 "returned %i, next try returned %i\n", 3316 "returned %i, next try returned %i\n",
2960 irq_ptr->irq,result,result2); 3317 irq_ptr->schid.ssid, irq_ptr->schid.sch_no,
3318 result, result2);
2961 result=result2; 3319 result=result2;
2962 } 3320 }
2963 3321
@@ -3015,30 +3373,40 @@ static inline void
3015qdio_do_qdio_fill_input(struct qdio_q *q, unsigned int qidx, 3373qdio_do_qdio_fill_input(struct qdio_q *q, unsigned int qidx,
3016 unsigned int count, struct qdio_buffer *buffers) 3374 unsigned int count, struct qdio_buffer *buffers)
3017{ 3375{
3376 struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
3377 qidx &= (QDIO_MAX_BUFFERS_PER_Q - 1);
3378 if (irq->is_qebsm) {
3379 while (count)
3380 set_slsb(q, &qidx, SLSB_CU_INPUT_EMPTY, &count);
3381 return;
3382 }
3018 for (;;) { 3383 for (;;) {
3019 set_slsb(&q->slsb.acc.val[qidx],SLSB_CU_INPUT_EMPTY); 3384 set_slsb(q, &qidx, SLSB_CU_INPUT_EMPTY, &count);
3020 count--; 3385 count--;
3021 if (!count) break; 3386 if (!count) break;
3022 qidx=(qidx+1)&(QDIO_MAX_BUFFERS_PER_Q-1); 3387 qidx = (qidx + 1) & (QDIO_MAX_BUFFERS_PER_Q - 1);
3023 } 3388 }
3024
3025 /* not necessary, as the queues are synced during the SIGA read */
3026 /*SYNC_MEMORY;*/
3027} 3389}
3028 3390
3029static inline void 3391static inline void
3030qdio_do_qdio_fill_output(struct qdio_q *q, unsigned int qidx, 3392qdio_do_qdio_fill_output(struct qdio_q *q, unsigned int qidx,
3031 unsigned int count, struct qdio_buffer *buffers) 3393 unsigned int count, struct qdio_buffer *buffers)
3032{ 3394{
3395 struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
3396
3397 qidx &= (QDIO_MAX_BUFFERS_PER_Q - 1);
3398 if (irq->is_qebsm) {
3399 while (count)
3400 set_slsb(q, &qidx, SLSB_CU_OUTPUT_PRIMED, &count);
3401 return;
3402 }
3403
3033 for (;;) { 3404 for (;;) {
3034 set_slsb(&q->slsb.acc.val[qidx],SLSB_CU_OUTPUT_PRIMED); 3405 set_slsb(q, &qidx, SLSB_CU_OUTPUT_PRIMED, &count);
3035 count--; 3406 count--;
3036 if (!count) break; 3407 if (!count) break;
3037 qidx=(qidx+1)&(QDIO_MAX_BUFFERS_PER_Q-1); 3408 qidx = (qidx + 1) & (QDIO_MAX_BUFFERS_PER_Q - 1);
3038 } 3409 }
3039
3040 /* SIGA write will sync the queues */
3041 /*SYNC_MEMORY;*/
3042} 3410}
3043 3411
3044static inline void 3412static inline void
@@ -3083,6 +3451,9 @@ do_qdio_handle_outbound(struct qdio_q *q, unsigned int callflags,
3083 struct qdio_buffer *buffers) 3451 struct qdio_buffer *buffers)
3084{ 3452{
3085 int used_elements; 3453 int used_elements;
3454 unsigned int cnt, start_buf;
3455 unsigned char state = 0;
3456 struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
3086 3457
3087 /* This is the outbound handling of queues */ 3458 /* This is the outbound handling of queues */
3088#ifdef QDIO_PERFORMANCE_STATS 3459#ifdef QDIO_PERFORMANCE_STATS
@@ -3115,9 +3486,15 @@ do_qdio_handle_outbound(struct qdio_q *q, unsigned int callflags,
3115 * SYNC_MEMORY :-/ ), we try to 3486 * SYNC_MEMORY :-/ ), we try to
3116 * fast-requeue buffers 3487 * fast-requeue buffers
3117 */ 3488 */
3118 if (q->slsb.acc.val[(qidx+QDIO_MAX_BUFFERS_PER_Q-1) 3489 if (irq->is_qebsm) {
3119 &(QDIO_MAX_BUFFERS_PER_Q-1)]!= 3490 cnt = 1;
3120 SLSB_CU_OUTPUT_PRIMED) { 3491 start_buf = ((qidx+QDIO_MAX_BUFFERS_PER_Q-1) &
3492 (QDIO_MAX_BUFFERS_PER_Q-1));
3493 qdio_do_eqbs(q, &state, &start_buf, &cnt);
3494 } else
3495 state = q->slsb.acc.val[(qidx+QDIO_MAX_BUFFERS_PER_Q-1)
3496 &(QDIO_MAX_BUFFERS_PER_Q-1) ];
3497 if (state != SLSB_CU_OUTPUT_PRIMED) {
3121 qdio_kick_outbound_q(q); 3498 qdio_kick_outbound_q(q);
3122 } else { 3499 } else {
3123 QDIO_DBF_TEXT3(0,trace, "fast-req"); 3500 QDIO_DBF_TEXT3(0,trace, "fast-req");
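
The fast-requeue test peeks at the slot just before qidx: if that buffer is still SLSB_CU_OUTPUT_PRIMED, the adapter has not caught up yet and the new buffers can ride along without another SIGA. The same two-way state read as elsewhere, factored into a hypothetical predicate:

        /* Sketch: does the previous outbound slot still hold work? */
        static int prev_slot_still_primed(struct qdio_q *q, unsigned int qidx)
        {
                struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr;
                unsigned int start = (qidx + QDIO_MAX_BUFFERS_PER_Q - 1) &
                                     (QDIO_MAX_BUFFERS_PER_Q - 1);
                unsigned int cnt = 1;
                unsigned char state = 0;

                if (irq->is_qebsm)
                        qdio_do_eqbs(q, &state, &start, &cnt);
                else
                        state = q->slsb.acc.val[start];
                return state == SLSB_CU_OUTPUT_PRIMED;
        }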
@@ -3150,7 +3527,7 @@ do_QDIO(struct ccw_device *cdev,unsigned int callflags,
3150#ifdef CONFIG_QDIO_DEBUG 3527#ifdef CONFIG_QDIO_DEBUG
3151 char dbf_text[20]; 3528 char dbf_text[20];
3152 3529
3153 sprintf(dbf_text,"doQD%04x",cdev->private->irq); 3530 sprintf(dbf_text,"doQD%04x",cdev->private->sch_no);
3154 QDIO_DBF_TEXT3(0,trace,dbf_text); 3531 QDIO_DBF_TEXT3(0,trace,dbf_text);
3155#endif /* CONFIG_QDIO_DEBUG */ 3532#endif /* CONFIG_QDIO_DEBUG */
3156 3533
diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index 328e31cc6854..fa385e761fe1 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -3,14 +3,15 @@
3 3
4#include <asm/page.h> 4#include <asm/page.h>
5 5
6#define VERSION_CIO_QDIO_H "$Revision: 1.33 $" 6#include "schid.h"
7
8#define VERSION_CIO_QDIO_H "$Revision: 1.40 $"
7 9
8#ifdef CONFIG_QDIO_DEBUG 10#ifdef CONFIG_QDIO_DEBUG
9#define QDIO_VERBOSE_LEVEL 9 11#define QDIO_VERBOSE_LEVEL 9
10#else /* CONFIG_QDIO_DEBUG */ 12#else /* CONFIG_QDIO_DEBUG */
11#define QDIO_VERBOSE_LEVEL 5 13#define QDIO_VERBOSE_LEVEL 5
12#endif /* CONFIG_QDIO_DEBUG */ 14#endif /* CONFIG_QDIO_DEBUG */
13
14#define QDIO_USE_PROCESSING_STATE 15#define QDIO_USE_PROCESSING_STATE
15 16
16#ifdef CONFIG_QDIO_PERF_STATS 17#ifdef CONFIG_QDIO_PERF_STATS
@@ -265,12 +266,64 @@ QDIO_PRINT_##importance(header "%02x %02x %02x %02x %02x %02x %02x %02x " \
265/* 266/*
266 * Some instructions as assembly 267 * Some instructions as assembly
267 */ 268 */
269
270static inline int
271do_sqbs(unsigned long sch, unsigned char state, int queue,
272 unsigned int *start, unsigned int *count)
273{
274#ifdef CONFIG_64BIT
275 register unsigned long _ccq asm ("0") = *count;
276 register unsigned long _sch asm ("1") = sch;
277 unsigned long _queuestart = ((unsigned long)queue << 32) | *start;
278
279 asm volatile (
280 " .insn rsy,0xeb000000008A,%1,0,0(%2)\n\t"
281 : "+d" (_ccq), "+d" (_queuestart)
282 : "d" ((unsigned long)state), "d" (_sch)
283 : "memory", "cc"
284 );
285 *count = _ccq & 0xff;
286 *start = _queuestart & 0xff;
287
288 return (_ccq >> 32) & 0xff;
289#else
290 return 0;
291#endif
292}
293
294static inline int
295do_eqbs(unsigned long sch, unsigned char *state, int queue,
296 unsigned int *start, unsigned int *count)
297{
298#ifdef CONFIG_64BIT
299 register unsigned long _ccq asm ("0") = *count;
300 register unsigned long _sch asm ("1") = sch;
301 unsigned long _queuestart = ((unsigned long)queue << 32) | *start;
302 unsigned long _state = 0;
303
304 asm volatile (
305 " .insn rrf,0xB99c0000,%1,%2,0,0 \n\t"
306 : "+d" (_ccq), "+d" (_queuestart), "+d" (_state)
307 : "d" (_sch)
308 : "memory", "cc"
309 );
310 *count = _ccq & 0xff;
311 *start = _queuestart & 0xff;
312 *state = _state & 0xff;
313
314 return (_ccq >> 32) & 0xff;
315#else
316 return 0;
317#endif
318}
319
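
do_sqbs and do_eqbs wrap the new SQBS/EQBS instructions, available only on 64-bit machines (hence the CONFIG_64BIT stubs): register 0 carries the ccq with the count in its low byte, register 1 the subchannel token, and queue number plus start buffer share one operand register, queue in the high word, start in the low byte. A hedged usage sketch; the driver's qdio_do_eqbs layers ccq retry handling on top, which is omitted here:

        /* Sketch: extract the common state of up to 'count' buffers,
         * starting at 'start', from inbound queue q->q_no. */
        unsigned char state;
        unsigned int start = q->first_to_check;
        unsigned int count = atomic_read(&q->number_of_buffers_used);
        int ccq;

        ccq = do_eqbs(irq->sch_token, &state, q->q_no, &start, &count);
        /* on return, 'start' points past the extracted run and 'count'
         * holds the unprocessed remainder; a nonzero ccq asks for a
         * retry or signals an error (handled by qdio_do_eqbs). */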
320
268static inline int 321static inline int
269do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2) 322do_siga_sync(struct subchannel_id schid, unsigned int mask1, unsigned int mask2)
270{ 323{
271 int cc; 324 int cc;
272 325
273#ifndef CONFIG_ARCH_S390X 326#ifndef CONFIG_64BIT
274 asm volatile ( 327 asm volatile (
275 "lhi 0,2 \n\t" 328 "lhi 0,2 \n\t"
276 "lr 1,%1 \n\t" 329 "lr 1,%1 \n\t"
@@ -280,10 +333,10 @@ do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
280 "ipm %0 \n\t" 333 "ipm %0 \n\t"
281 "srl %0,28 \n\t" 334 "srl %0,28 \n\t"
282 : "=d" (cc) 335 : "=d" (cc)
283 : "d" (0x10000|irq), "d" (mask1), "d" (mask2) 336 : "d" (schid), "d" (mask1), "d" (mask2)
284 : "cc", "0", "1", "2", "3" 337 : "cc", "0", "1", "2", "3"
285 ); 338 );
286#else /* CONFIG_ARCH_S390X */ 339#else /* CONFIG_64BIT */
287 asm volatile ( 340 asm volatile (
288 "lghi 0,2 \n\t" 341 "lghi 0,2 \n\t"
289 "llgfr 1,%1 \n\t" 342 "llgfr 1,%1 \n\t"
@@ -293,19 +346,19 @@ do_siga_sync(unsigned int irq, unsigned int mask1, unsigned int mask2)
293 "ipm %0 \n\t" 346 "ipm %0 \n\t"
294 "srl %0,28 \n\t" 347 "srl %0,28 \n\t"
295 : "=d" (cc) 348 : "=d" (cc)
296 : "d" (0x10000|irq), "d" (mask1), "d" (mask2) 349 : "d" (schid), "d" (mask1), "d" (mask2)
297 : "cc", "0", "1", "2", "3" 350 : "cc", "0", "1", "2", "3"
298 ); 351 );
299#endif /* CONFIG_ARCH_S390X */ 352#endif /* CONFIG_64BIT */
300 return cc; 353 return cc;
301} 354}
302 355
303static inline int 356static inline int
304do_siga_input(unsigned int irq, unsigned int mask) 357do_siga_input(struct subchannel_id schid, unsigned int mask)
305{ 358{
306 int cc; 359 int cc;
307 360
308#ifndef CONFIG_ARCH_S390X 361#ifndef CONFIG_64BIT
309 asm volatile ( 362 asm volatile (
310 "lhi 0,1 \n\t" 363 "lhi 0,1 \n\t"
311 "lr 1,%1 \n\t" 364 "lr 1,%1 \n\t"
@@ -314,10 +367,10 @@ do_siga_input(unsigned int irq, unsigned int mask)
314 "ipm %0 \n\t" 367 "ipm %0 \n\t"
315 "srl %0,28 \n\t" 368 "srl %0,28 \n\t"
316 : "=d" (cc) 369 : "=d" (cc)
317 : "d" (0x10000|irq), "d" (mask) 370 : "d" (schid), "d" (mask)
318 : "cc", "0", "1", "2", "memory" 371 : "cc", "0", "1", "2", "memory"
319 ); 372 );
320#else /* CONFIG_ARCH_S390X */ 373#else /* CONFIG_64BIT */
321 asm volatile ( 374 asm volatile (
322 "lghi 0,1 \n\t" 375 "lghi 0,1 \n\t"
323 "llgfr 1,%1 \n\t" 376 "llgfr 1,%1 \n\t"
@@ -326,21 +379,22 @@ do_siga_input(unsigned int irq, unsigned int mask)
326 "ipm %0 \n\t" 379 "ipm %0 \n\t"
327 "srl %0,28 \n\t" 380 "srl %0,28 \n\t"
328 : "=d" (cc) 381 : "=d" (cc)
329 : "d" (0x10000|irq), "d" (mask) 382 : "d" (schid), "d" (mask)
330 : "cc", "0", "1", "2", "memory" 383 : "cc", "0", "1", "2", "memory"
331 ); 384 );
332#endif /* CONFIG_ARCH_S390X */ 385#endif /* CONFIG_64BIT */
333 386
334 return cc; 387 return cc;
335} 388}
336 389
337static inline int 390static inline int
338do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb) 391do_siga_output(unsigned long schid, unsigned long mask, __u32 *bb,
392 unsigned int fc)
339{ 393{
340 int cc; 394 int cc;
341 __u32 busy_bit; 395 __u32 busy_bit;
342 396
343#ifndef CONFIG_ARCH_S390X 397#ifndef CONFIG_64BIT
344 asm volatile ( 398 asm volatile (
345 "lhi 0,0 \n\t" 399 "lhi 0,0 \n\t"
346 "lr 1,%2 \n\t" 400 "lr 1,%2 \n\t"
@@ -366,14 +420,14 @@ do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb)
366 ".long 0b,2b \n\t" 420 ".long 0b,2b \n\t"
367 ".previous \n\t" 421 ".previous \n\t"
368 : "=d" (cc), "=d" (busy_bit) 422 : "=d" (cc), "=d" (busy_bit)
369 : "d" (0x10000|irq), "d" (mask), 423 : "d" (schid), "d" (mask),
370 "i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION) 424 "i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION)
371 : "cc", "0", "1", "2", "memory" 425 : "cc", "0", "1", "2", "memory"
372 ); 426 );
373#else /* CONFIG_ARCH_S390X */ 427#else /* CONFIG_64BIT */
374 asm volatile ( 428 asm volatile (
375 "lghi 0,0 \n\t" 429 "llgfr 0,%5 \n\t"
376 "llgfr 1,%2 \n\t" 430 "lgr 1,%2 \n\t"
377 "llgfr 2,%3 \n\t" 431 "llgfr 2,%3 \n\t"
378 "siga 0 \n\t" 432 "siga 0 \n\t"
379 "0:" 433 "0:"
@@ -391,11 +445,11 @@ do_siga_output(unsigned long irq, unsigned long mask, __u32 *bb)
391 ".quad 0b,1b \n\t" 445 ".quad 0b,1b \n\t"
392 ".previous \n\t" 446 ".previous \n\t"
393 : "=d" (cc), "=d" (busy_bit) 447 : "=d" (cc), "=d" (busy_bit)
394 : "d" (0x10000|irq), "d" (mask), 448 : "d" (schid), "d" (mask),
395 "i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION) 449 "i" (QDIO_SIGA_ERROR_ACCESS_EXCEPTION), "d" (fc)
396 : "cc", "0", "1", "2", "memory" 450 : "cc", "0", "1", "2", "memory"
397 ); 451 );
398#endif /* CONFIG_ARCH_S390X */ 452#endif /* CONFIG_64BIT */
399 453
400 (*bb) = busy_bit; 454 (*bb) = busy_bit;
401 return cc; 455 return cc;
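
do_siga_output now takes the target as a bare unsigned long (wide enough for the QEBSM subchannel token, presumably) plus an explicit function code fc that is loaded into register 0 on 64-bit; the busy bit still returns through *bb. A hedged sketch of how a caller might fold cc and the busy bit into the (2|QDIO_SIGA_ERROR_B_BIT_SET) value that qdio_kick_outbound_q dispatches on; the actual caller is not part of this excerpt:

        /* Sketch: combine cc and busy bit into one result word.
         * QDIO_SIGA_ERROR_B_BIT_SET is assumed to flag the busy bit. */
        __u32 busy_bit = 0;
        int cc, result;

        cc = do_siga_output(schid, q->mask, &busy_bit, fc);
        result = cc;
        if (busy_bit)
                result |= QDIO_SIGA_ERROR_B_BIT_SET;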
@@ -407,21 +461,21 @@ do_clear_global_summary(void)
407 461
408 unsigned long time; 462 unsigned long time;
409 463
410#ifndef CONFIG_ARCH_S390X 464#ifndef CONFIG_64BIT
411 asm volatile ( 465 asm volatile (
412 "lhi 1,3 \n\t" 466 "lhi 1,3 \n\t"
413 ".insn rre,0xb2650000,2,0 \n\t" 467 ".insn rre,0xb2650000,2,0 \n\t"
414 "lr %0,3 \n\t" 468 "lr %0,3 \n\t"
415 : "=d" (time) : : "cc", "1", "2", "3" 469 : "=d" (time) : : "cc", "1", "2", "3"
416 ); 470 );
417#else /* CONFIG_ARCH_S390X */ 471#else /* CONFIG_64BIT */
418 asm volatile ( 472 asm volatile (
419 "lghi 1,3 \n\t" 473 "lghi 1,3 \n\t"
420 ".insn rre,0xb2650000,2,0 \n\t" 474 ".insn rre,0xb2650000,2,0 \n\t"
421 "lgr %0,3 \n\t" 475 "lgr %0,3 \n\t"
422 : "=d" (time) : : "cc", "1", "2", "3" 476 : "=d" (time) : : "cc", "1", "2", "3"
423 ); 477 );
424#endif /* CONFIG_ARCH_S390X */ 478#endif /* CONFIG_64BIT */
425 479
426 return time; 480 return time;
427} 481}
@@ -488,42 +542,21 @@ struct qdio_perf_stats {
488 542
489#define MY_MODULE_STRING(x) #x 543#define MY_MODULE_STRING(x) #x
490 544
491#ifdef CONFIG_ARCH_S390X 545#ifdef CONFIG_64BIT
492#define QDIO_GET_ADDR(x) ((__u32)(unsigned long)x) 546#define QDIO_GET_ADDR(x) ((__u32)(unsigned long)x)
493#else /* CONFIG_ARCH_S390X */ 547#else /* CONFIG_64BIT */
494#define QDIO_GET_ADDR(x) ((__u32)(long)x) 548#define QDIO_GET_ADDR(x) ((__u32)(long)x)
495#endif /* CONFIG_ARCH_S390X */ 549#endif /* CONFIG_64BIT */
496
497#ifdef CONFIG_QDIO_DEBUG
498#define set_slsb(x,y) \
499 if(q->queue_type==QDIO_TRACE_QTYPE) { \
500 if(q->is_input_q) { \
501 QDIO_DBF_HEX2(0,slsb_in,&q->slsb,QDIO_MAX_BUFFERS_PER_Q); \
502 } else { \
503 QDIO_DBF_HEX2(0,slsb_out,&q->slsb,QDIO_MAX_BUFFERS_PER_Q); \
504 } \
505 } \
506 qdio_set_slsb(x,y); \
507 if(q->queue_type==QDIO_TRACE_QTYPE) { \
508 if(q->is_input_q) { \
509 QDIO_DBF_HEX2(0,slsb_in,&q->slsb,QDIO_MAX_BUFFERS_PER_Q); \
510 } else { \
511 QDIO_DBF_HEX2(0,slsb_out,&q->slsb,QDIO_MAX_BUFFERS_PER_Q); \
512 } \
513 }
514#else /* CONFIG_QDIO_DEBUG */
515#define set_slsb(x,y) qdio_set_slsb(x,y)
516#endif /* CONFIG_QDIO_DEBUG */
517 550
518struct qdio_q { 551struct qdio_q {
519 volatile struct slsb slsb; 552 volatile struct slsb slsb;
520 553
521 char unused[QDIO_MAX_BUFFERS_PER_Q]; 554 char unused[QDIO_MAX_BUFFERS_PER_Q];
522 555
523 __u32 * volatile dev_st_chg_ind; 556 __u32 * dev_st_chg_ind;
524 557
525 int is_input_q; 558 int is_input_q;
526 int irq; 559 struct subchannel_id schid;
527 struct ccw_device *cdev; 560 struct ccw_device *cdev;
528 561
529 unsigned int is_iqdio_q; 562 unsigned int is_iqdio_q;
@@ -568,6 +601,7 @@ struct qdio_q {
568 struct tasklet_struct tasklet; 601 struct tasklet_struct tasklet;
569#endif /* QDIO_USE_TIMERS_FOR_POLLING */ 602#endif /* QDIO_USE_TIMERS_FOR_POLLING */
570 603
604
571 enum qdio_irq_states state; 605 enum qdio_irq_states state;
572 606
573 /* used to store the error condition during a data transfer */ 607 /* used to store the error condition during a data transfer */
@@ -617,13 +651,17 @@ struct qdio_irq {
617 __u32 * volatile dev_st_chg_ind; 651 __u32 * volatile dev_st_chg_ind;
618 652
619 unsigned long int_parm; 653 unsigned long int_parm;
620 int irq; 654 struct subchannel_id schid;
621 655
622 unsigned int is_iqdio_irq; 656 unsigned int is_iqdio_irq;
623 unsigned int is_thinint_irq; 657 unsigned int is_thinint_irq;
624 unsigned int hydra_gives_outbound_pcis; 658 unsigned int hydra_gives_outbound_pcis;
625 unsigned int sync_done_on_outb_pcis; 659 unsigned int sync_done_on_outb_pcis;
626 660
661 /* QEBSM facility */
662 unsigned int is_qebsm;
663 unsigned long sch_token;
664
627 enum qdio_irq_states state; 665 enum qdio_irq_states state;
628 666
629 unsigned int no_input_qs; 667 unsigned int no_input_qs;
diff --git a/drivers/s390/cio/schid.h b/drivers/s390/cio/schid.h
new file mode 100644
index 000000000000..54328fec5ade
--- /dev/null
+++ b/drivers/s390/cio/schid.h
@@ -0,0 +1,26 @@
1#ifndef S390_SCHID_H
2#define S390_SCHID_H
3
4struct subchannel_id {
5 __u32 reserved:13;
6 __u32 ssid:2;
7 __u32 one:1;
8 __u32 sch_no:16;
9} __attribute__ ((packed,aligned(4)));
10
11
12/* Helper function for sane state of pre-allocated subchannel_id. */
13static inline void
14init_subchannel_id(struct subchannel_id *schid)
15{
16 memset(schid, 0, sizeof(struct subchannel_id));
17 schid->one = 1;
18}
19
20static inline int
21schid_equal(struct subchannel_id *schid1, struct subchannel_id *schid2)
22{
23 return !memcmp(schid1, schid2, sizeof(struct subchannel_id));
24}
25
26#endif /* S390_SCHID_H */
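
The new schid.h gives qdio a structured subchannel address; the 0.%x.%x triples in the rewritten messages above are cssid.ssid.sch_no, with the channel-subsystem id printed as a constant 0. A small usage example with made-up field values:

        /* Example: format a subchannel_id the way the messages do. */
        struct subchannel_id schid;

        init_subchannel_id(&schid);     /* zero fields, set .one = 1 */
        schid.ssid   = 1;
        schid.sch_no = 0x4711;
        printk("subchannel 0.%x.%04x\n", schid.ssid, schid.sch_no);
                                        /* prints "subchannel 0.1.4711" */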
diff --git a/drivers/s390/crypto/z90common.h b/drivers/s390/crypto/z90common.h
index e319e78b5ea2..f87c785f2039 100644
--- a/drivers/s390/crypto/z90common.h
+++ b/drivers/s390/crypto/z90common.h
@@ -1,9 +1,9 @@
1/* 1/*
2 * linux/drivers/s390/crypto/z90common.h 2 * linux/drivers/s390/crypto/z90common.h
3 * 3 *
4 * z90crypt 1.3.2 4 * z90crypt 1.3.3
5 * 5 *
6 * Copyright (C) 2001, 2004 IBM Corporation 6 * Copyright (C) 2001, 2005 IBM Corporation
7 * Author(s): Robert Burroughs (burrough@us.ibm.com) 7 * Author(s): Robert Burroughs (burrough@us.ibm.com)
8 * Eric Rossman (edrossma@us.ibm.com) 8 * Eric Rossman (edrossma@us.ibm.com)
9 * 9 *
@@ -91,12 +91,13 @@ enum hdstat {
91#define TSQ_FATAL_ERROR 34 91#define TSQ_FATAL_ERROR 34
92#define RSQ_FATAL_ERROR 35 92#define RSQ_FATAL_ERROR 35
93 93
94#define Z90CRYPT_NUM_TYPES 5 94#define Z90CRYPT_NUM_TYPES 6
95#define PCICA 0 95#define PCICA 0
96#define PCICC 1 96#define PCICC 1
97#define PCIXCC_MCL2 2 97#define PCIXCC_MCL2 2
98#define PCIXCC_MCL3 3 98#define PCIXCC_MCL3 3
99#define CEX2C 4 99#define CEX2C 4
100#define CEX2A 5
100#define NILDEV -1 101#define NILDEV -1
101#define ANYDEV -1 102#define ANYDEV -1
102#define PCIXCC_UNK -2 103#define PCIXCC_UNK -2
@@ -105,7 +106,7 @@ enum hdevice_type {
105 PCICC_HW = 3, 106 PCICC_HW = 3,
106 PCICA_HW = 4, 107 PCICA_HW = 4,
107 PCIXCC_HW = 5, 108 PCIXCC_HW = 5,
108 OTHER_HW = 6, 109 CEX2A_HW = 6,
109 CEX2C_HW = 7 110 CEX2C_HW = 7
110}; 111};
111 112
diff --git a/drivers/s390/crypto/z90crypt.h b/drivers/s390/crypto/z90crypt.h
index 0a3bb5a10dd4..3a18443fdfa7 100644
--- a/drivers/s390/crypto/z90crypt.h
+++ b/drivers/s390/crypto/z90crypt.h
@@ -1,9 +1,9 @@
1/* 1/*
2 * linux/drivers/s390/crypto/z90crypt.h 2 * linux/drivers/s390/crypto/z90crypt.h
3 * 3 *
4 * z90crypt 1.3.2 4 * z90crypt 1.3.3
5 * 5 *
6 * Copyright (C) 2001, 2004 IBM Corporation 6 * Copyright (C) 2001, 2005 IBM Corporation
7 * Author(s): Robert Burroughs (burrough@us.ibm.com) 7 * Author(s): Robert Burroughs (burrough@us.ibm.com)
8 * Eric Rossman (edrossma@us.ibm.com) 8 * Eric Rossman (edrossma@us.ibm.com)
9 * 9 *
@@ -29,11 +29,11 @@
29 29
30#include <linux/ioctl.h> 30#include <linux/ioctl.h>
31 31
32#define VERSION_Z90CRYPT_H "$Revision: 1.11 $" 32#define VERSION_Z90CRYPT_H "$Revision: 1.2.2.4 $"
33 33
34#define z90crypt_VERSION 1 34#define z90crypt_VERSION 1
35#define z90crypt_RELEASE 3 // 2 = PCIXCC, 3 = rewrite for coding standards 35#define z90crypt_RELEASE 3 // 2 = PCIXCC, 3 = rewrite for coding standards
36#define z90crypt_VARIANT 2 // 2 = added PCIXCC MCL3 and CEX2C support 36#define z90crypt_VARIANT 3 // 3 = CEX2A support
37 37
38/** 38/**
39 * struct ica_rsa_modexpo 39 * struct ica_rsa_modexpo
@@ -122,6 +122,9 @@ struct ica_rsa_modexpo_crt {
122 * Z90STAT_CEX2CCOUNT 122 * Z90STAT_CEX2CCOUNT
123 * Return an integer count of all CEX2Cs. 123 * Return an integer count of all CEX2Cs.
124 * 124 *
125 * Z90STAT_CEX2ACOUNT
126 * Return an integer count of all CEX2As.
127 *
125 * Z90STAT_REQUESTQ_COUNT 128 * Z90STAT_REQUESTQ_COUNT
126 * Return an integer count of the number of entries waiting to be 129 * Return an integer count of the number of entries waiting to be
127 * sent to a device. 130 * sent to a device.
@@ -144,6 +147,7 @@ struct ica_rsa_modexpo_crt {
144 * 0x03: PCIXCC_MCL2 147 * 0x03: PCIXCC_MCL2
145 * 0x04: PCIXCC_MCL3 148 * 0x04: PCIXCC_MCL3
146 * 0x05: CEX2C 149 * 0x05: CEX2C
150 * 0x06: CEX2A
147 * 0x0d: device is disabled via the proc filesystem 151 * 0x0d: device is disabled via the proc filesystem
148 * 152 *
149 * Z90STAT_QDEPTH_MASK 153 * Z90STAT_QDEPTH_MASK
@@ -199,6 +203,7 @@ struct ica_rsa_modexpo_crt {
199#define Z90STAT_PCIXCCMCL2COUNT _IOR(Z90_IOCTL_MAGIC, 0x4b, int) 203#define Z90STAT_PCIXCCMCL2COUNT _IOR(Z90_IOCTL_MAGIC, 0x4b, int)
200#define Z90STAT_PCIXCCMCL3COUNT _IOR(Z90_IOCTL_MAGIC, 0x4c, int) 204#define Z90STAT_PCIXCCMCL3COUNT _IOR(Z90_IOCTL_MAGIC, 0x4c, int)
201#define Z90STAT_CEX2CCOUNT _IOR(Z90_IOCTL_MAGIC, 0x4d, int) 205#define Z90STAT_CEX2CCOUNT _IOR(Z90_IOCTL_MAGIC, 0x4d, int)
206#define Z90STAT_CEX2ACOUNT _IOR(Z90_IOCTL_MAGIC, 0x4e, int)
202#define Z90STAT_REQUESTQ_COUNT _IOR(Z90_IOCTL_MAGIC, 0x44, int) 207#define Z90STAT_REQUESTQ_COUNT _IOR(Z90_IOCTL_MAGIC, 0x44, int)
203#define Z90STAT_PENDINGQ_COUNT _IOR(Z90_IOCTL_MAGIC, 0x45, int) 208#define Z90STAT_PENDINGQ_COUNT _IOR(Z90_IOCTL_MAGIC, 0x45, int)
204#define Z90STAT_TOTALOPEN_COUNT _IOR(Z90_IOCTL_MAGIC, 0x46, int) 209#define Z90STAT_TOTALOPEN_COUNT _IOR(Z90_IOCTL_MAGIC, 0x46, int)
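
A minimal userspace sketch (not part of this patch) of how the new Z90STAT_CEX2ACOUNT ioctl would be consumed; the /dev/z90crypt node name and the error handling are assumptions, inferred from the misc-device registration in z90main.c:

        #include <stdio.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include "z90crypt.h"   /* provides Z90STAT_CEX2ACOUNT */

        int main(void)
        {
                int fd, count;

                fd = open("/dev/z90crypt", O_RDWR);
                if (fd < 0) {
                        perror("open /dev/z90crypt");
                        return 1;
                }
                /* _IOR ioctl: the driver copy_to_user()s an int back to us */
                if (ioctl(fd, Z90STAT_CEX2ACOUNT, &count) < 0) {
                        perror("Z90STAT_CEX2ACOUNT");
                        close(fd);
                        return 1;
                }
                printf("CEX2A devices online: %d\n", count);
                close(fd);
                return 0;
        }
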
diff --git a/drivers/s390/crypto/z90hardware.c b/drivers/s390/crypto/z90hardware.c
index c215e0889736..d7f7494a0cbe 100644
--- a/drivers/s390/crypto/z90hardware.c
+++ b/drivers/s390/crypto/z90hardware.c
@@ -1,9 +1,9 @@
1/* 1/*
2 * linux/drivers/s390/crypto/z90hardware.c 2 * linux/drivers/s390/crypto/z90hardware.c
3 * 3 *
4 * z90crypt 1.3.2 4 * z90crypt 1.3.3
5 * 5 *
6 * Copyright (C) 2001, 2004 IBM Corporation 6 * Copyright (C) 2001, 2005 IBM Corporation
7 * Author(s): Robert Burroughs (burrough@us.ibm.com) 7 * Author(s): Robert Burroughs (burrough@us.ibm.com)
8 * Eric Rossman (edrossma@us.ibm.com) 8 * Eric Rossman (edrossma@us.ibm.com)
9 * 9 *
@@ -648,6 +648,87 @@ static struct cca_public_sec static_cca_pub_sec = {
648#define RESPONSE_CPRB_SIZE 0x000006B8 648#define RESPONSE_CPRB_SIZE 0x000006B8
649#define RESPONSE_CPRBX_SIZE 0x00000724 649#define RESPONSE_CPRBX_SIZE 0x00000724
650 650
651struct type50_hdr {
652 u8 reserved1;
653 u8 msg_type_code;
654 u16 msg_len;
655 u8 reserved2;
656 u8 ignored;
657 u16 reserved3;
658};
659
660#define TYPE50_TYPE_CODE 0x50
661
662#define TYPE50_MEB1_LEN (sizeof(struct type50_meb1_msg))
663#define TYPE50_MEB2_LEN (sizeof(struct type50_meb2_msg))
664#define TYPE50_CRB1_LEN (sizeof(struct type50_crb1_msg))
665#define TYPE50_CRB2_LEN (sizeof(struct type50_crb2_msg))
666
667#define TYPE50_MEB1_FMT 0x0001
668#define TYPE50_MEB2_FMT 0x0002
669#define TYPE50_CRB1_FMT 0x0011
670#define TYPE50_CRB2_FMT 0x0012
671
672struct type50_meb1_msg {
673 struct type50_hdr header;
674 u16 keyblock_type;
675 u8 reserved[6];
676 u8 exponent[128];
677 u8 modulus[128];
678 u8 message[128];
679};
680
681struct type50_meb2_msg {
682 struct type50_hdr header;
683 u16 keyblock_type;
684 u8 reserved[6];
685 u8 exponent[256];
686 u8 modulus[256];
687 u8 message[256];
688};
689
690struct type50_crb1_msg {
691 struct type50_hdr header;
692 u16 keyblock_type;
693 u8 reserved[6];
694 u8 p[64];
695 u8 q[64];
696 u8 dp[64];
697 u8 dq[64];
698 u8 u[64];
699 u8 message[128];
700};
701
702struct type50_crb2_msg {
703 struct type50_hdr header;
704 u16 keyblock_type;
705 u8 reserved[6];
706 u8 p[128];
707 u8 q[128];
708 u8 dp[128];
709 u8 dq[128];
710 u8 u[128];
711 u8 message[256];
712};
713
714union type50_msg {
715 struct type50_meb1_msg meb1;
716 struct type50_meb2_msg meb2;
717 struct type50_crb1_msg crb1;
718 struct type50_crb2_msg crb2;
719};
720
721struct type80_hdr {
722 u8 reserved1;
723 u8 type;
724 u16 len;
725 u8 code;
726 u8 reserved2[3];
727 u8 reserved3[8];
728};
729
730#define TYPE80_RSP_CODE 0x80
731
651struct error_hdr { 732struct error_hdr {
652 unsigned char reserved1; 733 unsigned char reserved1;
653 unsigned char type; 734 unsigned char type;
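
The TYPE50 request sizes implied by the new layouts work out to 400 (MEB1), 784 (MEB2), 464 (CRB1) and 912 (CRB2) bytes: an 8-byte header, a 2-byte keyblock_type and 6 reserved bytes, followed by the payload fields, with every member a u8 or a naturally aligned u16, so no padding is inserted. A sketch (not in the patch, assuming BUILD_BUG_ON() is available) of a compile-time check one could drop into a function such as the module init:

        /* Sanity-check the derived wire sizes: 16-byte prefix + payload. */
        BUILD_BUG_ON(sizeof(struct type50_meb1_msg) != 16 + 3 * 128);      /* 400 */
        BUILD_BUG_ON(sizeof(struct type50_meb2_msg) != 16 + 3 * 256);      /* 784 */
        BUILD_BUG_ON(sizeof(struct type50_crb1_msg) != 16 + 5 * 64 + 128); /* 464 */
        BUILD_BUG_ON(sizeof(struct type50_crb2_msg) != 16 + 5 * 128 + 256);/* 912 */
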
@@ -657,6 +738,7 @@ struct error_hdr {
657}; 738};
658 739
659#define TYPE82_RSP_CODE 0x82 740#define TYPE82_RSP_CODE 0x82
741#define TYPE88_RSP_CODE 0x88
660 742
661#define REP82_ERROR_MACHINE_FAILURE 0x10 743#define REP82_ERROR_MACHINE_FAILURE 0x10
662#define REP82_ERROR_PREEMPT_FAILURE 0x12 744#define REP82_ERROR_PREEMPT_FAILURE 0x12
@@ -679,6 +761,22 @@ struct error_hdr {
679#define REP82_ERROR_PACKET_TRUNCATED 0xA0 761#define REP82_ERROR_PACKET_TRUNCATED 0xA0
680#define REP82_ERROR_ZERO_BUFFER_LEN 0xB0 762#define REP82_ERROR_ZERO_BUFFER_LEN 0xB0
681 763
764#define REP88_ERROR_MODULE_FAILURE 0x10
765#define REP88_ERROR_MODULE_TIMEOUT 0x11
766#define REP88_ERROR_MODULE_NOTINIT 0x13
767#define REP88_ERROR_MODULE_NOTAVAIL 0x14
768#define REP88_ERROR_MODULE_DISABLED 0x15
769#define REP88_ERROR_MODULE_IN_DIAGN 0x17
770#define REP88_ERROR_FASTPATH_DISABLD 0x19
771#define REP88_ERROR_MESSAGE_TYPE 0x20
772#define REP88_ERROR_MESSAGE_MALFORMD 0x22
773#define REP88_ERROR_MESSAGE_LENGTH 0x23
774#define REP88_ERROR_RESERVED_FIELD 0x24
775#define REP88_ERROR_KEY_TYPE 0x34
776#define REP88_ERROR_INVALID_KEY 0x82
777#define REP88_ERROR_OPERAND 0x84
778#define REP88_ERROR_OPERAND_EVEN_MOD 0x85
779
682#define CALLER_HEADER 12 780#define CALLER_HEADER 12
683 781
684static inline int 782static inline int
@@ -687,7 +785,7 @@ testq(int q_nr, int *q_depth, int *dev_type, struct ap_status_word *stat)
687 int ccode; 785 int ccode;
688 786
689 asm volatile 787 asm volatile
690#ifdef __s390x__ 788#ifdef CONFIG_64BIT
691 (" llgfr 0,%4 \n" 789 (" llgfr 0,%4 \n"
692 " slgr 1,1 \n" 790 " slgr 1,1 \n"
693 " lgr 2,1 \n" 791 " lgr 2,1 \n"
@@ -757,7 +855,7 @@ resetq(int q_nr, struct ap_status_word *stat_p)
757 int ccode; 855 int ccode;
758 856
759 asm volatile 857 asm volatile
760#ifdef __s390x__ 858#ifdef CONFIG_64BIT
761 (" llgfr 0,%2 \n" 859 (" llgfr 0,%2 \n"
762 " lghi 1,1 \n" 860 " lghi 1,1 \n"
763 " sll 1,24 \n" 861 " sll 1,24 \n"
@@ -823,7 +921,7 @@ sen(int msg_len, unsigned char *msg_ext, struct ap_status_word *stat)
823 int ccode; 921 int ccode;
824 922
825 asm volatile 923 asm volatile
826#ifdef __s390x__ 924#ifdef CONFIG_64BIT
827 (" lgr 6,%3 \n" 925 (" lgr 6,%3 \n"
828 " llgfr 7,%2 \n" 926 " llgfr 7,%2 \n"
829 " llgt 0,0(6) \n" 927 " llgt 0,0(6) \n"
@@ -902,7 +1000,7 @@ rec(int q_nr, int buff_l, unsigned char *rsp, unsigned char *id,
902 int ccode; 1000 int ccode;
903 1001
904 asm volatile 1002 asm volatile
905#ifdef __s390x__ 1003#ifdef CONFIG_64BIT
906 (" llgfr 0,%2 \n" 1004 (" llgfr 0,%2 \n"
907 " lgr 3,%4 \n" 1005 " lgr 3,%4 \n"
908 " lgr 6,%3 \n" 1006 " lgr 6,%3 \n"
@@ -1029,10 +1127,6 @@ query_online(int deviceNr, int cdx, int resetNr, int *q_depth, int *dev_type)
1029 stat = HD_ONLINE; 1127 stat = HD_ONLINE;
1030 *q_depth = t_depth + 1; 1128 *q_depth = t_depth + 1;
1031 switch (t_dev_type) { 1129 switch (t_dev_type) {
1032 case OTHER_HW:
1033 stat = HD_NOT_THERE;
1034 *dev_type = NILDEV;
1035 break;
1036 case PCICA_HW: 1130 case PCICA_HW:
1037 *dev_type = PCICA; 1131 *dev_type = PCICA;
1038 break; 1132 break;
@@ -1045,6 +1139,9 @@ query_online(int deviceNr, int cdx, int resetNr, int *q_depth, int *dev_type)
1045 case CEX2C_HW: 1139 case CEX2C_HW:
1046 *dev_type = CEX2C; 1140 *dev_type = CEX2C;
1047 break; 1141 break;
1142 case CEX2A_HW:
1143 *dev_type = CEX2A;
1144 break;
1048 default: 1145 default:
1049 *dev_type = NILDEV; 1146 *dev_type = NILDEV;
1050 break; 1147 break;
@@ -2029,6 +2126,177 @@ ICACRT_msg_to_type6CRT_msgX(struct ica_rsa_modexpo_crt *icaMsg_p, int cdx,
2029 return 0; 2126 return 0;
2030} 2127}
2031 2128
2129static int
2130ICAMEX_msg_to_type50MEX_msg(struct ica_rsa_modexpo *icaMex_p, int *z90cMsg_l_p,
2131 union type50_msg *z90cMsg_p)
2132{
2133 int mod_len, msg_size, mod_tgt_len, exp_tgt_len, inp_tgt_len;
2134 unsigned char *mod_tgt, *exp_tgt, *inp_tgt;
2135 union type50_msg *tmp_type50_msg;
2136
2137 mod_len = icaMex_p->inputdatalength;
2138
2139 msg_size = ((mod_len <= 128) ? TYPE50_MEB1_LEN : TYPE50_MEB2_LEN) +
2140 CALLER_HEADER;
2141
2142 memset(z90cMsg_p, 0, msg_size);
2143
2144 tmp_type50_msg = (union type50_msg *)
2145 ((unsigned char *) z90cMsg_p + CALLER_HEADER);
2146
2147 tmp_type50_msg->meb1.header.msg_type_code = TYPE50_TYPE_CODE;
2148
2149 if (mod_len <= 128) {
2150 tmp_type50_msg->meb1.header.msg_len = TYPE50_MEB1_LEN;
2151 tmp_type50_msg->meb1.keyblock_type = TYPE50_MEB1_FMT;
2152 mod_tgt = tmp_type50_msg->meb1.modulus;
2153 mod_tgt_len = sizeof(tmp_type50_msg->meb1.modulus);
2154 exp_tgt = tmp_type50_msg->meb1.exponent;
2155 exp_tgt_len = sizeof(tmp_type50_msg->meb1.exponent);
2156 inp_tgt = tmp_type50_msg->meb1.message;
2157 inp_tgt_len = sizeof(tmp_type50_msg->meb1.message);
2158 } else {
2159 tmp_type50_msg->meb2.header.msg_len = TYPE50_MEB2_LEN;
2160 tmp_type50_msg->meb2.keyblock_type = TYPE50_MEB2_FMT;
2161 mod_tgt = tmp_type50_msg->meb2.modulus;
2162 mod_tgt_len = sizeof(tmp_type50_msg->meb2.modulus);
2163 exp_tgt = tmp_type50_msg->meb2.exponent;
2164 exp_tgt_len = sizeof(tmp_type50_msg->meb2.exponent);
2165 inp_tgt = tmp_type50_msg->meb2.message;
2166 inp_tgt_len = sizeof(tmp_type50_msg->meb2.message);
2167 }
2168
2169 mod_tgt += (mod_tgt_len - mod_len);
2170 if (copy_from_user(mod_tgt, icaMex_p->n_modulus, mod_len))
2171 return SEN_RELEASED;
2172 if (is_empty(mod_tgt, mod_len))
2173 return SEN_USER_ERROR;
2174 exp_tgt += (exp_tgt_len - mod_len);
2175 if (copy_from_user(exp_tgt, icaMex_p->b_key, mod_len))
2176 return SEN_RELEASED;
2177 if (is_empty(exp_tgt, mod_len))
2178 return SEN_USER_ERROR;
2179 inp_tgt += (inp_tgt_len - mod_len);
2180 if (copy_from_user(inp_tgt, icaMex_p->inputdata, mod_len))
2181 return SEN_RELEASED;
2182 if (is_empty(inp_tgt, mod_len))
2183 return SEN_USER_ERROR;
2184
2185 *z90cMsg_l_p = msg_size - CALLER_HEADER;
2186
2187 return 0;
2188}
2189
2190static int
2191ICACRT_msg_to_type50CRT_msg(struct ica_rsa_modexpo_crt *icaMsg_p,
2192 int *z90cMsg_l_p, union type50_msg *z90cMsg_p)
2193{
2194 int mod_len, short_len, long_len, tmp_size, p_tgt_len, q_tgt_len,
2195 dp_tgt_len, dq_tgt_len, u_tgt_len, inp_tgt_len, long_offset;
2196 unsigned char *p_tgt, *q_tgt, *dp_tgt, *dq_tgt, *u_tgt, *inp_tgt,
2197 temp[8];
2198 union type50_msg *tmp_type50_msg;
2199
2200 mod_len = icaMsg_p->inputdatalength;
2201 short_len = mod_len / 2;
2202 long_len = mod_len / 2 + 8;
2203 long_offset = 0;
2204
2205 if (long_len > 128) {
2206 memset(temp, 0x00, sizeof(temp));
2207 if (copy_from_user(temp, icaMsg_p->np_prime, long_len-128))
2208 return SEN_RELEASED;
2209 if (!is_empty(temp, 8))
2210 return SEN_NOT_AVAIL;
2211 if (copy_from_user(temp, icaMsg_p->bp_key, long_len-128))
2212 return SEN_RELEASED;
2213 if (!is_empty(temp, 8))
2214 return SEN_NOT_AVAIL;
2215 if (copy_from_user(temp, icaMsg_p->u_mult_inv, long_len-128))
2216 return SEN_RELEASED;
2217 if (!is_empty(temp, 8))
2218 return SEN_NOT_AVAIL;
2219 long_offset = long_len - 128;
2220 long_len = 128;
2221 }
2222
2223 tmp_size = ((mod_len <= 128) ? TYPE50_CRB1_LEN : TYPE50_CRB2_LEN) +
2224 CALLER_HEADER;
2225
2226 memset(z90cMsg_p, 0, tmp_size);
2227
2228 tmp_type50_msg = (union type50_msg *)
2229 ((unsigned char *) z90cMsg_p + CALLER_HEADER);
2230
2231 tmp_type50_msg->crb1.header.msg_type_code = TYPE50_TYPE_CODE;
2232 if (long_len <= 64) {
2233 tmp_type50_msg->crb1.header.msg_len = TYPE50_CRB1_LEN;
2234 tmp_type50_msg->crb1.keyblock_type = TYPE50_CRB1_FMT;
2235 p_tgt = tmp_type50_msg->crb1.p;
2236 p_tgt_len = sizeof(tmp_type50_msg->crb1.p);
2237 q_tgt = tmp_type50_msg->crb1.q;
2238 q_tgt_len = sizeof(tmp_type50_msg->crb1.q);
2239 dp_tgt = tmp_type50_msg->crb1.dp;
2240 dp_tgt_len = sizeof(tmp_type50_msg->crb1.dp);
2241 dq_tgt = tmp_type50_msg->crb1.dq;
2242 dq_tgt_len = sizeof(tmp_type50_msg->crb1.dq);
2243 u_tgt = tmp_type50_msg->crb1.u;
2244 u_tgt_len = sizeof(tmp_type50_msg->crb1.u);
2245 inp_tgt = tmp_type50_msg->crb1.message;
2246 inp_tgt_len = sizeof(tmp_type50_msg->crb1.message);
2247 } else {
2248 tmp_type50_msg->crb2.header.msg_len = TYPE50_CRB2_LEN;
2249 tmp_type50_msg->crb2.keyblock_type = TYPE50_CRB2_FMT;
2250 p_tgt = tmp_type50_msg->crb2.p;
2251 p_tgt_len = sizeof(tmp_type50_msg->crb2.p);
2252 q_tgt = tmp_type50_msg->crb2.q;
2253 q_tgt_len = sizeof(tmp_type50_msg->crb2.q);
2254 dp_tgt = tmp_type50_msg->crb2.dp;
2255 dp_tgt_len = sizeof(tmp_type50_msg->crb2.dp);
2256 dq_tgt = tmp_type50_msg->crb2.dq;
2257 dq_tgt_len = sizeof(tmp_type50_msg->crb2.dq);
2258 u_tgt = tmp_type50_msg->crb2.u;
2259 u_tgt_len = sizeof(tmp_type50_msg->crb2.u);
2260 inp_tgt = tmp_type50_msg->crb2.message;
2261 inp_tgt_len = sizeof(tmp_type50_msg->crb2.message);
2262 }
2263
2264 p_tgt += (p_tgt_len - long_len);
2265 if (copy_from_user(p_tgt, icaMsg_p->np_prime + long_offset, long_len))
2266 return SEN_RELEASED;
2267 if (is_empty(p_tgt, long_len))
2268 return SEN_USER_ERROR;
2269 q_tgt += (q_tgt_len - short_len);
2270 if (copy_from_user(q_tgt, icaMsg_p->nq_prime, short_len))
2271 return SEN_RELEASED;
2272 if (is_empty(q_tgt, short_len))
2273 return SEN_USER_ERROR;
2274 dp_tgt += (dp_tgt_len - long_len);
2275 if (copy_from_user(dp_tgt, icaMsg_p->bp_key + long_offset, long_len))
2276 return SEN_RELEASED;
2277 if (is_empty(dp_tgt, long_len))
2278 return SEN_USER_ERROR;
2279 dq_tgt += (dq_tgt_len - short_len);
2280 if (copy_from_user(dq_tgt, icaMsg_p->bq_key, short_len))
2281 return SEN_RELEASED;
2282 if (is_empty(dq_tgt, short_len))
2283 return SEN_USER_ERROR;
2284 u_tgt += (u_tgt_len - long_len);
2285 if (copy_from_user(u_tgt, icaMsg_p->u_mult_inv + long_offset, long_len))
2286 return SEN_RELEASED;
2287 if (is_empty(u_tgt, long_len))
2288 return SEN_USER_ERROR;
2289 inp_tgt += (inp_tgt_len - mod_len);
2290 if (copy_from_user(inp_tgt, icaMsg_p->inputdata, mod_len))
2291 return SEN_RELEASED;
2292 if (is_empty(inp_tgt, mod_len))
2293 return SEN_USER_ERROR;
2294
2295 *z90cMsg_l_p = tmp_size - CALLER_HEADER;
2296
2297 return 0;
2298}
2299
2032int 2300int
2033convert_request(unsigned char *buffer, int func, unsigned short function, 2301convert_request(unsigned char *buffer, int func, unsigned short function,
2034 int cdx, int dev_type, int *msg_l_p, unsigned char *msg_p) 2302 int cdx, int dev_type, int *msg_l_p, unsigned char *msg_p)
@@ -2071,6 +2339,16 @@ convert_request(unsigned char *buffer, int func, unsigned short function,
2071 cdx, msg_l_p, (struct type6_msg *) msg_p, 2339 cdx, msg_l_p, (struct type6_msg *) msg_p,
2072 dev_type); 2340 dev_type);
2073 } 2341 }
2342 if (dev_type == CEX2A) {
2343 if (func == ICARSACRT)
2344 return ICACRT_msg_to_type50CRT_msg(
2345 (struct ica_rsa_modexpo_crt *) buffer,
2346 msg_l_p, (union type50_msg *) msg_p);
2347 else
2348 return ICAMEX_msg_to_type50MEX_msg(
2349 (struct ica_rsa_modexpo *) buffer,
2350 msg_l_p, (union type50_msg *) msg_p);
2351 }
2074 2352
2075 return 0; 2353 return 0;
2076} 2354}
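
Both type50 builders above share one copy pattern: each big-endian operand is placed flush-right in its fixed-width field, so the value ends up left-padded with zeros to the field size (the whole message was already memset to zero). A standalone model of that step; place_operand() is a hypothetical name, and the kernel code open-codes this with copy_from_user():

        #include <string.h>

        /* Right-justify a value_len-byte big-endian operand inside a
         * field_len-byte field, zero-filling the leading bytes. */
        static void place_operand(unsigned char *field, size_t field_len,
                                  const unsigned char *value, size_t value_len)
        {
                memset(field, 0, field_len);                    /* leading zero pad */
                memcpy(field + (field_len - value_len),         /* value at the tail */
                       value, value_len);
        }
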
@@ -2081,8 +2359,8 @@ unset_ext_bitlens(void)
2081{ 2359{
2082 if (!ext_bitlens_msg_count) { 2360 if (!ext_bitlens_msg_count) {
2083 PRINTK("Unable to use coprocessors for extended bitlengths. " 2361 PRINTK("Unable to use coprocessors for extended bitlengths. "
2084 "Using PCICAs (if present) for extended bitlengths. " 2362 "Using PCICAs/CEX2As (if present) for extended "
2085 "This is not an error.\n"); 2363 "bitlengths. This is not an error.\n");
2086 ext_bitlens_msg_count++; 2364 ext_bitlens_msg_count++;
2087 } 2365 }
2088 ext_bitlens = 0; 2366 ext_bitlens = 0;
@@ -2094,6 +2372,7 @@ convert_response(unsigned char *response, unsigned char *buffer,
2094{ 2372{
2095 struct ica_rsa_modexpo *icaMsg_p = (struct ica_rsa_modexpo *) buffer; 2373 struct ica_rsa_modexpo *icaMsg_p = (struct ica_rsa_modexpo *) buffer;
2096 struct error_hdr *errh_p = (struct error_hdr *) response; 2374 struct error_hdr *errh_p = (struct error_hdr *) response;
2375 struct type80_hdr *t80h_p = (struct type80_hdr *) response;
2097 struct type84_hdr *t84h_p = (struct type84_hdr *) response; 2376 struct type84_hdr *t84h_p = (struct type84_hdr *) response;
2098 struct type86_fmt2_msg *t86m_p = (struct type86_fmt2_msg *) response; 2377 struct type86_fmt2_msg *t86m_p = (struct type86_fmt2_msg *) response;
2099 int reply_code, service_rc, service_rs, src_l; 2378 int reply_code, service_rc, service_rs, src_l;
@@ -2108,6 +2387,7 @@ convert_response(unsigned char *response, unsigned char *buffer,
2108 src_l = 0; 2387 src_l = 0;
2109 switch (errh_p->type) { 2388 switch (errh_p->type) {
2110 case TYPE82_RSP_CODE: 2389 case TYPE82_RSP_CODE:
2390 case TYPE88_RSP_CODE:
2111 reply_code = errh_p->reply_code; 2391 reply_code = errh_p->reply_code;
2112 src_p = (unsigned char *)errh_p; 2392 src_p = (unsigned char *)errh_p;
2113 PRINTK("Hardware error: Type %02X Message Header: " 2393 PRINTK("Hardware error: Type %02X Message Header: "
@@ -2116,6 +2396,10 @@ convert_response(unsigned char *response, unsigned char *buffer,
2116 src_p[0], src_p[1], src_p[2], src_p[3], 2396 src_p[0], src_p[1], src_p[2], src_p[3],
2117 src_p[4], src_p[5], src_p[6], src_p[7]); 2397 src_p[4], src_p[5], src_p[6], src_p[7]);
2118 break; 2398 break;
2399 case TYPE80_RSP_CODE:
2400 src_l = icaMsg_p->outputdatalength;
2401 src_p = response + (int)t80h_p->len - src_l;
2402 break;
2119 case TYPE84_RSP_CODE: 2403 case TYPE84_RSP_CODE:
2120 src_l = icaMsg_p->outputdatalength; 2404 src_l = icaMsg_p->outputdatalength;
2121 src_p = response + (int)t84h_p->len - src_l; 2405 src_p = response + (int)t84h_p->len - src_l;
@@ -2202,6 +2486,7 @@ convert_response(unsigned char *response, unsigned char *buffer,
2202 if (reply_code) 2486 if (reply_code)
2203 switch (reply_code) { 2487 switch (reply_code) {
2204 case REP82_ERROR_OPERAND_INVALID: 2488 case REP82_ERROR_OPERAND_INVALID:
2489 case REP88_ERROR_MESSAGE_MALFORMD:
2205 return REC_OPERAND_INV; 2490 return REC_OPERAND_INV;
2206 case REP82_ERROR_OPERAND_SIZE: 2491 case REP82_ERROR_OPERAND_SIZE:
2207 return REC_OPERAND_SIZE; 2492 return REC_OPERAND_SIZE;
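
The new TYPE80 branch in convert_response() mirrors the TYPE84 one: the reply payload sits at the tail of the response buffer, so the source pointer is the response start plus the total length from the header minus the caller's outputdatalength. A tiny model of that extraction, with hypothetical names:

        #include <string.h>

        /* The result occupies the final out_len bytes of a resp_len-byte
         * reply, as with TYPE84; everything before it is header/padding. */
        static void extract_tail(unsigned char *out, size_t out_len,
                                 const unsigned char *resp, size_t resp_len)
        {
                memcpy(out, resp + resp_len - out_len, out_len);
        }
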
diff --git a/drivers/s390/crypto/z90main.c b/drivers/s390/crypto/z90main.c
index 790fcbb74b43..135ae04e6e75 100644
--- a/drivers/s390/crypto/z90main.c
+++ b/drivers/s390/crypto/z90main.c
@@ -228,7 +228,7 @@ struct device_x {
228 */ 228 */
229struct device { 229struct device {
230 int dev_type; // PCICA, PCICC, PCIXCC_MCL2, 230 int dev_type; // PCICA, PCICC, PCIXCC_MCL2,
231 // PCIXCC_MCL3, CEX2C 231 // PCIXCC_MCL3, CEX2C, CEX2A
232 enum devstat dev_stat; // current device status 232 enum devstat dev_stat; // current device status
233 int dev_self_x; // Index in array 233 int dev_self_x; // Index in array
234 int disabled; // Set when device is in error 234 int disabled; // Set when device is in error
@@ -295,26 +295,30 @@ struct caller {
295/** 295/**
296 * Function prototypes from z90hardware.c 296 * Function prototypes from z90hardware.c
297 */ 297 */
298enum hdstat query_online(int, int, int, int *, int *); 298enum hdstat query_online(int deviceNr, int cdx, int resetNr, int *q_depth,
299enum devstat reset_device(int, int, int); 299 int *dev_type);
300enum devstat send_to_AP(int, int, int, unsigned char *); 300enum devstat reset_device(int deviceNr, int cdx, int resetNr);
301enum devstat receive_from_AP(int, int, int, unsigned char *, unsigned char *); 301enum devstat send_to_AP(int dev_nr, int cdx, int msg_len, unsigned char *msg_ext);
302int convert_request(unsigned char *, int, short, int, int, int *, 302enum devstat receive_from_AP(int dev_nr, int cdx, int resplen,
303 unsigned char *); 303 unsigned char *resp, unsigned char *psmid);
304int convert_response(unsigned char *, unsigned char *, int *, unsigned char *); 304int convert_request(unsigned char *buffer, int func, unsigned short function,
305 int cdx, int dev_type, int *msg_l_p, unsigned char *msg_p);
306int convert_response(unsigned char *response, unsigned char *buffer,
307 int *respbufflen_p, unsigned char *resp_buff);
305 308
306/** 309/**
307 * Low level function prototypes 310 * Low level function prototypes
308 */ 311 */
309static int create_z90crypt(int *); 312static int create_z90crypt(int *cdx_p);
310static int refresh_z90crypt(int *); 313static int refresh_z90crypt(int *cdx_p);
311static int find_crypto_devices(struct status *); 314static int find_crypto_devices(struct status *deviceMask);
312static int create_crypto_device(int); 315static int create_crypto_device(int index);
313static int destroy_crypto_device(int); 316static int destroy_crypto_device(int index);
314static void destroy_z90crypt(void); 317static void destroy_z90crypt(void);
315static int refresh_index_array(struct status *, struct device_x *); 318static int refresh_index_array(struct status *status_str,
316static int probe_device_type(struct device *); 319 struct device_x *index_array);
317static int probe_PCIXCC_type(struct device *); 320static int probe_device_type(struct device *devPtr);
321static int probe_PCIXCC_type(struct device *devPtr);
318 322
319/** 323/**
320 * proc fs definitions 324 * proc fs definitions
@@ -425,7 +429,7 @@ static struct miscdevice z90crypt_misc_device = {
425MODULE_AUTHOR("zSeries Linux Crypto Team: Robert H. Burroughs, Eric D. Rossman" 429MODULE_AUTHOR("zSeries Linux Crypto Team: Robert H. Burroughs, Eric D. Rossman"
426 "and Jochen Roehrig"); 430 "and Jochen Roehrig");
427MODULE_DESCRIPTION("zSeries Linux Cryptographic Coprocessor device driver, " 431MODULE_DESCRIPTION("zSeries Linux Cryptographic Coprocessor device driver, "
428 "Copyright 2001, 2004 IBM Corporation"); 432 "Copyright 2001, 2005 IBM Corporation");
429MODULE_LICENSE("GPL"); 433MODULE_LICENSE("GPL");
430module_param(domain, int, 0); 434module_param(domain, int, 0);
431MODULE_PARM_DESC(domain, "domain index for device"); 435MODULE_PARM_DESC(domain, "domain index for device");
@@ -860,6 +864,12 @@ get_status_CEX2Ccount(void)
860} 864}
861 865
862static inline int 866static inline int
867get_status_CEX2Acount(void)
868{
869 return z90crypt.hdware_info->type_mask[CEX2A].st_count;
870}
871
872static inline int
863get_status_requestq_count(void) 873get_status_requestq_count(void)
864{ 874{
865 return requestq_count; 875 return requestq_count;
@@ -1008,11 +1018,13 @@ static inline int
1008select_device_type(int *dev_type_p, int bytelength) 1018select_device_type(int *dev_type_p, int bytelength)
1009{ 1019{
1010 static int count = 0; 1020 static int count = 0;
1011 int PCICA_avail, PCIXCC_MCL3_avail, CEX2C_avail, index_to_use; 1021 int PCICA_avail, PCIXCC_MCL3_avail, CEX2C_avail, CEX2A_avail,
1022 index_to_use;
1012 struct status *stat; 1023 struct status *stat;
1013 if ((*dev_type_p != PCICC) && (*dev_type_p != PCICA) && 1024 if ((*dev_type_p != PCICC) && (*dev_type_p != PCICA) &&
1014 (*dev_type_p != PCIXCC_MCL2) && (*dev_type_p != PCIXCC_MCL3) && 1025 (*dev_type_p != PCIXCC_MCL2) && (*dev_type_p != PCIXCC_MCL3) &&
1015 (*dev_type_p != CEX2C) && (*dev_type_p != ANYDEV)) 1026 (*dev_type_p != CEX2C) && (*dev_type_p != CEX2A) &&
1027 (*dev_type_p != ANYDEV))
1016 return -1; 1028 return -1;
1017 if (*dev_type_p != ANYDEV) { 1029 if (*dev_type_p != ANYDEV) {
1018 stat = &z90crypt.hdware_info->type_mask[*dev_type_p]; 1030 stat = &z90crypt.hdware_info->type_mask[*dev_type_p];
@@ -1022,7 +1034,13 @@ select_device_type(int *dev_type_p, int bytelength)
1022 return -1; 1034 return -1;
1023 } 1035 }
1024 1036
1025 /* Assumption: PCICA, PCIXCC_MCL3, and CEX2C are all similar in speed */ 1037 /**
1038 * Assumption: PCICA, PCIXCC_MCL3, CEX2C, and CEX2A are all similar in
1039 * speed.
1040 *
1041 * PCICA and CEX2A do NOT co-exist, so it would be either one or the
1042 * other present.
1043 */
1026 stat = &z90crypt.hdware_info->type_mask[PCICA]; 1044 stat = &z90crypt.hdware_info->type_mask[PCICA];
1027 PCICA_avail = stat->st_count - 1045 PCICA_avail = stat->st_count -
1028 (stat->disabled_count + stat->user_disabled_count); 1046 (stat->disabled_count + stat->user_disabled_count);
@@ -1032,29 +1050,38 @@ select_device_type(int *dev_type_p, int bytelength)
1032 stat = &z90crypt.hdware_info->type_mask[CEX2C]; 1050 stat = &z90crypt.hdware_info->type_mask[CEX2C];
1033 CEX2C_avail = stat->st_count - 1051 CEX2C_avail = stat->st_count -
1034 (stat->disabled_count + stat->user_disabled_count); 1052 (stat->disabled_count + stat->user_disabled_count);
1035 if (PCICA_avail || PCIXCC_MCL3_avail || CEX2C_avail) { 1053 stat = &z90crypt.hdware_info->type_mask[CEX2A];
1054 CEX2A_avail = stat->st_count -
1055 (stat->disabled_count + stat->user_disabled_count);
1056 if (PCICA_avail || PCIXCC_MCL3_avail || CEX2C_avail || CEX2A_avail) {
1036 /** 1057 /**
1037 * bitlength is a factor, PCICA is the most capable, even with 1058 * bitlength is a factor, PCICA or CEX2A are the most capable,
1038 * the new MCL for PCIXCC. 1059 * even with the new MCL for PCIXCC.
1039 */ 1060 */
1040 if ((bytelength < PCIXCC_MIN_MOD_SIZE) || 1061 if ((bytelength < PCIXCC_MIN_MOD_SIZE) ||
1041 (!ext_bitlens && (bytelength < OLD_PCIXCC_MIN_MOD_SIZE))) { 1062 (!ext_bitlens && (bytelength < OLD_PCIXCC_MIN_MOD_SIZE))) {
1042 if (!PCICA_avail) 1063 if (PCICA_avail) {
1043 return -1;
1044 else {
1045 *dev_type_p = PCICA; 1064 *dev_type_p = PCICA;
1046 return 0; 1065 return 0;
1047 } 1066 }
1067 if (CEX2A_avail) {
1068 *dev_type_p = CEX2A;
1069 return 0;
1070 }
1071 return -1;
1048 } 1072 }
1049 1073
1050 index_to_use = count % (PCICA_avail + PCIXCC_MCL3_avail + 1074 index_to_use = count % (PCICA_avail + PCIXCC_MCL3_avail +
1051 CEX2C_avail); 1075 CEX2C_avail + CEX2A_avail);
1052 if (index_to_use < PCICA_avail) 1076 if (index_to_use < PCICA_avail)
1053 *dev_type_p = PCICA; 1077 *dev_type_p = PCICA;
1054 else if (index_to_use < (PCICA_avail + PCIXCC_MCL3_avail)) 1078 else if (index_to_use < (PCICA_avail + PCIXCC_MCL3_avail))
1055 *dev_type_p = PCIXCC_MCL3; 1079 *dev_type_p = PCIXCC_MCL3;
1056 else 1080 else if (index_to_use < (PCICA_avail + PCIXCC_MCL3_avail +
1081 CEX2C_avail))
1057 *dev_type_p = CEX2C; 1082 *dev_type_p = CEX2C;
1083 else
1084 *dev_type_p = CEX2A;
1058 count++; 1085 count++;
1059 return 0; 1086 return 0;
1060 } 1087 }
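
A worked example of the extended round-robin above, with hypothetical counts: given 1 PCICA, 2 PCIXCC_MCL3, 0 CEX2C and 1 CEX2A available, index_to_use cycles 0..3 over the concatenated availability ranges, picking PCICA, MCL3, MCL3, CEX2A in turn. A standalone model:

        #include <stdio.h>

        int main(void)
        {
                int pcica = 1, mcl3 = 2, cex2c = 0, cex2a = 1;  /* made up */
                int total = pcica + mcl3 + cex2c + cex2a;
                int count;

                for (count = 0; count < 8; count++) {
                        int idx = count % total;
                        const char *pick;

                        /* walk the concatenated ranges, as the driver does */
                        if (idx < pcica)
                                pick = "PCICA";
                        else if (idx < pcica + mcl3)
                                pick = "PCIXCC_MCL3";
                        else if (idx < pcica + mcl3 + cex2c)
                                pick = "CEX2C";
                        else
                                pick = "CEX2A";
                        printf("request %d -> %s\n", count, pick);
                }
                return 0;
        }
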
@@ -1359,7 +1386,7 @@ build_caller(struct work_element *we_p, short function)
1359 1386
1360 if ((we_p->devtype != PCICC) && (we_p->devtype != PCICA) && 1387 if ((we_p->devtype != PCICC) && (we_p->devtype != PCICA) &&
1361 (we_p->devtype != PCIXCC_MCL2) && (we_p->devtype != PCIXCC_MCL3) && 1388 (we_p->devtype != PCIXCC_MCL2) && (we_p->devtype != PCIXCC_MCL3) &&
1362 (we_p->devtype != CEX2C)) 1389 (we_p->devtype != CEX2C) && (we_p->devtype != CEX2A))
1363 return SEN_NOT_AVAIL; 1390 return SEN_NOT_AVAIL;
1364 1391
1365 memcpy(caller_p->caller_id, we_p->caller_id, 1392 memcpy(caller_p->caller_id, we_p->caller_id,
@@ -1428,7 +1455,8 @@ get_crypto_request_buffer(struct work_element *we_p)
1428 1455
1429 if ((we_p->devtype != PCICA) && (we_p->devtype != PCICC) && 1456 if ((we_p->devtype != PCICA) && (we_p->devtype != PCICC) &&
1430 (we_p->devtype != PCIXCC_MCL2) && (we_p->devtype != PCIXCC_MCL3) && 1457 (we_p->devtype != PCIXCC_MCL2) && (we_p->devtype != PCIXCC_MCL3) &&
1431 (we_p->devtype != CEX2C) && (we_p->devtype != ANYDEV)) { 1458 (we_p->devtype != CEX2C) && (we_p->devtype != CEX2A) &&
1459 (we_p->devtype != ANYDEV)) {
1432 PRINTK("invalid device type\n"); 1460 PRINTK("invalid device type\n");
1433 return SEN_USER_ERROR; 1461 return SEN_USER_ERROR;
1434 } 1462 }
@@ -1503,8 +1531,9 @@ get_crypto_request_buffer(struct work_element *we_p)
1503 1531
1504 function = PCI_FUNC_KEY_ENCRYPT; 1532 function = PCI_FUNC_KEY_ENCRYPT;
1505 switch (we_p->devtype) { 1533 switch (we_p->devtype) {
1506 /* PCICA does everything with a simple RSA mod-expo operation */ 1534 /* PCICA and CEX2A do everything with a simple RSA mod-expo operation */
1507 case PCICA: 1535 case PCICA:
1536 case CEX2A:
1508 function = PCI_FUNC_KEY_ENCRYPT; 1537 function = PCI_FUNC_KEY_ENCRYPT;
1509 break; 1538 break;
1510 /** 1539 /**
@@ -1662,7 +1691,8 @@ z90crypt_rsa(struct priv_data *private_data_p, pid_t pid,
1662 * trigger a fallback to software. 1691 * trigger a fallback to software.
1663 */ 1692 */
1664 case -EINVAL: 1693 case -EINVAL:
1665 if (we_p->devtype != PCICA) 1694 if ((we_p->devtype != PCICA) &&
1695 (we_p->devtype != CEX2A))
1666 rv = -EGETBUFF; 1696 rv = -EGETBUFF;
1667 break; 1697 break;
1668 case -ETIMEOUT: 1698 case -ETIMEOUT:
@@ -1779,6 +1809,12 @@ z90crypt_unlocked_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1779 ret = -EFAULT; 1809 ret = -EFAULT;
1780 break; 1810 break;
1781 1811
1812 case Z90STAT_CEX2ACOUNT:
1813 tempstat = get_status_CEX2Acount();
1814 if (copy_to_user((int __user *)arg, &tempstat, sizeof(int)) != 0)
1815 ret = -EFAULT;
1816 break;
1817
1782 case Z90STAT_REQUESTQ_COUNT: 1818 case Z90STAT_REQUESTQ_COUNT:
1783 tempstat = get_status_requestq_count(); 1819 tempstat = get_status_requestq_count();
1784 if (copy_to_user((int __user *)arg, &tempstat, sizeof(int)) != 0) 1820 if (copy_to_user((int __user *)arg, &tempstat, sizeof(int)) != 0)
@@ -2019,6 +2055,8 @@ z90crypt_status(char *resp_buff, char **start, off_t offset,
2019 get_status_PCIXCCMCL3count()); 2055 get_status_PCIXCCMCL3count());
2020 len += sprintf(resp_buff+len, "CEX2C count: %d\n", 2056 len += sprintf(resp_buff+len, "CEX2C count: %d\n",
2021 get_status_CEX2Ccount()); 2057 get_status_CEX2Ccount());
2058 len += sprintf(resp_buff+len, "CEX2A count: %d\n",
2059 get_status_CEX2Acount());
2022 len += sprintf(resp_buff+len, "requestq count: %d\n", 2060 len += sprintf(resp_buff+len, "requestq count: %d\n",
2023 get_status_requestq_count()); 2061 get_status_requestq_count());
2024 len += sprintf(resp_buff+len, "pendingq count: %d\n", 2062 len += sprintf(resp_buff+len, "pendingq count: %d\n",
@@ -2026,8 +2064,8 @@ z90crypt_status(char *resp_buff, char **start, off_t offset,
2026 len += sprintf(resp_buff+len, "Total open handles: %d\n\n", 2064 len += sprintf(resp_buff+len, "Total open handles: %d\n\n",
2027 get_status_totalopen_count()); 2065 get_status_totalopen_count());
2028 len += sprinthx( 2066 len += sprinthx(
2029 "Online devices: 1: PCICA, 2: PCICC, 3: PCIXCC (MCL2), " 2067 "Online devices: 1=PCICA 2=PCICC 3=PCIXCC(MCL2) "
2030 "4: PCIXCC (MCL3), 5: CEX2C", 2068 "4=PCIXCC(MCL3) 5=CEX2C 6=CEX2A",
2031 resp_buff+len, 2069 resp_buff+len,
2032 get_status_status_mask(workarea), 2070 get_status_status_mask(workarea),
2033 Z90CRYPT_NUM_APS); 2071 Z90CRYPT_NUM_APS);
@@ -2140,6 +2178,7 @@ z90crypt_status_write(struct file *file, const char __user *buffer,
2140 case '3': // PCIXCC_MCL2 2178 case '3': // PCIXCC_MCL2
2141 case '4': // PCIXCC_MCL3 2179 case '4': // PCIXCC_MCL3
2142 case '5': // CEX2C 2180 case '5': // CEX2C
2181 case '6': // CEX2A
2143 j++; 2182 j++;
2144 break; 2183 break;
2145 case 'd': 2184 case 'd':
@@ -3007,7 +3046,9 @@ create_crypto_device(int index)
3007 z90crypt.hdware_info->device_type_array[index] = 4; 3046 z90crypt.hdware_info->device_type_array[index] = 4;
3008 else if (deviceType == CEX2C) 3047 else if (deviceType == CEX2C)
3009 z90crypt.hdware_info->device_type_array[index] = 5; 3048 z90crypt.hdware_info->device_type_array[index] = 5;
3010 else 3049 else if (deviceType == CEX2A)
3050 z90crypt.hdware_info->device_type_array[index] = 6;
3051 else // No idea how this would happen.
3011 z90crypt.hdware_info->device_type_array[index] = -1; 3052 z90crypt.hdware_info->device_type_array[index] = -1;
3012 } 3053 }
3013 3054
diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig
index a7efc394515e..548854754921 100644
--- a/drivers/s390/net/Kconfig
+++ b/drivers/s390/net/Kconfig
@@ -1,5 +1,5 @@
1menu "S/390 network device drivers" 1menu "S/390 network device drivers"
2 depends on NETDEVICES && ARCH_S390 2 depends on NETDEVICES && S390
3 3
4config LCS 4config LCS
5 tristate "Lan Channel Station Interface" 5 tristate "Lan Channel Station Interface"
diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c
index 6b63d21612ec..e70af7f39946 100644
--- a/drivers/s390/net/claw.c
+++ b/drivers/s390/net/claw.c
@@ -1603,7 +1603,7 @@ dumpit(char* buf, int len)
1603 __u32 ct, sw, rm, dup; 1603 __u32 ct, sw, rm, dup;
1604 char *ptr, *rptr; 1604 char *ptr, *rptr;
1605 char tbuf[82], tdup[82]; 1605 char tbuf[82], tdup[82];
1606#if (CONFIG_ARCH_S390X) 1606#if (CONFIG_64BIT)
1607 char addr[22]; 1607 char addr[22];
1608#else 1608#else
1609 char addr[12]; 1609 char addr[12];
@@ -1619,7 +1619,7 @@ dumpit(char* buf, int len)
1619 dup = 0; 1619 dup = 0;
1620 for ( ct=0; ct < len; ct++, ptr++, rptr++ ) { 1620 for ( ct=0; ct < len; ct++, ptr++, rptr++ ) {
1621 if (sw == 0) { 1621 if (sw == 0) {
1622#if (CONFIG_ARCH_S390X) 1622#if (CONFIG_64BIT)
1623 sprintf(addr, "%16.16lX",(unsigned long)rptr); 1623 sprintf(addr, "%16.16lX",(unsigned long)rptr);
1624#else 1624#else
1625 sprintf(addr, "%8.8X",(__u32)rptr); 1625 sprintf(addr, "%8.8X",(__u32)rptr);
@@ -1634,7 +1634,7 @@ dumpit(char* buf, int len)
1634 if (sw == 8) { 1634 if (sw == 8) {
1635 strcat(bhex, " "); 1635 strcat(bhex, " ");
1636 } 1636 }
1637#if (CONFIG_ARCH_S390X) 1637#if (CONFIG_64BIT)
1638 sprintf(tbuf,"%2.2lX", (unsigned long)*ptr); 1638 sprintf(tbuf,"%2.2lX", (unsigned long)*ptr);
1639#else 1639#else
1640 sprintf(tbuf,"%2.2X", (__u32)*ptr); 1640 sprintf(tbuf,"%2.2X", (__u32)*ptr);
diff --git a/drivers/s390/net/cu3088.c b/drivers/s390/net/cu3088.c
index 0075894c71db..77dacb465732 100644
--- a/drivers/s390/net/cu3088.c
+++ b/drivers/s390/net/cu3088.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * $Id: cu3088.c,v 1.35 2005/03/30 19:28:52 richtera Exp $ 2 * $Id: cu3088.c,v 1.36 2005/10/25 14:37:17 cohuck Exp $
3 * 3 *
4 * CTC / LCS ccw_device driver 4 * CTC / LCS ccw_device driver
5 * 5 *
@@ -27,6 +27,7 @@
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/err.h> 28#include <linux/err.h>
29 29
30#include <asm/s390_rdev.h>
30#include <asm/ccwdev.h> 31#include <asm/ccwdev.h>
31#include <asm/ccwgroup.h> 32#include <asm/ccwgroup.h>
32 33
diff --git a/drivers/s390/net/iucv.c b/drivers/s390/net/iucv.c
index df7647c3c100..ea8177392564 100644
--- a/drivers/s390/net/iucv.c
+++ b/drivers/s390/net/iucv.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * $Id: iucv.c,v 1.45 2005/04/26 22:59:06 braunu Exp $ 2 * $Id: iucv.c,v 1.47 2005/11/21 11:35:22 mschwide Exp $
3 * 3 *
4 * IUCV network driver 4 * IUCV network driver
5 * 5 *
@@ -29,7 +29,7 @@
29 * along with this program; if not, write to the Free Software 29 * along with this program; if not, write to the Free Software
30 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 30 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31 * 31 *
32 * RELEASE-TAG: IUCV lowlevel driver $Revision: 1.45 $ 32 * RELEASE-TAG: IUCV lowlevel driver $Revision: 1.47 $
33 * 33 *
34 */ 34 */
35 35
@@ -54,7 +54,7 @@
54#include <asm/s390_ext.h> 54#include <asm/s390_ext.h>
55#include <asm/ebcdic.h> 55#include <asm/ebcdic.h>
56#include <asm/smp.h> 56#include <asm/smp.h>
57#include <asm/ccwdev.h> //for root device stuff 57#include <asm/s390_rdev.h>
58 58
59/* FLAGS: 59/* FLAGS:
60 * All flags are defined in the field IPFLAGS1 of each function 60 * All flags are defined in the field IPFLAGS1 of each function
@@ -355,7 +355,7 @@ do { \
355static void 355static void
356iucv_banner(void) 356iucv_banner(void)
357{ 357{
358 char vbuf[] = "$Revision: 1.45 $"; 358 char vbuf[] = "$Revision: 1.47 $";
359 char *version = vbuf; 359 char *version = vbuf;
360 360
361 if ((version = strchr(version, ':'))) { 361 if ((version = strchr(version, ':'))) {
@@ -477,7 +477,7 @@ grab_param(void)
477 ptr++; 477 ptr++;
478 if (ptr >= iucv_param_pool + PARAM_POOL_SIZE) 478 if (ptr >= iucv_param_pool + PARAM_POOL_SIZE)
479 ptr = iucv_param_pool; 479 ptr = iucv_param_pool;
480 } while (atomic_compare_and_swap(0, 1, &ptr->in_use)); 480 } while (atomic_cmpxchg(&ptr->in_use, 0, 1) != 0);
481 hint = ptr - iucv_param_pool; 481 hint = ptr - iucv_param_pool;
482 482
483 memset(&ptr->param, 0, sizeof(ptr->param)); 483 memset(&ptr->param, 0, sizeof(ptr->param));
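
The atomic_compare_and_swap() -> atomic_cmpxchg() conversions here and in the qeth hunks below change the success test: the old s390-private primitive returned nonzero when the swap failed, while atomic_cmpxchg(v, old, new) returns the value previously in *v, so success means "return value == old". A C11 userspace model of the converted grab loop (not kernel code):

        #include <stdatomic.h>
        #include <stdio.h>

        static atomic_int in_use;       /* 0 = free, 1 = taken */

        static int try_grab(void)
        {
                int expected = 0;

                /* CAS 0 -> 1; true only if we observed 0, i.e. we own it now */
                return atomic_compare_exchange_strong(&in_use, &expected, 1);
        }

        int main(void)
        {
                printf("first grab: %d\n", try_grab());         /* 1 */
                printf("second grab: %d\n", try_grab());        /* 0 */
                return 0;
        }
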
diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c
index f8f55cc468ba..97f927c01a82 100644
--- a/drivers/s390/net/qeth_main.c
+++ b/drivers/s390/net/qeth_main.c
@@ -65,6 +65,7 @@
65#include <asm/timex.h> 65#include <asm/timex.h>
66#include <asm/semaphore.h> 66#include <asm/semaphore.h>
67#include <asm/uaccess.h> 67#include <asm/uaccess.h>
68#include <asm/s390_rdev.h>
68 69
69#include "qeth.h" 70#include "qeth.h"
70#include "qeth_mpc.h" 71#include "qeth_mpc.h"
@@ -1396,7 +1397,7 @@ qeth_idx_activate_get_answer(struct qeth_channel *channel,
1396 channel->ccw.cda = (__u32) __pa(iob->data); 1397 channel->ccw.cda = (__u32) __pa(iob->data);
1397 1398
1398 wait_event(card->wait_q, 1399 wait_event(card->wait_q,
1399 atomic_compare_and_swap(0,1,&channel->irq_pending) == 0); 1400 atomic_cmpxchg(&channel->irq_pending, 0, 1) == 0);
1400 QETH_DBF_TEXT(setup, 6, "noirqpnd"); 1401 QETH_DBF_TEXT(setup, 6, "noirqpnd");
1401 spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); 1402 spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
1402 rc = ccw_device_start(channel->ccwdev, 1403 rc = ccw_device_start(channel->ccwdev,
@@ -1463,7 +1464,7 @@ qeth_idx_activate_channel(struct qeth_channel *channel,
1463 memcpy(QETH_IDX_ACT_QDIO_DEV_REALADDR(iob->data), &temp, 2); 1464 memcpy(QETH_IDX_ACT_QDIO_DEV_REALADDR(iob->data), &temp, 2);
1464 1465
1465 wait_event(card->wait_q, 1466 wait_event(card->wait_q,
1466 atomic_compare_and_swap(0,1,&channel->irq_pending) == 0); 1467 atomic_cmpxchg(&channel->irq_pending, 0, 1) == 0);
1467 QETH_DBF_TEXT(setup, 6, "noirqpnd"); 1468 QETH_DBF_TEXT(setup, 6, "noirqpnd");
1468 spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags); 1469 spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
1469 rc = ccw_device_start(channel->ccwdev, 1470 rc = ccw_device_start(channel->ccwdev,
@@ -1616,7 +1617,7 @@ qeth_issue_next_read(struct qeth_card *card)
1616 } 1617 }
1617 qeth_setup_ccw(&card->read, iob->data, QETH_BUFSIZE); 1618 qeth_setup_ccw(&card->read, iob->data, QETH_BUFSIZE);
1618 wait_event(card->wait_q, 1619 wait_event(card->wait_q,
1619 atomic_compare_and_swap(0,1,&card->read.irq_pending) == 0); 1620 atomic_cmpxchg(&card->read.irq_pending, 0, 1) == 0);
1620 QETH_DBF_TEXT(trace, 6, "noirqpnd"); 1621 QETH_DBF_TEXT(trace, 6, "noirqpnd");
1621 rc = ccw_device_start(card->read.ccwdev, &card->read.ccw, 1622 rc = ccw_device_start(card->read.ccwdev, &card->read.ccw,
1622 (addr_t) iob, 0, 0); 1623 (addr_t) iob, 0, 0);
@@ -1882,7 +1883,7 @@ qeth_send_control_data(struct qeth_card *card, int len,
1882 spin_unlock_irqrestore(&card->lock, flags); 1883 spin_unlock_irqrestore(&card->lock, flags);
1883 QETH_DBF_HEX(control, 2, iob->data, QETH_DBF_CONTROL_LEN); 1884 QETH_DBF_HEX(control, 2, iob->data, QETH_DBF_CONTROL_LEN);
1884 wait_event(card->wait_q, 1885 wait_event(card->wait_q,
1885 atomic_compare_and_swap(0,1,&card->write.irq_pending) == 0); 1886 atomic_cmpxchg(&card->write.irq_pending, 0, 1) == 0);
1886 qeth_prepare_control_data(card, len, iob); 1887 qeth_prepare_control_data(card, len, iob);
1887 if (IS_IPA(iob->data)) 1888 if (IS_IPA(iob->data))
1888 timer.expires = jiffies + QETH_IPA_TIMEOUT; 1889 timer.expires = jiffies + QETH_IPA_TIMEOUT;
@@ -1924,7 +1925,7 @@ qeth_osn_send_control_data(struct qeth_card *card, int len,
1924 QETH_DBF_TEXT(trace, 5, "osndctrd"); 1925 QETH_DBF_TEXT(trace, 5, "osndctrd");
1925 1926
1926 wait_event(card->wait_q, 1927 wait_event(card->wait_q,
1927 atomic_compare_and_swap(0,1,&card->write.irq_pending) == 0); 1928 atomic_cmpxchg(&card->write.irq_pending, 0, 1) == 0);
1928 qeth_prepare_control_data(card, len, iob); 1929 qeth_prepare_control_data(card, len, iob);
1929 QETH_DBF_TEXT(trace, 6, "osnoirqp"); 1930 QETH_DBF_TEXT(trace, 6, "osnoirqp");
1930 spin_lock_irqsave(get_ccwdev_lock(card->write.ccwdev), flags); 1931 spin_lock_irqsave(get_ccwdev_lock(card->write.ccwdev), flags);
@@ -4236,9 +4237,8 @@ qeth_do_send_packet_fast(struct qeth_card *card, struct qeth_qdio_out_q *queue,
4236 QETH_DBF_TEXT(trace, 6, "dosndpfa"); 4237 QETH_DBF_TEXT(trace, 6, "dosndpfa");
4237 4238
4238 /* spin until we get the queue ... */ 4239 /* spin until we get the queue ... */
4239 while (atomic_compare_and_swap(QETH_OUT_Q_UNLOCKED, 4240 while (atomic_cmpxchg(&queue->state, QETH_OUT_Q_UNLOCKED,
4240 QETH_OUT_Q_LOCKED, 4241 QETH_OUT_Q_LOCKED) != QETH_OUT_Q_UNLOCKED);
4241 &queue->state));
4242 /* ... now we've got the queue */ 4242 /* ... now we've got the queue */
4243 index = queue->next_buf_to_fill; 4243 index = queue->next_buf_to_fill;
4244 buffer = &queue->bufs[queue->next_buf_to_fill]; 4244 buffer = &queue->bufs[queue->next_buf_to_fill];
@@ -4292,9 +4292,8 @@ qeth_do_send_packet(struct qeth_card *card, struct qeth_qdio_out_q *queue,
4292 QETH_DBF_TEXT(trace, 6, "dosndpkt"); 4292 QETH_DBF_TEXT(trace, 6, "dosndpkt");
4293 4293
4294 /* spin until we get the queue ... */ 4294 /* spin until we get the queue ... */
4295 while (atomic_compare_and_swap(QETH_OUT_Q_UNLOCKED, 4295 while (atomic_cmpxchg(&queue->state, QETH_OUT_Q_UNLOCKED,
4296 QETH_OUT_Q_LOCKED, 4296 QETH_OUT_Q_LOCKED) != QETH_OUT_Q_UNLOCKED);
4297 &queue->state));
4298 start_index = queue->next_buf_to_fill; 4297 start_index = queue->next_buf_to_fill;
4299 buffer = &queue->bufs[queue->next_buf_to_fill]; 4298 buffer = &queue->bufs[queue->next_buf_to_fill];
4300 /* 4299 /*
diff --git a/drivers/s390/s390_rdev.c b/drivers/s390/s390_rdev.c
new file mode 100644
index 000000000000..566cc3d185b6
--- /dev/null
+++ b/drivers/s390/s390_rdev.c
@@ -0,0 +1,53 @@
1/*
2 * drivers/s390/s390_rdev.c
3 * s390 root device
4 * $Revision: 1.2 $
5 *
6 * Copyright (C) 2002, 2005 IBM Deutschland Entwicklung GmbH,
7 * IBM Corporation
8 * Author(s): Cornelia Huck (cohuck@de.ibm.com)
9 * Carsten Otte (cotte@de.ibm.com)
10 */
11
12#include <linux/slab.h>
13#include <linux/err.h>
14#include <linux/device.h>
15#include <asm/s390_rdev.h>
16
17static void
18s390_root_dev_release(struct device *dev)
19{
20 kfree(dev);
21}
22
23struct device *
24s390_root_dev_register(const char *name)
25{
26 struct device *dev;
27 int ret;
28
29 if (!strlen(name))
30 return ERR_PTR(-EINVAL);
31 dev = kmalloc(sizeof(struct device), GFP_KERNEL);
32 if (!dev)
33 return ERR_PTR(-ENOMEM);
34 memset(dev, 0, sizeof(struct device));
35 strncpy(dev->bus_id, name, min(strlen(name), (size_t)BUS_ID_SIZE));
36 dev->release = s390_root_dev_release;
37 ret = device_register(dev);
38 if (ret) {
39 kfree(dev);
40 return ERR_PTR(ret);
41 }
42 return dev;
43}
44
45void
46s390_root_dev_unregister(struct device *dev)
47{
48 if (dev)
49 device_unregister(dev);
50}
51
52EXPORT_SYMBOL(s390_root_dev_register);
53EXPORT_SYMBOL(s390_root_dev_unregister);
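
These two exports are what cu3088, iucv and qeth pick up above via the new <asm/s390_rdev.h>. A sketch of the expected usage from a driver's init/exit path; the "mydrv" name is illustrative, not taken from the patch:

        #include <linux/module.h>
        #include <linux/err.h>
        #include <linux/device.h>
        #include <asm/s390_rdev.h>

        static struct device *mydrv_root_dev;

        static int __init mydrv_init(void)
        {
                /* root device the driver's devices will hang off */
                mydrv_root_dev = s390_root_dev_register("mydrv");
                if (IS_ERR(mydrv_root_dev))
                        return PTR_ERR(mydrv_root_dev);
                return 0;
        }

        static void __exit mydrv_exit(void)
        {
                s390_root_dev_unregister(mydrv_root_dev);
        }

        module_init(mydrv_init);
        module_exit(mydrv_exit);
        MODULE_LICENSE("GPL");
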
diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c
index 4191fd9d4d11..3bf466603512 100644
--- a/drivers/s390/s390mach.c
+++ b/drivers/s390/s390mach.c
@@ -23,7 +23,7 @@
23 23
24static struct semaphore m_sem; 24static struct semaphore m_sem;
25 25
26extern int css_process_crw(int); 26extern int css_process_crw(int, int);
27extern int chsc_process_crw(void); 27extern int chsc_process_crw(void);
28extern int chp_process_crw(int, int); 28extern int chp_process_crw(int, int);
29extern void css_reiterate_subchannels(void); 29extern void css_reiterate_subchannels(void);
@@ -49,9 +49,10 @@ s390_handle_damage(char *msg)
49static int 49static int
50s390_collect_crw_info(void *param) 50s390_collect_crw_info(void *param)
51{ 51{
52 struct crw crw; 52 struct crw crw[2];
53 int ccode, ret, slow; 53 int ccode, ret, slow;
54 struct semaphore *sem; 54 struct semaphore *sem;
55 unsigned int chain;
55 56
56 sem = (struct semaphore *)param; 57 sem = (struct semaphore *)param;
57 /* Set a nice name. */ 58 /* Set a nice name. */
@@ -59,25 +60,50 @@ s390_collect_crw_info(void *param)
59repeat: 60repeat:
60 down_interruptible(sem); 61 down_interruptible(sem);
61 slow = 0; 62 slow = 0;
63 chain = 0;
62 while (1) { 64 while (1) {
63 ccode = stcrw(&crw); 65 if (unlikely(chain > 1)) {
66 struct crw tmp_crw;
67
68 printk(KERN_WARNING"%s: Code does not support more "
69 "than two chained crws; please report to "
70 "linux390@de.ibm.com!\n", __FUNCTION__);
71 ccode = stcrw(&tmp_crw);
72 printk(KERN_WARNING"%s: crw reports slct=%d, oflw=%d, "
73 "chn=%d, rsc=%X, anc=%d, erc=%X, rsid=%X\n",
74 __FUNCTION__, tmp_crw.slct, tmp_crw.oflw,
75 tmp_crw.chn, tmp_crw.rsc, tmp_crw.anc,
76 tmp_crw.erc, tmp_crw.rsid);
77 printk(KERN_WARNING"%s: This was crw number %x in the "
78 "chain\n", __FUNCTION__, chain);
79 if (ccode != 0)
80 break;
81 chain = tmp_crw.chn ? chain + 1 : 0;
82 continue;
83 }
84 ccode = stcrw(&crw[chain]);
64 if (ccode != 0) 85 if (ccode != 0)
65 break; 86 break;
66 DBG(KERN_DEBUG "crw_info : CRW reports slct=%d, oflw=%d, " 87 DBG(KERN_DEBUG "crw_info : CRW reports slct=%d, oflw=%d, "
67 "chn=%d, rsc=%X, anc=%d, erc=%X, rsid=%X\n", 88 "chn=%d, rsc=%X, anc=%d, erc=%X, rsid=%X\n",
68 crw.slct, crw.oflw, crw.chn, crw.rsc, crw.anc, 89 crw[chain].slct, crw[chain].oflw, crw[chain].chn,
69 crw.erc, crw.rsid); 90 crw[chain].rsc, crw[chain].anc, crw[chain].erc,
91 crw[chain].rsid);
70 /* Check for overflows. */ 92 /* Check for overflows. */
71 if (crw.oflw) { 93 if (crw[chain].oflw) {
72 pr_debug("%s: crw overflow detected!\n", __FUNCTION__); 94 pr_debug("%s: crw overflow detected!\n", __FUNCTION__);
73 css_reiterate_subchannels(); 95 css_reiterate_subchannels();
96 chain = 0;
74 slow = 1; 97 slow = 1;
75 continue; 98 continue;
76 } 99 }
77 switch (crw.rsc) { 100 switch (crw[chain].rsc) {
78 case CRW_RSC_SCH: 101 case CRW_RSC_SCH:
79 pr_debug("source is subchannel %04X\n", crw.rsid); 102 if (crw[0].chn && !chain)
80 ret = css_process_crw (crw.rsid); 103 break;
104 pr_debug("source is subchannel %04X\n", crw[0].rsid);
105 ret = css_process_crw (crw[0].rsid,
106 chain ? crw[1].rsid : 0);
81 if (ret == -EAGAIN) 107 if (ret == -EAGAIN)
82 slow = 1; 108 slow = 1;
83 break; 109 break;
@@ -85,18 +111,18 @@ repeat:
85 pr_debug("source is monitoring facility\n"); 111 pr_debug("source is monitoring facility\n");
86 break; 112 break;
87 case CRW_RSC_CPATH: 113 case CRW_RSC_CPATH:
88 pr_debug("source is channel path %02X\n", crw.rsid); 114 pr_debug("source is channel path %02X\n", crw[0].rsid);
89 switch (crw.erc) { 115 switch (crw[0].erc) {
90 case CRW_ERC_IPARM: /* Path has come. */ 116 case CRW_ERC_IPARM: /* Path has come. */
91 ret = chp_process_crw(crw.rsid, 1); 117 ret = chp_process_crw(crw[0].rsid, 1);
92 break; 118 break;
93 case CRW_ERC_PERRI: /* Path has gone. */ 119 case CRW_ERC_PERRI: /* Path has gone. */
94 case CRW_ERC_PERRN: 120 case CRW_ERC_PERRN:
95 ret = chp_process_crw(crw.rsid, 0); 121 ret = chp_process_crw(crw[0].rsid, 0);
96 break; 122 break;
97 default: 123 default:
98 pr_debug("Don't know how to handle erc=%x\n", 124 pr_debug("Don't know how to handle erc=%x\n",
99 crw.erc); 125 crw[0].erc);
100 ret = 0; 126 ret = 0;
101 } 127 }
102 if (ret == -EAGAIN) 128 if (ret == -EAGAIN)
@@ -115,6 +141,8 @@ repeat:
115 pr_debug("unknown source\n"); 141 pr_debug("unknown source\n");
116 break; 142 break;
117 } 143 }
144 /* chain is always 0 or 1 here. */
145 chain = crw[chain].chn ? chain + 1 : 0;
118 } 146 }
119 if (slow) 147 if (slow)
120 queue_work(slow_path_wq, &slow_path_work); 148 queue_work(slow_path_wq, &slow_path_work);
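
The new bookkeeping reads at most two chained CRWs: chn=1 on a CRW announces that the next one belongs to the same event, and a subchannel CRW pair is handed to css_process_crw() only once the second (extended-rsid) CRW has arrived. A userspace model of the chain variable, with made-up rsid values and a printf standing in for the css_process_crw() call:

        #include <stdio.h>

        struct crw_model {
                unsigned int chn;       /* 1: another CRW of this event follows */
                unsigned int rsid;
        };

        int main(void)
        {
                struct crw_model crw[2] = {
                        { 1, 0x0001 },  /* first CRW of a chained pair */
                        { 0, 0x4711 },  /* second CRW, extended rsid */
                };
                unsigned int chain = 0, i;

                for (i = 0; i < 2; i++) {
                        if (chain)      /* second of a pair: process both rsids */
                                printf("css_process_crw(%04X, %04X)\n",
                                       crw[0].rsid, crw[i].rsid);
                        chain = crw[i].chn ? chain + 1 : 0;
                }
                return 0;
        }
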
@@ -218,7 +246,7 @@ s390_revalidate_registers(struct mci *mci)
218 */ 246 */
219 kill_task = 1; 247 kill_task = 1;
220 248
221#ifndef __s390x__ 249#ifndef CONFIG_64BIT
222 asm volatile("ld 0,0(%0)\n" 250 asm volatile("ld 0,0(%0)\n"
223 "ld 2,8(%0)\n" 251 "ld 2,8(%0)\n"
224 "ld 4,16(%0)\n" 252 "ld 4,16(%0)\n"
@@ -227,7 +255,7 @@ s390_revalidate_registers(struct mci *mci)
227#endif 255#endif
228 256
229 if (MACHINE_HAS_IEEE) { 257 if (MACHINE_HAS_IEEE) {
230#ifdef __s390x__ 258#ifdef CONFIG_64BIT
231 fpt_save_area = &S390_lowcore.floating_pt_save_area; 259 fpt_save_area = &S390_lowcore.floating_pt_save_area;
232 fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area; 260 fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area;
233#else 261#else
@@ -286,7 +314,7 @@ s390_revalidate_registers(struct mci *mci)
286 */ 314 */
287 s390_handle_damage("invalid control registers."); 315 s390_handle_damage("invalid control registers.");
288 else 316 else
289#ifdef __s390x__ 317#ifdef CONFIG_64BIT
290 asm volatile("lctlg 0,15,0(%0)" 318 asm volatile("lctlg 0,15,0(%0)"
291 : : "a" (&S390_lowcore.cregs_save_area)); 319 : : "a" (&S390_lowcore.cregs_save_area));
292#else 320#else
@@ -299,7 +327,7 @@ s390_revalidate_registers(struct mci *mci)
299 * can't write something sensible into that register. 327 * can't write something sensible into that register.
300 */ 328 */
301 329
302#ifdef __s390x__ 330#ifdef CONFIG_64BIT
303 /* 331 /*
304 * See if we can revalidate the TOD programmable register with its 332 * See if we can revalidate the TOD programmable register with its
305 * old contents (should be zero) otherwise set it to zero. 333 * old contents (should be zero) otherwise set it to zero.
@@ -356,7 +384,7 @@ s390_do_machine_check(struct pt_regs *regs)
356 if (mci->b) { 384 if (mci->b) {
357 /* Processing backup -> verify if we can survive this */ 385 /* Processing backup -> verify if we can survive this */
358 u64 z_mcic, o_mcic, t_mcic; 386 u64 z_mcic, o_mcic, t_mcic;
359#ifdef __s390x__ 387#ifdef CONFIG_64BIT
360 z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29); 388 z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
361 o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 | 389 o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
362 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 | 390 1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
diff --git a/drivers/s390/sysinfo.c b/drivers/s390/sysinfo.c
index 87c2db1bd4f5..66da840c9316 100644
--- a/drivers/s390/sysinfo.c
+++ b/drivers/s390/sysinfo.c
@@ -106,7 +106,7 @@ static inline int stsi (void *sysinfo,
106{ 106{
107 int cc, retv; 107 int cc, retv;
108 108
109#ifndef CONFIG_ARCH_S390X 109#ifndef CONFIG_64BIT
110 __asm__ __volatile__ ( "lr\t0,%2\n" 110 __asm__ __volatile__ ( "lr\t0,%2\n"
111 "\tlr\t1,%3\n" 111 "\tlr\t1,%3\n"
112 "\tstsi\t0(%4)\n" 112 "\tstsi\t0(%4)\n"
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 4c42065dea88..3c606cf8c8ca 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -914,7 +914,7 @@ config SCSI_INIA100
914 914
915config SCSI_PPA 915config SCSI_PPA
916 tristate "IOMEGA parallel port (ppa - older drives)" 916 tristate "IOMEGA parallel port (ppa - older drives)"
917 depends on SCSI && PARPORT 917 depends on SCSI && PARPORT_PC
918 ---help--- 918 ---help---
919 This driver supports older versions of IOMEGA's parallel port ZIP 919 This driver supports older versions of IOMEGA's parallel port ZIP
920 drive (a 100 MB removable media device). 920 drive (a 100 MB removable media device).
@@ -941,7 +941,7 @@ config SCSI_PPA
941 941
942config SCSI_IMM 942config SCSI_IMM
943 tristate "IOMEGA parallel port (imm - newer drives)" 943 tristate "IOMEGA parallel port (imm - newer drives)"
944 depends on SCSI && PARPORT 944 depends on SCSI && PARPORT_PC
945 ---help--- 945 ---help---
946 This driver supports newer versions of IOMEGA's parallel port ZIP 946 This driver supports newer versions of IOMEGA's parallel port ZIP
947 drive (a 100 MB removable media device). 947 drive (a 100 MB removable media device).
@@ -968,7 +968,7 @@ config SCSI_IMM
968 968
969config SCSI_IZIP_EPP16 969config SCSI_IZIP_EPP16
970 bool "ppa/imm option - Use slow (but safe) EPP-16" 970 bool "ppa/imm option - Use slow (but safe) EPP-16"
971 depends on PARPORT && (SCSI_PPA || SCSI_IMM) 971 depends on SCSI_PPA || SCSI_IMM
972 ---help--- 972 ---help---
973 EPP (Enhanced Parallel Port) is a standard for parallel ports which 973 EPP (Enhanced Parallel Port) is a standard for parallel ports which
974 allows them to act as expansion buses that can handle up to 64 974 allows them to act as expansion buses that can handle up to 64
@@ -983,7 +983,7 @@ config SCSI_IZIP_EPP16
983 983
984config SCSI_IZIP_SLOW_CTR 984config SCSI_IZIP_SLOW_CTR
985 bool "ppa/imm option - Assume slow parport control register" 985 bool "ppa/imm option - Assume slow parport control register"
986 depends on PARPORT && (SCSI_PPA || SCSI_IMM) 986 depends on SCSI_PPA || SCSI_IMM
987 help 987 help
988 Some parallel ports are known to have excessive delays between 988 Some parallel ports are known to have excessive delays between
989 changing the parallel port control register and good data being 989 changing the parallel port control register and good data being
@@ -1815,7 +1815,7 @@ config SCSI_SUNESP
1815 1815
1816config ZFCP 1816config ZFCP
1817 tristate "FCP host bus adapter driver for IBM eServer zSeries" 1817 tristate "FCP host bus adapter driver for IBM eServer zSeries"
1818 depends on ARCH_S390 && QDIO && SCSI 1818 depends on S390 && QDIO && SCSI
1819 select SCSI_FC_ATTRS 1819 select SCSI_FC_ATTRS
1820 help 1820 help
1821 If you want to access SCSI devices attached to your IBM eServer 1821 If you want to access SCSI devices attached to your IBM eServer
diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 887eaa2a3ebf..d113290b5fc0 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -214,7 +214,6 @@ static struct scsi_host_template ahci_sht = {
214 .dma_boundary = AHCI_DMA_BOUNDARY, 214 .dma_boundary = AHCI_DMA_BOUNDARY,
215 .slave_configure = ata_scsi_slave_config, 215 .slave_configure = ata_scsi_slave_config,
216 .bios_param = ata_std_bios_param, 216 .bios_param = ata_std_bios_param,
217 .ordered_flush = 1,
218}; 217};
219 218
220static const struct ata_port_operations ahci_ops = { 219static const struct ata_port_operations ahci_ops = {
diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c
index 0ea27873b9ff..557788ec4eec 100644
--- a/drivers/scsi/ata_piix.c
+++ b/drivers/scsi/ata_piix.c
@@ -166,6 +166,8 @@ static struct pci_driver piix_pci_driver = {
166 .id_table = piix_pci_tbl, 166 .id_table = piix_pci_tbl,
167 .probe = piix_init_one, 167 .probe = piix_init_one,
168 .remove = ata_pci_remove_one, 168 .remove = ata_pci_remove_one,
169 .suspend = ata_pci_device_suspend,
170 .resume = ata_pci_device_resume,
169}; 171};
170 172
171static struct scsi_host_template piix_sht = { 173static struct scsi_host_template piix_sht = {
@@ -185,7 +187,8 @@ static struct scsi_host_template piix_sht = {
185 .dma_boundary = ATA_DMA_BOUNDARY, 187 .dma_boundary = ATA_DMA_BOUNDARY,
186 .slave_configure = ata_scsi_slave_config, 188 .slave_configure = ata_scsi_slave_config,
187 .bios_param = ata_std_bios_param, 189 .bios_param = ata_std_bios_param,
188 .ordered_flush = 1, 190 .resume = ata_scsi_device_resume,
191 .suspend = ata_scsi_device_suspend,
189}; 192};
190 193
191static const struct ata_port_operations piix_pata_ops = { 194static const struct ata_port_operations piix_pata_ops = {
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 5b9c2c5a7f0e..66783c860a19 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -347,17 +347,8 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
347 shost->cmd_per_lun = sht->cmd_per_lun; 347 shost->cmd_per_lun = sht->cmd_per_lun;
348 shost->unchecked_isa_dma = sht->unchecked_isa_dma; 348 shost->unchecked_isa_dma = sht->unchecked_isa_dma;
349 shost->use_clustering = sht->use_clustering; 349 shost->use_clustering = sht->use_clustering;
350 shost->ordered_flush = sht->ordered_flush;
351 shost->ordered_tag = sht->ordered_tag; 350 shost->ordered_tag = sht->ordered_tag;
352 351
353 /*
354 * hosts/devices that do queueing must support ordered tags
355 */
356 if (shost->can_queue > 1 && shost->ordered_flush) {
357 printk(KERN_ERR "scsi: ordered flushes don't support queueing\n");
358 shost->ordered_flush = 0;
359 }
360
361 if (sht->max_host_blocked) 352 if (sht->max_host_blocked)
362 shost->max_host_blocked = sht->max_host_blocked; 353 shost->max_host_blocked = sht->max_host_blocked;
363 else 354 else
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index 4cb1f3ed9100..3c688ef54660 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -1046,7 +1046,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
1046 1046
1047 /* kill current request */ 1047 /* kill current request */
1048 blkdev_dequeue_request(req); 1048 blkdev_dequeue_request(req);
1049 end_that_request_last(req); 1049 end_that_request_last(req, 0);
1050 if (req->flags & REQ_SENSE) 1050 if (req->flags & REQ_SENSE)
1051 kfree(scsi->pc->buffer); 1051 kfree(scsi->pc->buffer);
1052 kfree(scsi->pc); 1052 kfree(scsi->pc);
@@ -1056,7 +1056,7 @@ static int idescsi_eh_reset (struct scsi_cmnd *cmd)
1056 /* now nuke the drive queue */ 1056 /* now nuke the drive queue */
1057 while ((req = elv_next_request(drive->queue))) { 1057 while ((req = elv_next_request(drive->queue))) {
1058 blkdev_dequeue_request(req); 1058 blkdev_dequeue_request(req);
1059 end_that_request_last(req); 1059 end_that_request_last(req, 0);
1060 } 1060 }
1061 1061
1062 HWGROUP(drive)->rq = NULL; 1062 HWGROUP(drive)->rq = NULL;
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index 9ea102587914..f55b9b3f7b37 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -562,16 +562,28 @@ static const u8 ata_rw_cmds[] = {
562 ATA_CMD_WRITE_MULTI, 562 ATA_CMD_WRITE_MULTI,
563 ATA_CMD_READ_MULTI_EXT, 563 ATA_CMD_READ_MULTI_EXT,
564 ATA_CMD_WRITE_MULTI_EXT, 564 ATA_CMD_WRITE_MULTI_EXT,
565 0,
566 0,
567 0,
568 ATA_CMD_WRITE_MULTI_FUA_EXT,
565 /* pio */ 569 /* pio */
566 ATA_CMD_PIO_READ, 570 ATA_CMD_PIO_READ,
567 ATA_CMD_PIO_WRITE, 571 ATA_CMD_PIO_WRITE,
568 ATA_CMD_PIO_READ_EXT, 572 ATA_CMD_PIO_READ_EXT,
569 ATA_CMD_PIO_WRITE_EXT, 573 ATA_CMD_PIO_WRITE_EXT,
574 0,
575 0,
576 0,
577 0,
570 /* dma */ 578 /* dma */
571 ATA_CMD_READ, 579 ATA_CMD_READ,
572 ATA_CMD_WRITE, 580 ATA_CMD_WRITE,
573 ATA_CMD_READ_EXT, 581 ATA_CMD_READ_EXT,
574 ATA_CMD_WRITE_EXT 582 ATA_CMD_WRITE_EXT,
583 0,
584 0,
585 0,
586 ATA_CMD_WRITE_FUA_EXT
575}; 587};
576 588
577/** 589/**
@@ -584,25 +596,32 @@ static const u8 ata_rw_cmds[] = {
584 * LOCKING: 596 * LOCKING:
585 * caller. 597 * caller.
586 */ 598 */
587void ata_rwcmd_protocol(struct ata_queued_cmd *qc) 599int ata_rwcmd_protocol(struct ata_queued_cmd *qc)
588{ 600{
589 struct ata_taskfile *tf = &qc->tf; 601 struct ata_taskfile *tf = &qc->tf;
590 struct ata_device *dev = qc->dev; 602 struct ata_device *dev = qc->dev;
603 u8 cmd;
591 604
592 int index, lba48, write; 605 int index, fua, lba48, write;
593 606
607 fua = (tf->flags & ATA_TFLAG_FUA) ? 4 : 0;
594 lba48 = (tf->flags & ATA_TFLAG_LBA48) ? 2 : 0; 608 lba48 = (tf->flags & ATA_TFLAG_LBA48) ? 2 : 0;
595 write = (tf->flags & ATA_TFLAG_WRITE) ? 1 : 0; 609 write = (tf->flags & ATA_TFLAG_WRITE) ? 1 : 0;
596 610
597 if (dev->flags & ATA_DFLAG_PIO) { 611 if (dev->flags & ATA_DFLAG_PIO) {
598 tf->protocol = ATA_PROT_PIO; 612 tf->protocol = ATA_PROT_PIO;
599 index = dev->multi_count ? 0 : 4; 613 index = dev->multi_count ? 0 : 8;
600 } else { 614 } else {
601 tf->protocol = ATA_PROT_DMA; 615 tf->protocol = ATA_PROT_DMA;
602 index = 8; 616 index = 16;
603 } 617 }
604 618
605 tf->command = ata_rw_cmds[index + lba48 + write]; 619 cmd = ata_rw_cmds[index + fua + lba48 + write];
620 if (cmd) {
621 tf->command = cmd;
622 return 0;
623 }
624 return -1;
606} 625}
607 626
608static const char * const xfer_mode_str[] = { 627static const char * const xfer_mode_str[] = {
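
The table is indexed as base + fua + lba48 + write: base 0 for multi-count PIO, 8 for plain taskfile PIO, 16 for DMA; FUA adds 4, LBA48 adds 2, and writes add 1. The zero entries mark combinations with no ATA opcode (FUA is only defined for LBA48 writes), which is why ata_rwcmd_protocol() now returns int. A self-contained sketch of the lookup, using command names instead of opcode bytes:

    #include <stdio.h>

    /* Mirrors the 24-entry layout above; NULL = "no such command". */
    static const char *rw_cmds[24] = {
        /* multi-count PIO: base 0 */
        "READ_MULTI", "WRITE_MULTI", "READ_MULTI_EXT", "WRITE_MULTI_EXT",
        NULL, NULL, NULL, "WRITE_MULTI_FUA_EXT",
        /* taskfile PIO: base 8 */
        "PIO_READ", "PIO_WRITE", "PIO_READ_EXT", "PIO_WRITE_EXT",
        NULL, NULL, NULL, NULL,
        /* DMA: base 16 */
        "READ", "WRITE", "READ_EXT", "WRITE_EXT",
        NULL, NULL, NULL, "WRITE_FUA_EXT",
    };

    static const char *pick(int base, int fua, int lba48, int write)
    {
        return rw_cmds[base + (fua ? 4 : 0) + (lba48 ? 2 : 0) + write];
    }

    int main(void)
    {
        /* DMA + FUA + LBA48 write: 16 + 4 + 2 + 1 = 23 */
        printf("%s\n", pick(16, 1, 1, 1));
        /* FUA without LBA48 has no opcode: slot 21 is empty */
        const char *c = pick(16, 1, 0, 1);
        printf("%s\n", c ? c : "(none -> ata_rwcmd_protocol() fails)");
        return 0;
    }
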
@@ -4154,6 +4173,96 @@ err_out:
4154 * Inherited from caller. 4173 * Inherited from caller.
4155 */ 4174 */
4156 4175
4176/*
4177 * Execute a 'simple' command that consists only of the opcode 'cmd' itself,
4178 * without filling any other registers.
4179 */
4180static int ata_do_simple_cmd(struct ata_port *ap, struct ata_device *dev,
4181 u8 cmd)
4182{
4183 struct ata_taskfile tf;
4184 int err;
4185
4186 ata_tf_init(ap, &tf, dev->devno);
4187
4188 tf.command = cmd;
4189 tf.flags |= ATA_TFLAG_DEVICE;
4190 tf.protocol = ATA_PROT_NODATA;
4191
4192 err = ata_exec_internal(ap, dev, &tf, DMA_NONE, NULL, 0);
4193 if (err)
4194 printk(KERN_ERR "%s: ata command failed: %d\n",
4195 __FUNCTION__, err);
4196
4197 return err;
4198}
4199
4200static int ata_flush_cache(struct ata_port *ap, struct ata_device *dev)
4201{
4202 u8 cmd;
4203
4204 if (!ata_try_flush_cache(dev))
4205 return 0;
4206
4207 if (ata_id_has_flush_ext(dev->id))
4208 cmd = ATA_CMD_FLUSH_EXT;
4209 else
4210 cmd = ATA_CMD_FLUSH;
4211
4212 return ata_do_simple_cmd(ap, dev, cmd);
4213}
4214
4215static int ata_standby_drive(struct ata_port *ap, struct ata_device *dev)
4216{
4217 return ata_do_simple_cmd(ap, dev, ATA_CMD_STANDBYNOW1);
4218}
4219
4220static int ata_start_drive(struct ata_port *ap, struct ata_device *dev)
4221{
4222 return ata_do_simple_cmd(ap, dev, ATA_CMD_IDLEIMMEDIATE);
4223}
4224
4225/**
4226 * ata_device_resume - wake up a previously suspended device
4227 *
4228 * Kick the drive back into action by sending it an idle immediate
4229 * command and making sure its transfer mode matches between drive
4230 * and host.
4231 *
4232 */
4233int ata_device_resume(struct ata_port *ap, struct ata_device *dev)
4234{
4235 if (ap->flags & ATA_FLAG_SUSPENDED) {
4236 ap->flags &= ~ATA_FLAG_SUSPENDED;
4237 ata_set_mode(ap);
4238 }
4239 if (!ata_dev_present(dev))
4240 return 0;
4241 if (dev->class == ATA_DEV_ATA)
4242 ata_start_drive(ap, dev);
4243
4244 return 0;
4245}
4246
4247/**
4248 * ata_device_suspend - prepare a device for suspend
4249 *
4250 * Flush the cache on the drive, if appropriate, then issue a
4251 * standbynow command.
4252 *
4253 */
4254int ata_device_suspend(struct ata_port *ap, struct ata_device *dev)
4255{
4256 if (!ata_dev_present(dev))
4257 return 0;
4258 if (dev->class == ATA_DEV_ATA)
4259 ata_flush_cache(ap, dev);
4260
4261 ata_standby_drive(ap, dev);
4262 ap->flags |= ATA_FLAG_SUSPENDED;
4263 return 0;
4264}
4265
4157int ata_port_start (struct ata_port *ap) 4266int ata_port_start (struct ata_port *ap)
4158{ 4267{
4159 struct device *dev = ap->host_set->dev; 4268 struct device *dev = ap->host_set->dev;
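
Taken together, the suspend path flushes the write cache first (FLUSH CACHE EXT when the drive's ID data advertises it, plain FLUSH CACHE otherwise) and only then issues STANDBY IMMEDIATE; resume issues IDLE IMMEDIATE and restores the transfer mode before any I/O. A toy model of that ordering; the struct and names are invented, not libata API:

    #include <stdio.h>

    struct disk { int has_flush_ext; int suspended; };

    static void cmd(const char *name) { printf("issue %s\n", name); }

    static void disk_suspend(struct disk *d)
    {
        /* flush the volatile write cache before spinning down */
        cmd(d->has_flush_ext ? "FLUSH CACHE EXT" : "FLUSH CACHE");
        cmd("STANDBY IMMEDIATE");
        d->suspended = 1;
    }

    static void disk_resume(struct disk *d)
    {
        if (d->suspended) {
            d->suspended = 0;
            cmd("(renegotiate transfer mode)");
        }
        cmd("IDLE IMMEDIATE");
    }

    int main(void)
    {
        struct disk d = { 1, 0 };
        disk_suspend(&d);
        disk_resume(&d);
        return 0;
    }
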
@@ -4902,6 +5011,23 @@ int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits)
4902 5011
4903 return (tmp == bits->val) ? 1 : 0; 5012 return (tmp == bits->val) ? 1 : 0;
4904} 5013}
5014
5015int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t state)
5016{
5017 pci_save_state(pdev);
5018 pci_disable_device(pdev);
5019 pci_set_power_state(pdev, PCI_D3hot);
5020 return 0;
5021}
5022
5023int ata_pci_device_resume(struct pci_dev *pdev)
5024{
5025 pci_set_power_state(pdev, PCI_D0);
5026 pci_restore_state(pdev);
5027 pci_enable_device(pdev);
5028 pci_set_master(pdev);
5029 return 0;
5030}
4905#endif /* CONFIG_PCI */ 5031#endif /* CONFIG_PCI */
4906 5032
4907 5033
@@ -5005,4 +5131,11 @@ EXPORT_SYMBOL_GPL(ata_pci_host_stop);
5005EXPORT_SYMBOL_GPL(ata_pci_init_native_mode); 5131EXPORT_SYMBOL_GPL(ata_pci_init_native_mode);
5006EXPORT_SYMBOL_GPL(ata_pci_init_one); 5132EXPORT_SYMBOL_GPL(ata_pci_init_one);
5007EXPORT_SYMBOL_GPL(ata_pci_remove_one); 5133EXPORT_SYMBOL_GPL(ata_pci_remove_one);
5134EXPORT_SYMBOL_GPL(ata_pci_device_suspend);
5135EXPORT_SYMBOL_GPL(ata_pci_device_resume);
5008#endif /* CONFIG_PCI */ 5136#endif /* CONFIG_PCI */
5137
5138EXPORT_SYMBOL_GPL(ata_device_suspend);
5139EXPORT_SYMBOL_GPL(ata_device_resume);
5140EXPORT_SYMBOL_GPL(ata_scsi_device_suspend);
5141EXPORT_SYMBOL_GPL(ata_scsi_device_resume);
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index e0439be4b573..cfbceb504718 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -396,6 +396,22 @@ void ata_dump_status(unsigned id, struct ata_taskfile *tf)
396 } 396 }
397} 397}
398 398
399int ata_scsi_device_resume(struct scsi_device *sdev)
400{
401 struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
402 struct ata_device *dev = &ap->device[sdev->id];
403
404 return ata_device_resume(ap, dev);
405}
406
407int ata_scsi_device_suspend(struct scsi_device *sdev)
408{
409 struct ata_port *ap = (struct ata_port *) &sdev->host->hostdata[0];
410 struct ata_device *dev = &ap->device[sdev->id];
411
412 return ata_device_suspend(ap, dev);
413}
414
399/** 415/**
400 * ata_to_sense_error - convert ATA error to SCSI error 416 * ata_to_sense_error - convert ATA error to SCSI error
401 * @id: ATA device number 417 * @id: ATA device number
@@ -1080,11 +1096,13 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
1080 scsicmd[0] == WRITE_16) 1096 scsicmd[0] == WRITE_16)
1081 tf->flags |= ATA_TFLAG_WRITE; 1097 tf->flags |= ATA_TFLAG_WRITE;
1082 1098
1083 /* Calculate the SCSI LBA and transfer length. */ 1099 /* Calculate the SCSI LBA, transfer length and FUA. */
1084 switch (scsicmd[0]) { 1100 switch (scsicmd[0]) {
1085 case READ_10: 1101 case READ_10:
1086 case WRITE_10: 1102 case WRITE_10:
1087 scsi_10_lba_len(scsicmd, &block, &n_block); 1103 scsi_10_lba_len(scsicmd, &block, &n_block);
1104 if (unlikely(scsicmd[1] & (1 << 3)))
1105 tf->flags |= ATA_TFLAG_FUA;
1088 break; 1106 break;
1089 case READ_6: 1107 case READ_6:
1090 case WRITE_6: 1108 case WRITE_6:
@@ -1099,6 +1117,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
1099 case READ_16: 1117 case READ_16:
1100 case WRITE_16: 1118 case WRITE_16:
1101 scsi_16_lba_len(scsicmd, &block, &n_block); 1119 scsi_16_lba_len(scsicmd, &block, &n_block);
1120 if (unlikely(scsicmd[1] & (1 << 3)))
1121 tf->flags |= ATA_TFLAG_FUA;
1102 break; 1122 break;
1103 default: 1123 default:
1104 DPRINTK("no-byte command\n"); 1124 DPRINTK("no-byte command\n");
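
In both READ/WRITE(10) and READ/WRITE(16) CDBs, bit 3 of byte 1 is the FUA (force unit access) flag, which is exactly what the scsicmd[1] & (1 << 3) test checks; READ/WRITE(6) has no such field, so only these two cases can set ATA_TFLAG_FUA. A minimal demonstration using the standard WRITE(10) opcode:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint8_t cdb[10] = { 0 };

        cdb[0] = 0x2A;          /* WRITE(10) */
        cdb[1] |= 1 << 3;       /* FUA: write through the cache */

        /* same test as ata_scsi_rw_xlat() above */
        printf("FUA %s\n", (cdb[1] & (1 << 3)) ? "set" : "clear");
        return 0;
    }
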
@@ -1142,7 +1162,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
1142 tf->device |= (block >> 24) & 0xf; 1162 tf->device |= (block >> 24) & 0xf;
1143 } 1163 }
1144 1164
1145 ata_rwcmd_protocol(qc); 1165 if (unlikely(ata_rwcmd_protocol(qc) < 0))
1166 goto invalid_fld;
1146 1167
1147 qc->nsect = n_block; 1168 qc->nsect = n_block;
1148 tf->nsect = n_block & 0xff; 1169 tf->nsect = n_block & 0xff;
@@ -1160,7 +1181,8 @@ static unsigned int ata_scsi_rw_xlat(struct ata_queued_cmd *qc, const u8 *scsicm
1160 if ((block >> 28) || (n_block > 256)) 1181 if ((block >> 28) || (n_block > 256))
1161 goto out_of_range; 1182 goto out_of_range;
1162 1183
1163 ata_rwcmd_protocol(qc); 1184 if (unlikely(ata_rwcmd_protocol(qc) < 0))
1185 goto invalid_fld;
1164 1186
1165 /* Convert LBA to CHS */ 1187 /* Convert LBA to CHS */
1166 track = (u32)block / dev->sectors; 1188 track = (u32)block / dev->sectors;
@@ -1695,6 +1717,7 @@ static unsigned int ata_msense_rw_recovery(u8 **ptr_io, const u8 *last)
1695unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf, 1717unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
1696 unsigned int buflen) 1718 unsigned int buflen)
1697{ 1719{
1720 struct ata_device *dev = args->dev;
1698 u8 *scsicmd = args->cmd->cmnd, *p, *last; 1721 u8 *scsicmd = args->cmd->cmnd, *p, *last;
1699 const u8 sat_blk_desc[] = { 1722 const u8 sat_blk_desc[] = {
1700 0, 0, 0, 0, /* number of blocks: sat unspecified */ 1723 0, 0, 0, 0, /* number of blocks: sat unspecified */
@@ -1703,6 +1726,7 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
1703 }; 1726 };
1704 u8 pg, spg; 1727 u8 pg, spg;
1705 unsigned int ebd, page_control, six_byte, output_len, alloc_len, minlen; 1728 unsigned int ebd, page_control, six_byte, output_len, alloc_len, minlen;
1729 u8 dpofua;
1706 1730
1707 VPRINTK("ENTER\n"); 1731 VPRINTK("ENTER\n");
1708 1732
@@ -1771,9 +1795,17 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
1771 1795
1772 if (minlen < 1) 1796 if (minlen < 1)
1773 return 0; 1797 return 0;
1798
1799 dpofua = 0;
1800 if (ata_id_has_fua(args->id) && dev->flags & ATA_DFLAG_LBA48 &&
1801 (!(dev->flags & ATA_DFLAG_PIO) || dev->multi_count))
1802 dpofua = 1 << 4;
1803
1774 if (six_byte) { 1804 if (six_byte) {
1775 output_len--; 1805 output_len--;
1776 rbuf[0] = output_len; 1806 rbuf[0] = output_len;
1807 if (minlen > 2)
1808 rbuf[2] |= dpofua;
1777 if (ebd) { 1809 if (ebd) {
1778 if (minlen > 3) 1810 if (minlen > 3)
1779 rbuf[3] = sizeof(sat_blk_desc); 1811 rbuf[3] = sizeof(sat_blk_desc);
@@ -1786,6 +1818,8 @@ unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf,
1786 rbuf[0] = output_len >> 8; 1818 rbuf[0] = output_len >> 8;
1787 if (minlen > 1) 1819 if (minlen > 1)
1788 rbuf[1] = output_len; 1820 rbuf[1] = output_len;
1821 if (minlen > 3)
1822 rbuf[3] |= dpofua;
1789 if (ebd) { 1823 if (ebd) {
1790 if (minlen > 7) 1824 if (minlen > 7)
1791 rbuf[7] = sizeof(sat_blk_desc); 1825 rbuf[7] = sizeof(sat_blk_desc);
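
DPOFUA is bit 4 of the device-specific parameter in the mode parameter header: byte 2 of the 6-byte header and byte 3 of the 10-byte one, matching the two rbuf offsets patched above. It is advertised only when the drive can actually accept FUA reads and writes (LBA48, and not limited to single-sector PIO). A short decoding sketch, with the header bytes fabricated for illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* Offsets follow the MODE SENSE (6) and (10) header layouts. */
    static int dpofua6(const uint8_t *hdr)  { return (hdr[2] >> 4) & 1; }
    static int dpofua10(const uint8_t *hdr) { return (hdr[3] >> 4) & 1; }

    int main(void)
    {
        uint8_t h6[4]  = { 0, 0, 1 << 4, 0 };  /* DPOFUA set */
        uint8_t h10[8] = { 0, 0, 0, 1 << 4 };  /* DPOFUA set */

        printf("6-byte: %d, 10-byte: %d\n", dpofua6(h6), dpofua10(h10));
        return 0;
    }
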
@@ -2446,7 +2480,7 @@ int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
2446 if (xlat_func) 2480 if (xlat_func)
2447 ata_scsi_translate(ap, dev, cmd, done, xlat_func); 2481 ata_scsi_translate(ap, dev, cmd, done, xlat_func);
2448 else 2482 else
2449 ata_scsi_simulate(dev->id, cmd, done); 2483 ata_scsi_simulate(ap, dev, cmd, done);
2450 } else 2484 } else
2451 ata_scsi_translate(ap, dev, cmd, done, atapi_xlat); 2485 ata_scsi_translate(ap, dev, cmd, done, atapi_xlat);
2452 2486
@@ -2469,14 +2503,16 @@ out_unlock:
2469 * spin_lock_irqsave(host_set lock) 2503 * spin_lock_irqsave(host_set lock)
2470 */ 2504 */
2471 2505
2472void ata_scsi_simulate(u16 *id, 2506void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
2473 struct scsi_cmnd *cmd, 2507 struct scsi_cmnd *cmd,
2474 void (*done)(struct scsi_cmnd *)) 2508 void (*done)(struct scsi_cmnd *))
2475{ 2509{
2476 struct ata_scsi_args args; 2510 struct ata_scsi_args args;
2477 const u8 *scsicmd = cmd->cmnd; 2511 const u8 *scsicmd = cmd->cmnd;
2478 2512
2479 args.id = id; 2513 args.ap = ap;
2514 args.dev = dev;
2515 args.id = dev->id;
2480 args.cmd = cmd; 2516 args.cmd = cmd;
2481 args.done = done; 2517 args.done = done;
2482 2518
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index 251e53bdc6e0..e03ce48b7b4b 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -32,6 +32,8 @@
32#define DRV_VERSION "1.20" /* must be exactly four chars */ 32#define DRV_VERSION "1.20" /* must be exactly four chars */
33 33
34struct ata_scsi_args { 34struct ata_scsi_args {
35 struct ata_port *ap;
36 struct ata_device *dev;
35 u16 *id; 37 u16 *id;
36 struct scsi_cmnd *cmd; 38 struct scsi_cmnd *cmd;
37 void (*done)(struct scsi_cmnd *); 39 void (*done)(struct scsi_cmnd *);
@@ -41,7 +43,7 @@ struct ata_scsi_args {
41extern int atapi_enabled; 43extern int atapi_enabled;
42extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap, 44extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap,
43 struct ata_device *dev); 45 struct ata_device *dev);
44extern void ata_rwcmd_protocol(struct ata_queued_cmd *qc); 46extern int ata_rwcmd_protocol(struct ata_queued_cmd *qc);
45extern void ata_qc_free(struct ata_queued_cmd *qc); 47extern void ata_qc_free(struct ata_queued_cmd *qc);
46extern int ata_qc_issue(struct ata_queued_cmd *qc); 48extern int ata_qc_issue(struct ata_queued_cmd *qc);
47extern int ata_check_atapi_dma(struct ata_queued_cmd *qc); 49extern int ata_check_atapi_dma(struct ata_queued_cmd *qc);
diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c
index b2bf16a9bf4b..cd54244058b5 100644
--- a/drivers/scsi/sata_mv.c
+++ b/drivers/scsi/sata_mv.c
@@ -374,7 +374,6 @@ static struct scsi_host_template mv_sht = {
374 .dma_boundary = MV_DMA_BOUNDARY, 374 .dma_boundary = MV_DMA_BOUNDARY,
375 .slave_configure = ata_scsi_slave_config, 375 .slave_configure = ata_scsi_slave_config,
376 .bios_param = ata_std_bios_param, 376 .bios_param = ata_std_bios_param,
377 .ordered_flush = 1,
378}; 377};
379 378
380static const struct ata_port_operations mv5_ops = { 379static const struct ata_port_operations mv5_ops = {
diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c
index 4954896dfdb9..c0cf52cb975a 100644
--- a/drivers/scsi/sata_nv.c
+++ b/drivers/scsi/sata_nv.c
@@ -235,7 +235,6 @@ static struct scsi_host_template nv_sht = {
235 .dma_boundary = ATA_DMA_BOUNDARY, 235 .dma_boundary = ATA_DMA_BOUNDARY,
236 .slave_configure = ata_scsi_slave_config, 236 .slave_configure = ata_scsi_slave_config,
237 .bios_param = ata_std_bios_param, 237 .bios_param = ata_std_bios_param,
238 .ordered_flush = 1,
239}; 238};
240 239
241static const struct ata_port_operations nv_ops = { 240static const struct ata_port_operations nv_ops = {
diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c
index da7fa04b8a73..3d1ea09a06a1 100644
--- a/drivers/scsi/sata_promise.c
+++ b/drivers/scsi/sata_promise.c
@@ -114,7 +114,6 @@ static struct scsi_host_template pdc_ata_sht = {
114 .dma_boundary = ATA_DMA_BOUNDARY, 114 .dma_boundary = ATA_DMA_BOUNDARY,
115 .slave_configure = ata_scsi_slave_config, 115 .slave_configure = ata_scsi_slave_config,
116 .bios_param = ata_std_bios_param, 116 .bios_param = ata_std_bios_param,
117 .ordered_flush = 1,
118}; 117};
119 118
120static const struct ata_port_operations pdc_sata_ops = { 119static const struct ata_port_operations pdc_sata_ops = {
diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c
index d2053487c73b..b017f85e6d6a 100644
--- a/drivers/scsi/sata_sil.c
+++ b/drivers/scsi/sata_sil.c
@@ -147,7 +147,6 @@ static struct scsi_host_template sil_sht = {
147 .dma_boundary = ATA_DMA_BOUNDARY, 147 .dma_boundary = ATA_DMA_BOUNDARY,
148 .slave_configure = ata_scsi_slave_config, 148 .slave_configure = ata_scsi_slave_config,
149 .bios_param = ata_std_bios_param, 149 .bios_param = ata_std_bios_param,
150 .ordered_flush = 1,
151}; 150};
152 151
153static const struct ata_port_operations sil_ops = { 152static const struct ata_port_operations sil_ops = {
diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c
index a0ad3ed2200a..923130185a9e 100644
--- a/drivers/scsi/sata_sil24.c
+++ b/drivers/scsi/sata_sil24.c
@@ -292,7 +292,6 @@ static struct scsi_host_template sil24_sht = {
292 .dma_boundary = ATA_DMA_BOUNDARY, 292 .dma_boundary = ATA_DMA_BOUNDARY,
293 .slave_configure = ata_scsi_slave_config, 293 .slave_configure = ata_scsi_slave_config,
294 .bios_param = ata_std_bios_param, 294 .bios_param = ata_std_bios_param,
295 .ordered_flush = 1, /* NCQ not supported yet */
296}; 295};
297 296
298static const struct ata_port_operations sil24_ops = { 297static const struct ata_port_operations sil24_ops = {
diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c
index 32e12620b162..2df8c5632ac3 100644
--- a/drivers/scsi/sata_sis.c
+++ b/drivers/scsi/sata_sis.c
@@ -99,7 +99,6 @@ static struct scsi_host_template sis_sht = {
99 .dma_boundary = ATA_DMA_BOUNDARY, 99 .dma_boundary = ATA_DMA_BOUNDARY,
100 .slave_configure = ata_scsi_slave_config, 100 .slave_configure = ata_scsi_slave_config,
101 .bios_param = ata_std_bios_param, 101 .bios_param = ata_std_bios_param,
102 .ordered_flush = 1,
103}; 102};
104 103
105static const struct ata_port_operations sis_ops = { 104static const struct ata_port_operations sis_ops = {
diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c
index 6e7f7c83a75a..668373590aa4 100644
--- a/drivers/scsi/sata_svw.c
+++ b/drivers/scsi/sata_svw.c
@@ -303,7 +303,6 @@ static struct scsi_host_template k2_sata_sht = {
303 .proc_info = k2_sata_proc_info, 303 .proc_info = k2_sata_proc_info,
304#endif 304#endif
305 .bios_param = ata_std_bios_param, 305 .bios_param = ata_std_bios_param,
306 .ordered_flush = 1,
307}; 306};
308 307
309 308
diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c
index 94b253b80da8..bc87c16c80d2 100644
--- a/drivers/scsi/sata_sx4.c
+++ b/drivers/scsi/sata_sx4.c
@@ -194,7 +194,6 @@ static struct scsi_host_template pdc_sata_sht = {
194 .dma_boundary = ATA_DMA_BOUNDARY, 194 .dma_boundary = ATA_DMA_BOUNDARY,
195 .slave_configure = ata_scsi_slave_config, 195 .slave_configure = ata_scsi_slave_config,
196 .bios_param = ata_std_bios_param, 196 .bios_param = ata_std_bios_param,
197 .ordered_flush = 1,
198}; 197};
199 198
200static const struct ata_port_operations pdc_20621_ops = { 199static const struct ata_port_operations pdc_20621_ops = {
diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c
index b2422a0f25c8..9635ca700977 100644
--- a/drivers/scsi/sata_uli.c
+++ b/drivers/scsi/sata_uli.c
@@ -87,7 +87,6 @@ static struct scsi_host_template uli_sht = {
87 .dma_boundary = ATA_DMA_BOUNDARY, 87 .dma_boundary = ATA_DMA_BOUNDARY,
88 .slave_configure = ata_scsi_slave_config, 88 .slave_configure = ata_scsi_slave_config,
89 .bios_param = ata_std_bios_param, 89 .bios_param = ata_std_bios_param,
90 .ordered_flush = 1,
91}; 90};
92 91
93static const struct ata_port_operations uli_ops = { 92static const struct ata_port_operations uli_ops = {
diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c
index c76215692da2..6d5b0a794cfd 100644
--- a/drivers/scsi/sata_via.c
+++ b/drivers/scsi/sata_via.c
@@ -106,7 +106,6 @@ static struct scsi_host_template svia_sht = {
106 .dma_boundary = ATA_DMA_BOUNDARY, 106 .dma_boundary = ATA_DMA_BOUNDARY,
107 .slave_configure = ata_scsi_slave_config, 107 .slave_configure = ata_scsi_slave_config,
108 .bios_param = ata_std_bios_param, 108 .bios_param = ata_std_bios_param,
109 .ordered_flush = 1,
110}; 109};
111 110
112static const struct ata_port_operations svia_sata_ops = { 111static const struct ata_port_operations svia_sata_ops = {
diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c
index fcfa486965b4..2e2c3b7acb0c 100644
--- a/drivers/scsi/sata_vsc.c
+++ b/drivers/scsi/sata_vsc.c
@@ -235,7 +235,6 @@ static struct scsi_host_template vsc_sata_sht = {
235 .dma_boundary = ATA_DMA_BOUNDARY, 235 .dma_boundary = ATA_DMA_BOUNDARY,
236 .slave_configure = ata_scsi_slave_config, 236 .slave_configure = ata_scsi_slave_config,
237 .bios_param = ata_std_bios_param, 237 .bios_param = ata_std_bios_param,
238 .ordered_flush = 1,
239}; 238};
240 239
241 240
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a7f3f0c84db7..ba93d6e66d48 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -308,7 +308,7 @@ struct scsi_io_context {
308 308
309static kmem_cache_t *scsi_io_context_cache; 309static kmem_cache_t *scsi_io_context_cache;
310 310
311static void scsi_end_async(struct request *req) 311static void scsi_end_async(struct request *req, int uptodate)
312{ 312{
313 struct scsi_io_context *sioc = req->end_io_data; 313 struct scsi_io_context *sioc = req->end_io_data;
314 314
@@ -791,7 +791,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
791 spin_lock_irqsave(q->queue_lock, flags); 791 spin_lock_irqsave(q->queue_lock, flags);
792 if (blk_rq_tagged(req)) 792 if (blk_rq_tagged(req))
793 blk_queue_end_tag(q, req); 793 blk_queue_end_tag(q, req);
794 end_that_request_last(req); 794 end_that_request_last(req, uptodate);
795 spin_unlock_irqrestore(q->queue_lock, flags); 795 spin_unlock_irqrestore(q->queue_lock, flags);
796 796
797 /* 797 /*
@@ -932,9 +932,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes,
932 int sense_valid = 0; 932 int sense_valid = 0;
933 int sense_deferred = 0; 933 int sense_deferred = 0;
934 934
935 if (blk_complete_barrier_rq(q, req, good_bytes >> 9))
936 return;
937
938 /* 935 /*
939 * Free up any indirection buffers we allocated for DMA purposes. 936 * Free up any indirection buffers we allocated for DMA purposes.
940 * For the case of a READ, we need to copy the data out of the 937 * For the case of a READ, we need to copy the data out of the
@@ -1199,38 +1196,6 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
1199 return BLKPREP_KILL; 1196 return BLKPREP_KILL;
1200} 1197}
1201 1198
1202static int scsi_prepare_flush_fn(request_queue_t *q, struct request *rq)
1203{
1204 struct scsi_device *sdev = q->queuedata;
1205 struct scsi_driver *drv;
1206
1207 if (sdev->sdev_state == SDEV_RUNNING) {
1208 drv = *(struct scsi_driver **) rq->rq_disk->private_data;
1209
1210 if (drv->prepare_flush)
1211 return drv->prepare_flush(q, rq);
1212 }
1213
1214 return 0;
1215}
1216
1217static void scsi_end_flush_fn(request_queue_t *q, struct request *rq)
1218{
1219 struct scsi_device *sdev = q->queuedata;
1220 struct request *flush_rq = rq->end_io_data;
1221 struct scsi_driver *drv;
1222
1223 if (flush_rq->errors) {
1224 printk("scsi: barrier error, disabling flush support\n");
1225 blk_queue_ordered(q, QUEUE_ORDERED_NONE);
1226 }
1227
1228 if (sdev->sdev_state == SDEV_RUNNING) {
1229 drv = *(struct scsi_driver **) rq->rq_disk->private_data;
1230 drv->end_flush(q, rq);
1231 }
1232}
1233
1234static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk, 1199static int scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
1235 sector_t *error_sector) 1200 sector_t *error_sector)
1236{ 1201{
@@ -1703,17 +1668,6 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
1703 blk_queue_segment_boundary(q, shost->dma_boundary); 1668 blk_queue_segment_boundary(q, shost->dma_boundary);
1704 blk_queue_issue_flush_fn(q, scsi_issue_flush_fn); 1669 blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);
1705 1670
1706 /*
1707 * ordered tags are superior to flush ordering
1708 */
1709 if (shost->ordered_tag)
1710 blk_queue_ordered(q, QUEUE_ORDERED_TAG);
1711 else if (shost->ordered_flush) {
1712 blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
1713 q->prepare_flush_fn = scsi_prepare_flush_fn;
1714 q->end_flush_fn = scsi_end_flush_fn;
1715 }
1716
1717 if (!shost->use_clustering) 1671 if (!shost->use_clustering)
1718 clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); 1672 clear_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
1719 return q; 1673 return q;
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 15842b1f0f4a..ea7f3a433572 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -263,9 +263,40 @@ static int scsi_bus_match(struct device *dev, struct device_driver *gendrv)
263 return (sdp->inq_periph_qual == SCSI_INQ_PQ_CON)? 1: 0; 263 return (sdp->inq_periph_qual == SCSI_INQ_PQ_CON)? 1: 0;
264} 264}
265 265
266static int scsi_bus_suspend(struct device * dev, pm_message_t state)
267{
268 struct scsi_device *sdev = to_scsi_device(dev);
269 struct scsi_host_template *sht = sdev->host->hostt;
270 int err;
271
272 err = scsi_device_quiesce(sdev);
273 if (err)
274 return err;
275
276 if (sht->suspend)
277 err = sht->suspend(sdev);
278
279 return err;
280}
281
282static int scsi_bus_resume(struct device * dev)
283{
284 struct scsi_device *sdev = to_scsi_device(dev);
285 struct scsi_host_template *sht = sdev->host->hostt;
286 int err = 0;
287
288 if (sht->resume)
289 err = sht->resume(sdev);
290
291 scsi_device_resume(sdev);
292 return err;
293}
294
266struct bus_type scsi_bus_type = { 295struct bus_type scsi_bus_type = {
267 .name = "scsi", 296 .name = "scsi",
268 .match = scsi_bus_match, 297 .match = scsi_bus_match,
298 .suspend = scsi_bus_suspend,
299 .resume = scsi_bus_resume,
269}; 300};
270 301
271int scsi_sysfs_register(void) 302int scsi_sysfs_register(void)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3d3ad7d1b779..32d4d8d7b9f3 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -102,6 +102,7 @@ struct scsi_disk {
102 u8 write_prot; 102 u8 write_prot;
103 unsigned WCE : 1; /* state of disk WCE bit */ 103 unsigned WCE : 1; /* state of disk WCE bit */
104 unsigned RCD : 1; /* state of disk RCD bit, unused */ 104 unsigned RCD : 1; /* state of disk RCD bit, unused */
105 unsigned DPOFUA : 1; /* state of disk DPOFUA bit */
105}; 106};
106 107
107static DEFINE_IDR(sd_index_idr); 108static DEFINE_IDR(sd_index_idr);
@@ -121,8 +122,7 @@ static void sd_shutdown(struct device *dev);
121static void sd_rescan(struct device *); 122static void sd_rescan(struct device *);
122static int sd_init_command(struct scsi_cmnd *); 123static int sd_init_command(struct scsi_cmnd *);
123static int sd_issue_flush(struct device *, sector_t *); 124static int sd_issue_flush(struct device *, sector_t *);
124static void sd_end_flush(request_queue_t *, struct request *); 125static void sd_prepare_flush(request_queue_t *, struct request *);
125static int sd_prepare_flush(request_queue_t *, struct request *);
126static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname, 126static void sd_read_capacity(struct scsi_disk *sdkp, char *diskname,
127 unsigned char *buffer); 127 unsigned char *buffer);
128 128
@@ -137,8 +137,6 @@ static struct scsi_driver sd_template = {
137 .rescan = sd_rescan, 137 .rescan = sd_rescan,
138 .init_command = sd_init_command, 138 .init_command = sd_init_command,
139 .issue_flush = sd_issue_flush, 139 .issue_flush = sd_issue_flush,
140 .prepare_flush = sd_prepare_flush,
141 .end_flush = sd_end_flush,
142}; 140};
143 141
144/* 142/*
@@ -346,6 +344,7 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
346 344
347 if (block > 0xffffffff) { 345 if (block > 0xffffffff) {
348 SCpnt->cmnd[0] += READ_16 - READ_6; 346 SCpnt->cmnd[0] += READ_16 - READ_6;
347 SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;
349 SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0; 348 SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;
350 SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0; 349 SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0;
351 SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0; 350 SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0;
@@ -365,6 +364,7 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
365 this_count = 0xffff; 364 this_count = 0xffff;
366 365
367 SCpnt->cmnd[0] += READ_10 - READ_6; 366 SCpnt->cmnd[0] += READ_10 - READ_6;
367 SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;
368 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; 368 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
369 SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff; 369 SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff;
370 SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff; 370 SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff;
@@ -373,6 +373,17 @@ static int sd_init_command(struct scsi_cmnd * SCpnt)
373 SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff; 373 SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff;
374 SCpnt->cmnd[8] = (unsigned char) this_count & 0xff; 374 SCpnt->cmnd[8] = (unsigned char) this_count & 0xff;
375 } else { 375 } else {
376 if (unlikely(blk_fua_rq(rq))) {
377 /*
378 * This happens only if this drive failed
379 * a 10-byte rw command with ILLEGAL_REQUEST
380 * during operation and thus turned off
381 * use_10_for_rw.
382 */
383 printk(KERN_ERR "sd: FUA write on READ/WRITE(6) drive\n");
384 return 0;
385 }
386
376 SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f); 387 SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);
377 SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff); 388 SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff);
378 SCpnt->cmnd[3] = (unsigned char) block & 0xff; 389 SCpnt->cmnd[3] = (unsigned char) block & 0xff;
@@ -729,42 +740,13 @@ static int sd_issue_flush(struct device *dev, sector_t *error_sector)
729 return ret; 740 return ret;
730} 741}
731 742
732static void sd_end_flush(request_queue_t *q, struct request *flush_rq) 743static void sd_prepare_flush(request_queue_t *q, struct request *rq)
733{
734 struct request *rq = flush_rq->end_io_data;
735 struct scsi_cmnd *cmd = rq->special;
736 unsigned int bytes = rq->hard_nr_sectors << 9;
737
738 if (!flush_rq->errors) {
739 spin_unlock(q->queue_lock);
740 scsi_io_completion(cmd, bytes, 0);
741 spin_lock(q->queue_lock);
742 } else if (blk_barrier_postflush(rq)) {
743 spin_unlock(q->queue_lock);
744 scsi_io_completion(cmd, 0, bytes);
745 spin_lock(q->queue_lock);
746 } else {
747 /*
748 * force journal abort of barriers
749 */
750 end_that_request_first(rq, -EOPNOTSUPP, rq->hard_nr_sectors);
751 end_that_request_last(rq);
752 }
753}
754
755static int sd_prepare_flush(request_queue_t *q, struct request *rq)
756{ 744{
757 struct scsi_device *sdev = q->queuedata;
758 struct scsi_disk *sdkp = dev_get_drvdata(&sdev->sdev_gendev);
759
760 if (!sdkp || !sdkp->WCE)
761 return 0;
762
763 memset(rq->cmd, 0, sizeof(rq->cmd)); 745 memset(rq->cmd, 0, sizeof(rq->cmd));
764 rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER; 746 rq->flags |= REQ_BLOCK_PC;
765 rq->timeout = SD_TIMEOUT; 747 rq->timeout = SD_TIMEOUT;
766 rq->cmd[0] = SYNCHRONIZE_CACHE; 748 rq->cmd[0] = SYNCHRONIZE_CACHE;
767 return 1; 749 rq->cmd_len = 10;
768} 750}
769 751
770static void sd_rescan(struct device *dev) 752static void sd_rescan(struct device *dev)
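
The rewritten sd_prepare_flush() turns a barrier into a plain 10-byte SYNCHRONIZE CACHE command; with the LBA and block-count fields left zero, the drive flushes its entire write cache. Roughly the CDB that results (opcode per the SBC spec):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* SYNCHRONIZE CACHE (10): zero LBA + zero count = whole disk */
        uint8_t cmd[10] = { 0 };
        cmd[0] = 0x35;

        for (int i = 0; i < 10; i++)
            printf("%02x ", cmd[i]);
        printf("\n");
        return 0;
    }
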
@@ -1427,10 +1409,18 @@ sd_read_cache_type(struct scsi_disk *sdkp, char *diskname,
1427 sdkp->RCD = 0; 1409 sdkp->RCD = 0;
1428 } 1410 }
1429 1411
1412 sdkp->DPOFUA = (data.device_specific & 0x10) != 0;
1413 if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw) {
1414 printk(KERN_NOTICE "SCSI device %s: uses "
1415 "READ/WRITE(6), disabling FUA\n", diskname);
1416 sdkp->DPOFUA = 0;
1417 }
1418
1430 ct = sdkp->RCD + 2*sdkp->WCE; 1419 ct = sdkp->RCD + 2*sdkp->WCE;
1431 1420
1432 printk(KERN_NOTICE "SCSI device %s: drive cache: %s\n", 1421 printk(KERN_NOTICE "SCSI device %s: drive cache: %s%s\n",
1433 diskname, types[ct]); 1422 diskname, types[ct],
1423 sdkp->DPOFUA ? " w/ FUA" : "");
1434 1424
1435 return; 1425 return;
1436 } 1426 }
@@ -1462,6 +1452,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
1462 struct scsi_disk *sdkp = scsi_disk(disk); 1452 struct scsi_disk *sdkp = scsi_disk(disk);
1463 struct scsi_device *sdp = sdkp->device; 1453 struct scsi_device *sdp = sdkp->device;
1464 unsigned char *buffer; 1454 unsigned char *buffer;
1455 unsigned ordered;
1465 1456
1466 SCSI_LOG_HLQUEUE(3, printk("sd_revalidate_disk: disk=%s\n", disk->disk_name)); 1457 SCSI_LOG_HLQUEUE(3, printk("sd_revalidate_disk: disk=%s\n", disk->disk_name));
1467 1458
@@ -1498,7 +1489,21 @@ static int sd_revalidate_disk(struct gendisk *disk)
1498 sd_read_write_protect_flag(sdkp, disk->disk_name, buffer); 1489 sd_read_write_protect_flag(sdkp, disk->disk_name, buffer);
1499 sd_read_cache_type(sdkp, disk->disk_name, buffer); 1490 sd_read_cache_type(sdkp, disk->disk_name, buffer);
1500 } 1491 }
1501 1492
1493 /*
1494 * We now have all cache related info, determine how we deal
1495 * with ordered requests. Note that as the current SCSI
1496 * dispatch function can alter request order, we cannot use
1497 * QUEUE_ORDERED_TAG_* even when ordered tag is supported.
1498 */
1499 if (sdkp->WCE)
1500 ordered = sdkp->DPOFUA
1501 ? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH;
1502 else
1503 ordered = QUEUE_ORDERED_DRAIN;
1504
1505 blk_queue_ordered(sdkp->disk->queue, ordered, sd_prepare_flush);
1506
1502 set_capacity(disk, sdkp->capacity); 1507 set_capacity(disk, sdkp->capacity);
1503 kfree(buffer); 1508 kfree(buffer);
1504 1509
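
The choice collapses to a two-bit table: with no write cache, draining the queue is enough to order requests; with a write cache, the barrier needs a post-flush, unless DPOFUA lets the barrier write itself bypass the cache. A sketch of that decision (mode names as in the hunk above; the function itself is invented):

    #include <stdio.h>

    static const char *ordered_mode(int wce, int dpofua)
    {
        if (!wce)
            return "QUEUE_ORDERED_DRAIN";
        return dpofua ? "QUEUE_ORDERED_DRAIN_FUA"
                      : "QUEUE_ORDERED_DRAIN_FLUSH";
    }

    int main(void)
    {
        for (int wce = 0; wce < 2; wce++)
            for (int dpofua = 0; dpofua < 2; dpofua++)
                printf("WCE=%d DPOFUA=%d -> %s\n",
                       wce, dpofua, ordered_mode(wce, dpofua));
        return 0;
    }
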
@@ -1598,6 +1603,7 @@ static int sd_probe(struct device *dev)
1598 strcpy(gd->devfs_name, sdp->devfs_name); 1603 strcpy(gd->devfs_name, sdp->devfs_name);
1599 1604
1600 gd->private_data = &sdkp->driver; 1605 gd->private_data = &sdkp->driver;
1606 gd->queue = sdkp->device->request_queue;
1601 1607
1602 sd_revalidate_disk(gd); 1608 sd_revalidate_disk(gd);
1603 1609
@@ -1605,7 +1611,6 @@ static int sd_probe(struct device *dev)
1605 gd->flags = GENHD_FL_DRIVERFS; 1611 gd->flags = GENHD_FL_DRIVERFS;
1606 if (sdp->removable) 1612 if (sdp->removable)
1607 gd->flags |= GENHD_FL_REMOVABLE; 1613 gd->flags |= GENHD_FL_REMOVABLE;
1608 gd->queue = sdkp->device->request_queue;
1609 1614
1610 dev_set_drvdata(dev, sdkp); 1615 dev_set_drvdata(dev, sdkp);
1611 add_disk(gd); 1616 add_disk(gd);
diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index b8727d9bf690..1288d6203e94 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -37,11 +37,11 @@
37 * by the bootloader or in the platform init code. 37 * by the bootloader or in the platform init code.
38 * 38 *
39 * The idx field must be equal to the PSC index ( e.g. 0 for PSC1, 1 for PSC2, 39 * The idx field must be equal to the PSC index ( e.g. 0 for PSC1, 1 for PSC2,
40 * and so on). So the PSC1 is mapped to /dev/ttyS0, PSC2 to /dev/ttyS1 and so 40 * and so on). So the PSC1 is mapped to /dev/ttyPSC0, PSC2 to /dev/ttyPSC1 and
41 * on. But be warned, it's an ABSOLUTE REQUIREMENT ! This is needed mainly for 41 * so on. But be warned, it's an ABSOLUTE REQUIREMENT ! This is needed mainly
42 * the console code : without this 1:1 mapping, at early boot time, when we are 42 * for the console code : without this 1:1 mapping, at early boot time, when we
43 * parsing the kernel args console=ttyS?, we wouldn't know which PSC it will be 43 * are parsing the kernel args console=ttyPSC?, we wouldn't know which PSC it
44 * mapped to. 44 * will be mapped to.
45 */ 45 */
46 46
47#include <linux/config.h> 47#include <linux/config.h>
@@ -65,6 +65,10 @@
65#include <linux/serial_core.h> 65#include <linux/serial_core.h>
66 66
67 67
68/* We've been assigned a range on the "Low-density serial ports" major */
69#define SERIAL_PSC_MAJOR 204
70#define SERIAL_PSC_MINOR 148
71
68 72
69#define ISR_PASS_LIMIT 256 /* Max number of iteration in the interrupt */ 73#define ISR_PASS_LIMIT 256 /* Max number of iteration in the interrupt */
70 74
@@ -668,15 +672,15 @@ mpc52xx_console_setup(struct console *co, char *options)
668} 672}
669 673
670 674
671extern struct uart_driver mpc52xx_uart_driver; 675static struct uart_driver mpc52xx_uart_driver;
672 676
673static struct console mpc52xx_console = { 677static struct console mpc52xx_console = {
674 .name = "ttyS", 678 .name = "ttyPSC",
675 .write = mpc52xx_console_write, 679 .write = mpc52xx_console_write,
676 .device = uart_console_device, 680 .device = uart_console_device,
677 .setup = mpc52xx_console_setup, 681 .setup = mpc52xx_console_setup,
678 .flags = CON_PRINTBUFFER, 682 .flags = CON_PRINTBUFFER,
679 .index = -1, /* Specified on the cmdline (e.g. console=ttyS0 ) */ 683 .index = -1, /* Specified on the cmdline (e.g. console=ttyPSC0 ) */
680 .data = &mpc52xx_uart_driver, 684 .data = &mpc52xx_uart_driver,
681}; 685};
682 686
@@ -703,10 +707,10 @@ console_initcall(mpc52xx_console_init);
703static struct uart_driver mpc52xx_uart_driver = { 707static struct uart_driver mpc52xx_uart_driver = {
704 .owner = THIS_MODULE, 708 .owner = THIS_MODULE,
705 .driver_name = "mpc52xx_psc_uart", 709 .driver_name = "mpc52xx_psc_uart",
706 .dev_name = "ttyS", 710 .dev_name = "ttyPSC",
707 .devfs_name = "ttyS", 711 .devfs_name = "ttyPSC",
708 .major = TTY_MAJOR, 712 .major = SERIAL_PSC_MAJOR,
709 .minor = 64, 713 .minor = SERIAL_PSC_MINOR,
710 .nr = MPC52xx_PSC_MAXNUM, 714 .nr = MPC52xx_PSC_MAXNUM,
711 .cons = MPC52xx_PSC_CONSOLE, 715 .cons = MPC52xx_PSC_CONSOLE,
712}; 716};
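
With the driver moved off the ttyS major, systems with a static /dev need fresh nodes at the "low-density serial ports" major 204, minor base 148, one per PSC index. A user-space sketch of creating the first node (the path is illustrative; udev/devfs setups need nothing):

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>

    int main(void)
    {
        /* /dev/ttyPSC0 = char 204:148, per the defines above */
        if (mknod("/dev/ttyPSC0", S_IFCHR | 0660, makedev(204, 148)) != 0)
            perror("mknod");
        return 0;
    }
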
diff --git a/fs/Kconfig b/fs/Kconfig
index d5255e627b5f..382e3b2883d5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -70,6 +70,7 @@ config FS_XIP
70 70
71config EXT3_FS 71config EXT3_FS
72 tristate "Ext3 journalling file system support" 72 tristate "Ext3 journalling file system support"
73 select JBD
73 help 74 help
74 This is the journaling version of the Second extended file system 75 This is the journaling version of the Second extended file system
75 (often called ext3), the de facto standard Linux file system 76 (often called ext3), the de facto standard Linux file system
@@ -138,23 +139,20 @@ config EXT3_FS_SECURITY
138 extended attributes for file security labels, say N. 139 extended attributes for file security labels, say N.
139 140
140config JBD 141config JBD
141# CONFIG_JBD could be its own option (even modular), but until there are
142# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
143# dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
144 tristate 142 tristate
145 default EXT3_FS
146 help 143 help
147 This is a generic journaling layer for block devices. It is 144 This is a generic journaling layer for block devices. It is
148 currently used by the ext3 file system, but it could also be used to 145 currently used by the ext3 and OCFS2 file systems, but it could
149 add journal support to other file systems or block devices such as 146 also be used to add journal support to other file systems or block
150 RAID or LVM. 147 devices such as RAID or LVM.
151 148
152 If you are using the ext3 file system, you need to say Y here. If 149 If you are using the ext3 or OCFS2 file systems, you need to
153 you are not using ext3 then you will probably want to say N. 150 say Y here. If you are not using ext3 or OCFS2 then you will probably
151 want to say N.
154 152
155 To compile this device as a module, choose M here: the module will be 153 To compile this device as a module, choose M here: the module will be
156 called jbd. If you are compiling ext3 into the kernel, you cannot 154 called jbd. If you are compiling ext3 or OCFS2 into the kernel,
157 compile this code as a module. 155 you cannot compile this code as a module.
158 156
159config JBD_DEBUG 157config JBD_DEBUG
160 bool "JBD (ext3) debugging support" 158 bool "JBD (ext3) debugging support"
@@ -326,6 +324,38 @@ config FS_POSIX_ACL
326 324
327source "fs/xfs/Kconfig" 325source "fs/xfs/Kconfig"
328 326
327config OCFS2_FS
328 tristate "OCFS2 file system support (EXPERIMENTAL)"
329 depends on NET && EXPERIMENTAL
330 select CONFIGFS_FS
331 select JBD
332 select CRC32
333 select INET
334 help
335 OCFS2 is a general-purpose, extent-based shared-disk cluster file
336 system with many similarities to ext3. It supports 64 bit inode
337 numbers, and has automatically extending metadata groups which may
338 also make it attractive for non-clustered use.
339
340 You'll want to install the ocfs2-tools package in order to at least
341 get "mount.ocfs2".
342
343 Project web page: http://oss.oracle.com/projects/ocfs2
344 Tools web page: http://oss.oracle.com/projects/ocfs2-tools
345 OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
346
347 Note: Features which OCFS2 does not support yet:
348 - extended attributes
349 - shared writeable mmap
350 - loopback is supported, but data written will not
351 be cluster coherent.
352 - quotas
353 - cluster aware flock
354 - Directory change notification (F_NOTIFY)
355 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
356 - POSIX ACLs
357 - readpages / writepages (not user visible)
358
329config MINIX_FS 359config MINIX_FS
330 tristate "Minix fs support" 360 tristate "Minix fs support"
331 help 361 help
@@ -841,6 +871,20 @@ config RELAYFS_FS
841 871
842 If unsure, say N. 872 If unsure, say N.
843 873
874config CONFIGFS_FS
875 tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
876 depends on EXPERIMENTAL
877 help
878 configfs is a ram-based filesystem that provides the converse
879 of sysfs's functionality. Where sysfs is a filesystem-based
880 view of kernel objects, configfs is a filesystem-based manager
881 of kernel objects, or config_items.
882
883 Both sysfs and configfs can and should exist together on the
884 same system. One is not a replacement for the other.
885
886 If unsure, say N.
887
844endmenu 888endmenu
845 889
846menu "Miscellaneous filesystems" 890menu "Miscellaneous filesystems"
diff --git a/fs/Makefile b/fs/Makefile
index 4c2655759078..73676111ebbe 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -101,3 +101,5 @@ obj-$(CONFIG_BEFS_FS) += befs/
101obj-$(CONFIG_HOSTFS) += hostfs/ 101obj-$(CONFIG_HOSTFS) += hostfs/
102obj-$(CONFIG_HPPFS) += hppfs/ 102obj-$(CONFIG_HPPFS) += hppfs/
103obj-$(CONFIG_DEBUG_FS) += debugfs/ 103obj-$(CONFIG_DEBUG_FS) += debugfs/
104obj-$(CONFIG_CONFIGFS_FS) += configfs/
105obj-$(CONFIG_OCFS2_FS) += ocfs2/
diff --git a/fs/bio.c b/fs/bio.c
index 38d3e8023a07..dfe242a21eb4 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -325,10 +325,31 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
325 if (unlikely(bio_flagged(bio, BIO_CLONED))) 325 if (unlikely(bio_flagged(bio, BIO_CLONED)))
326 return 0; 326 return 0;
327 327
328 if (bio->bi_vcnt >= bio->bi_max_vecs) 328 if (((bio->bi_size + len) >> 9) > max_sectors)
329 return 0; 329 return 0;
330 330
331 if (((bio->bi_size + len) >> 9) > max_sectors) 331 /*
332 * For filesystems with a blocksize smaller than the pagesize
333 * we will often be called with the same page as last time and
334 * a consecutive offset. Optimize this special case.
335 */
336 if (bio->bi_vcnt > 0) {
337 struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
338
339 if (page == prev->bv_page &&
340 offset == prev->bv_offset + prev->bv_len) {
341 prev->bv_len += len;
342 if (q->merge_bvec_fn &&
343 q->merge_bvec_fn(q, bio, prev) < len) {
344 prev->bv_len -= len;
345 return 0;
346 }
347
348 goto done;
349 }
350 }
351
352 if (bio->bi_vcnt >= bio->bi_max_vecs)
332 return 0; 353 return 0;
333 354
334 /* 355 /*
@@ -382,6 +403,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
382 bio->bi_vcnt++; 403 bio->bi_vcnt++;
383 bio->bi_phys_segments++; 404 bio->bi_phys_segments++;
384 bio->bi_hw_segments++; 405 bio->bi_hw_segments++;
406 done:
385 bio->bi_size += len; 407 bio->bi_size += len;
386 return len; 408 return len;
387} 409}
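
For filesystems whose block size is smaller than the page size, consecutive __bio_add_page() calls usually pass the same page at adjacent offsets; rather than consuming a bio_vec per block, the new fast path grows the previous vec in place (subject to the queue's merge_bvec_fn, omitted here). A user-space model of the coalescing:

    #include <stdio.h>

    struct vec { int page; unsigned int off, len; };

    /* Toy version of the merge test above (no queue limits). */
    static void add_page(struct vec *v, int *cnt, int page,
                         unsigned int len, unsigned int off)
    {
        if (*cnt > 0) {
            struct vec *prev = &v[*cnt - 1];
            if (page == prev->page && off == prev->off + prev->len) {
                prev->len += len;       /* coalesce with previous vec */
                return;
            }
        }
        v[(*cnt)++] = (struct vec){ page, off, len };
    }

    int main(void)
    {
        struct vec v[8];
        int cnt = 0;

        /* eight 512-byte blocks from one 4K page -> a single vec */
        for (unsigned int off = 0; off < 4096; off += 512)
            add_page(v, &cnt, 1, 512, off);
        printf("%d vec(s), len %u\n", cnt, v[0].len);
        return 0;
    }
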
diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile
new file mode 100644
index 000000000000..00ffb278e98c
--- /dev/null
+++ b/fs/configfs/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the configfs virtual filesystem
3#
4
5obj-$(CONFIG_CONFIGFS_FS) += configfs.o
6
7configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
new file mode 100644
index 000000000000..8899d9c5f6bf
--- /dev/null
+++ b/fs/configfs/configfs_internal.h
@@ -0,0 +1,142 @@
1/* -*- mode: c; c-basic-offset:8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * configfs_internal.h - Internal stuff for configfs
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#include <linux/slab.h>
28#include <linux/list.h>
29
30struct configfs_dirent {
31 atomic_t s_count;
32 struct list_head s_sibling;
33 struct list_head s_children;
34 struct list_head s_links;
35 void * s_element;
36 int s_type;
37 umode_t s_mode;
38 struct dentry * s_dentry;
39};
40
41#define CONFIGFS_ROOT 0x0001
42#define CONFIGFS_DIR 0x0002
43#define CONFIGFS_ITEM_ATTR 0x0004
44#define CONFIGFS_ITEM_LINK 0x0020
45#define CONFIGFS_USET_DIR 0x0040
46#define CONFIGFS_USET_DEFAULT 0x0080
47#define CONFIGFS_USET_DROPPING 0x0100
48#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
49
50extern struct vfsmount * configfs_mount;
51
52extern int configfs_is_root(struct config_item *item);
53
54extern struct inode * configfs_new_inode(mode_t mode);
55extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
56
57extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
58extern int configfs_make_dirent(struct configfs_dirent *,
59 struct dentry *, void *, umode_t, int);
60
61extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
62extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
63
64extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
65extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
66
67extern int configfs_pin_fs(void);
68extern void configfs_release_fs(void);
69
70extern struct rw_semaphore configfs_rename_sem;
71extern struct super_block * configfs_sb;
72extern struct file_operations configfs_dir_operations;
73extern struct file_operations configfs_file_operations;
74extern struct file_operations bin_fops;
75extern struct inode_operations configfs_dir_inode_operations;
76extern struct inode_operations configfs_symlink_inode_operations;
77
78extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
79 const char *symname);
80extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
81
82struct configfs_symlink {
83 struct list_head sl_list;
84 struct config_item *sl_target;
85};
86
87extern int configfs_create_link(struct configfs_symlink *sl,
88 struct dentry *parent,
89 struct dentry *dentry);
90
91static inline struct config_item * to_item(struct dentry * dentry)
92{
93 struct configfs_dirent * sd = dentry->d_fsdata;
94 return ((struct config_item *) sd->s_element);
95}
96
97static inline struct configfs_attribute * to_attr(struct dentry * dentry)
98{
99 struct configfs_dirent * sd = dentry->d_fsdata;
100 return ((struct configfs_attribute *) sd->s_element);
101}
102
103static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
104{
105 struct config_item * item = NULL;
106
107 spin_lock(&dcache_lock);
108 if (!d_unhashed(dentry)) {
109 struct configfs_dirent * sd = dentry->d_fsdata;
110 if (sd->s_type & CONFIGFS_ITEM_LINK) {
111 struct configfs_symlink * sl = sd->s_element;
112 item = config_item_get(sl->sl_target);
113 } else
114 item = config_item_get(sd->s_element);
115 }
116 spin_unlock(&dcache_lock);
117
118 return item;
119}
120
121static inline void release_configfs_dirent(struct configfs_dirent * sd)
122{
123 if (!(sd->s_type & CONFIGFS_ROOT))
124 kfree(sd);
125}
126
127static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
128{
129 if (sd) {
130 WARN_ON(!atomic_read(&sd->s_count));
131 atomic_inc(&sd->s_count);
132 }
133 return sd;
134}
135
136static inline void configfs_put(struct configfs_dirent * sd)
137{
138 WARN_ON(!atomic_read(&sd->s_count));
139 if (atomic_dec_and_test(&sd->s_count))
140 release_configfs_dirent(sd);
141}
142
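
configfs_get() and configfs_put() are a plain reference count on configfs_dirent: get warns if the count has already reached zero, put frees on the final drop, and the statically allocated root is exempt from kfree(). The same pattern in miniature (non-atomic, user-space; not the kernel code):

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct dirent_ref { int count; int is_root; };

    static struct dirent_ref *get_ref(struct dirent_ref *sd)
    {
        assert(sd->count > 0);  /* never resurrect a dead entry */
        sd->count++;
        return sd;
    }

    static void put_ref(struct dirent_ref *sd)
    {
        assert(sd->count > 0);
        if (--sd->count == 0 && !sd->is_root)
            free(sd);
    }

    int main(void)
    {
        struct dirent_ref *sd = calloc(1, sizeof(*sd));
        sd->count = 1;          /* creation reference */
        put_ref(get_ref(sd));   /* borrow and release */
        put_ref(sd);            /* final drop frees it */
        return 0;
    }
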
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
new file mode 100644
index 000000000000..e48b539243a1
--- /dev/null
+++ b/fs/configfs/dir.c
@@ -0,0 +1,1102 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.c - Operations for configfs directories.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#undef DEBUG
28
29#include <linux/fs.h>
30#include <linux/mount.h>
31#include <linux/module.h>
32#include <linux/slab.h>
33
34#include <linux/configfs.h>
35#include "configfs_internal.h"
36
37DECLARE_RWSEM(configfs_rename_sem);
38
39static void configfs_d_iput(struct dentry * dentry,
40 struct inode * inode)
41{
42 struct configfs_dirent * sd = dentry->d_fsdata;
43
44 if (sd) {
45 BUG_ON(sd->s_dentry != dentry);
46 sd->s_dentry = NULL;
47 configfs_put(sd);
48 }
49 iput(inode);
50}
51
52/*
53 * We _must_ delete our dentries on last dput, as the chain-to-parent
54 * behavior is required to clear the parents of default_groups.
55 */
56static int configfs_d_delete(struct dentry *dentry)
57{
58 return 1;
59}
60
61static struct dentry_operations configfs_dentry_ops = {
62 .d_iput = configfs_d_iput,
63 /* simple_delete_dentry() isn't exported */
64 .d_delete = configfs_d_delete,
65};
66
67/*
68 * Allocates a new configfs_dirent and links it to the parent configfs_dirent
69 */
70static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
71 void * element)
72{
73 struct configfs_dirent * sd;
74
75 sd = kmalloc(sizeof(*sd), GFP_KERNEL);
76 if (!sd)
77 return NULL;
78
79 memset(sd, 0, sizeof(*sd));
80 atomic_set(&sd->s_count, 1);
81 INIT_LIST_HEAD(&sd->s_links);
82 INIT_LIST_HEAD(&sd->s_children);
83 list_add(&sd->s_sibling, &parent_sd->s_children);
84 sd->s_element = element;
85
86 return sd;
87}
88
89int configfs_make_dirent(struct configfs_dirent * parent_sd,
90 struct dentry * dentry, void * element,
91 umode_t mode, int type)
92{
93 struct configfs_dirent * sd;
94
95 sd = configfs_new_dirent(parent_sd, element);
96 if (!sd)
97 return -ENOMEM;
98
99 sd->s_mode = mode;
100 sd->s_type = type;
101 sd->s_dentry = dentry;
102 if (dentry) {
103 dentry->d_fsdata = configfs_get(sd);
104 dentry->d_op = &configfs_dentry_ops;
105 }
106
107 return 0;
108}
109
110static int init_dir(struct inode * inode)
111{
112 inode->i_op = &configfs_dir_inode_operations;
113 inode->i_fop = &configfs_dir_operations;
114
115 /* directory inodes start off with i_nlink == 2 (for "." entry) */
116 inode->i_nlink++;
117 return 0;
118}
119
120static int init_file(struct inode * inode)
121{
122 inode->i_size = PAGE_SIZE;
123 inode->i_fop = &configfs_file_operations;
124 return 0;
125}
126
127static int init_symlink(struct inode * inode)
128{
129 inode->i_op = &configfs_symlink_inode_operations;
130 return 0;
131}
132
133static int create_dir(struct config_item * k, struct dentry * p,
134 struct dentry * d)
135{
136 int error;
137 umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
138
139 error = configfs_create(d, mode, init_dir);
140 if (!error) {
141 error = configfs_make_dirent(p->d_fsdata, d, k, mode,
142 CONFIGFS_DIR);
143 if (!error) {
144 p->d_inode->i_nlink++;
145 (d)->d_op = &configfs_dentry_ops;
146 }
147 }
148 return error;
149}
150
151
152/**
153 * configfs_create_dir - create a directory for an config_item.
154 * @item: config_itemwe're creating directory for.
155 * @dentry: config_item's dentry.
156 */
157
158static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
159{
160 struct dentry * parent;
161 int error = 0;
162
163 BUG_ON(!item);
164
165 if (item->ci_parent)
166 parent = item->ci_parent->ci_dentry;
167 else if (configfs_mount && configfs_mount->mnt_sb)
168 parent = configfs_mount->mnt_sb->s_root;
169 else
170 return -EFAULT;
171
172 error = create_dir(item,parent,dentry);
173 if (!error)
174 item->ci_dentry = dentry;
175 return error;
176}
177
178int configfs_create_link(struct configfs_symlink *sl,
179 struct dentry *parent,
180 struct dentry *dentry)
181{
182 int err = 0;
183 umode_t mode = S_IFLNK | S_IRWXUGO;
184
185 err = configfs_create(dentry, mode, init_symlink);
186 if (!err) {
187 err = configfs_make_dirent(parent->d_fsdata, dentry, sl,
188 mode, CONFIGFS_ITEM_LINK);
189 if (!err)
190 dentry->d_op = &configfs_dentry_ops;
191 }
192 return err;
193}
194
195static void remove_dir(struct dentry * d)
196{
197 struct dentry * parent = dget(d->d_parent);
198 struct configfs_dirent * sd;
199
200 sd = d->d_fsdata;
201 list_del_init(&sd->s_sibling);
202 configfs_put(sd);
203 if (d->d_inode)
204 simple_rmdir(parent->d_inode,d);
205
206 pr_debug(" o %s removing done (%d)\n",d->d_name.name,
207 atomic_read(&d->d_count));
208
209 dput(parent);
210}
211
212/**
213 * configfs_remove_dir - remove a config_item's directory.
214 * @item: config_item we're removing.
215 *
216 * The only thing special about this is that we remove any files in
217 * the directory before we remove the directory, and we've inlined
218 * what used to be configfs_rmdir() below, instead of calling separately.
219 */
220
221static void configfs_remove_dir(struct config_item * item)
222{
223 struct dentry * dentry = dget(item->ci_dentry);
224
225 if (!dentry)
226 return;
227
228 remove_dir(dentry);
229 /**
230 * Drop reference from dget() on entrance.
231 */
232 dput(dentry);
233}
234
235
236/* attaches the attribute's configfs_dirent to the dentry corresponding to the
237 * attribute file
238 */
239static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry)
240{
241 struct configfs_attribute * attr = sd->s_element;
242 int error;
243
244 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file);
245 if (error)
246 return error;
247
248 dentry->d_op = &configfs_dentry_ops;
249 dentry->d_fsdata = configfs_get(sd);
250 sd->s_dentry = dentry;
251 d_rehash(dentry);
252
253 return 0;
254}
255
256static struct dentry * configfs_lookup(struct inode *dir,
257 struct dentry *dentry,
258 struct nameidata *nd)
259{
260 struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
261 struct configfs_dirent * sd;
262 int found = 0;
263 int err = 0;
264
265 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
266 if (sd->s_type & CONFIGFS_NOT_PINNED) {
267 const unsigned char * name = configfs_get_name(sd);
268
269 if (strcmp(name, dentry->d_name.name))
270 continue;
271
272 found = 1;
273 err = configfs_attach_attr(sd, dentry);
274 break;
275 }
276 }
277
278 if (!found) {
279 /*
280 * If it doesn't exist and it isn't a NOT_PINNED item,
281 * it must be negative.
282 */
283 return simple_lookup(dir, dentry, nd);
284 }
285
286 return ERR_PTR(err);
287}
288
289/*
290 * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
291 * attributes and are removed by rmdir(). We recurse, taking i_sem
292 * on all children that are candidates for default detach. If the
293 * result is clean, then configfs_detach_group() will handle dropping
294 * i_sem. If there is an error, the caller will clean up the i_sem
295 * holders via configfs_detach_rollback().
296 */
297static int configfs_detach_prep(struct dentry *dentry)
298{
299 struct configfs_dirent *parent_sd = dentry->d_fsdata;
300 struct configfs_dirent *sd;
301 int ret;
302
303 ret = -EBUSY;
304 if (!list_empty(&parent_sd->s_links))
305 goto out;
306
307 ret = 0;
308 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
309 if (sd->s_type & CONFIGFS_NOT_PINNED)
310 continue;
311 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
312 down(&sd->s_dentry->d_inode->i_sem);
313 /* Mark that we've taken i_sem */
314 sd->s_type |= CONFIGFS_USET_DROPPING;
315
316 ret = configfs_detach_prep(sd->s_dentry);
317 if (!ret)
318 continue;
319 } else
320 ret = -ENOTEMPTY;
321
322 break;
323 }
324
325out:
326 return ret;
327}
328
329/*
330 * Walk the tree, dropping i_sem wherever CONFIGFS_USET_DROPPING is
331 * set.
332 */
333static void configfs_detach_rollback(struct dentry *dentry)
334{
335 struct configfs_dirent *parent_sd = dentry->d_fsdata;
336 struct configfs_dirent *sd;
337
338 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
339 if (sd->s_type & CONFIGFS_USET_DEFAULT) {
340 configfs_detach_rollback(sd->s_dentry);
341
342 if (sd->s_type & CONFIGFS_USET_DROPPING) {
343 sd->s_type &= ~CONFIGFS_USET_DROPPING;
344 up(&sd->s_dentry->d_inode->i_sem);
345 }
346 }
347 }
348}
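
/*
 * Pairing sketch for the two helpers above (illustrative only;
 * configfs_rmdir() and configfs_unregister_subsystem() below are the
 * real in-tree users of this pattern):
 */
#if 0
	ret = configfs_detach_prep(dentry);
	if (ret) {
		/* Undo the partial i_sem grabs marked USET_DROPPING. */
		configfs_detach_rollback(dentry);
		return ret;
	}
	/* Tree is clean; the detach path drops the held i_sem. */
#endif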
349
350static void detach_attrs(struct config_item * item)
351{
352 struct dentry * dentry = dget(item->ci_dentry);
353 struct configfs_dirent * parent_sd;
354 struct configfs_dirent * sd, * tmp;
355
356 if (!dentry)
357 return;
358
359 pr_debug("configfs %s: dropping attrs for dir\n",
360 dentry->d_name.name);
361
362 parent_sd = dentry->d_fsdata;
363 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
364 if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
365 continue;
366 list_del_init(&sd->s_sibling);
367 configfs_drop_dentry(sd, dentry);
368 configfs_put(sd);
369 }
370
371 /**
372 * Drop reference from dget() on entrance.
373 */
374 dput(dentry);
375}
376
377static int populate_attrs(struct config_item *item)
378{
379 struct config_item_type *t = item->ci_type;
380 struct configfs_attribute *attr;
381 int error = 0;
382 int i;
383
384 if (!t)
385 return -EINVAL;
386 if (t->ct_attrs) {
387 for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) {
388 if ((error = configfs_create_file(item, attr)))
389 break;
390 }
391 }
392
393 if (error)
394 detach_attrs(item);
395
396 return error;
397}
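
/*
 * For reference, ct_attrs is a NULL-terminated array supplied by the
 * client; a minimal sketch (hypothetical names, not part of this
 * patch):
 */
#if 0
static struct configfs_attribute my_attr_foo = {
	.ca_owner = THIS_MODULE,
	.ca_name = "foo",
	.ca_mode = S_IRUGO | S_IWUSR,
};

static struct configfs_attribute *my_attrs[] = {
	&my_attr_foo,
	NULL,		/* populate_attrs() stops at the terminator */
};

static struct config_item_type my_type = {
	.ct_owner = THIS_MODULE,
	.ct_attrs = my_attrs,
};
#endif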
398
399static int configfs_attach_group(struct config_item *parent_item,
400 struct config_item *item,
401 struct dentry *dentry);
402static void configfs_detach_group(struct config_item *item);
403
404static void detach_groups(struct config_group *group)
405{
406 struct dentry * dentry = dget(group->cg_item.ci_dentry);
407 struct dentry *child;
408 struct configfs_dirent *parent_sd;
409 struct configfs_dirent *sd, *tmp;
410
411 if (!dentry)
412 return;
413
414 parent_sd = dentry->d_fsdata;
415 list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
416 if (!sd->s_element ||
417 !(sd->s_type & CONFIGFS_USET_DEFAULT))
418 continue;
419
420 child = sd->s_dentry;
421
422 configfs_detach_group(sd->s_element);
423 child->d_inode->i_flags |= S_DEAD;
424
425 /*
426 * From rmdir/unregister, a configfs_detach_prep() pass
427 * has taken our i_sem for us. Drop it.
428 * From mkdir/register cleanup, there is no sem held.
429 */
430 if (sd->s_type & CONFIGFS_USET_DROPPING)
431 up(&child->d_inode->i_sem);
432
433 d_delete(child);
434 dput(child);
435 }
436
437 /**
438 * Drop reference from dget() on entrance.
439 */
440 dput(dentry);
441}
442
443/*
444 * This fakes mkdir(2) on a default_groups[] entry. It
445 * creates a dentry, attaches it, and then does fixup
446 * on the sd->s_type.
447 *
448 * We could, perhaps, tweak our parent's ->mkdir for a minute and
449 * try using vfs_mkdir. Just a thought.
450 */
451static int create_default_group(struct config_group *parent_group,
452 struct config_group *group)
453{
454 int ret;
455 struct qstr name;
456 struct configfs_dirent *sd;
457 /* We trust the caller holds a reference to parent */
458 struct dentry *child, *parent = parent_group->cg_item.ci_dentry;
459
460 if (!group->cg_item.ci_name)
461 group->cg_item.ci_name = group->cg_item.ci_namebuf;
462 name.name = group->cg_item.ci_name;
463 name.len = strlen(name.name);
464 name.hash = full_name_hash(name.name, name.len);
465
466 ret = -ENOMEM;
467 child = d_alloc(parent, &name);
468 if (child) {
469 d_add(child, NULL);
470
471 ret = configfs_attach_group(&parent_group->cg_item,
472 &group->cg_item, child);
473 if (!ret) {
474 sd = child->d_fsdata;
475 sd->s_type |= CONFIGFS_USET_DEFAULT;
476 } else {
477 d_delete(child);
478 dput(child);
479 }
480 }
481
482 return ret;
483}
484
485static int populate_groups(struct config_group *group)
486{
487 struct config_group *new_group;
488 struct dentry *dentry = group->cg_item.ci_dentry;
489 int ret = 0;
490 int i;
491
492 if (group && group->default_groups) {
493 /* FYI, we're faking mkdir here
494 * I'm not sure we need this semaphore, as we're called
495 * from our parent's mkdir. That holds our parent's
496 * i_sem, so afaik lookup cannot continue through our
497 * parent to find us, let alone mess with our tree.
498 * That said, taking our i_sem is closer to mkdir
499 * emulation, and shouldn't hurt. */
500 down(&dentry->d_inode->i_sem);
501
502 for (i = 0; group->default_groups[i]; i++) {
503 new_group = group->default_groups[i];
504
505 ret = create_default_group(group, new_group);
506 if (ret)
507 break;
508 }
509
510 up(&dentry->d_inode->i_sem);
511 }
512
513 if (ret)
514 detach_groups(group);
515
516 return ret;
517}
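
/*
 * The default_groups array itself comes from the client as a
 * NULL-terminated list; a hedged sketch (hypothetical names):
 */
#if 0
static struct config_group my_child_group;

static struct config_group *my_default_groups[] = {
	&my_child_group,
	NULL,		/* populate_groups() stops at the terminator */
};

	/* ...then, before the parent group is attached: */
	my_group.default_groups = my_default_groups;
#endif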
518
519/*
520 * All of link_obj/unlink_obj/link_group/unlink_group require that
521 * subsys->su_sem is held.
522 */
523
524static void unlink_obj(struct config_item *item)
525{
526 struct config_group *group;
527
528 group = item->ci_group;
529 if (group) {
530 list_del_init(&item->ci_entry);
531
532 item->ci_group = NULL;
533 item->ci_parent = NULL;
534 config_item_put(item);
535
536 config_group_put(group);
537 }
538}
539
540static void link_obj(struct config_item *parent_item, struct config_item *item)
541{
542 /* Parent seems redundant with group, but it makes certain
543 * traversals much nicer. */
544 item->ci_parent = parent_item;
545 item->ci_group = config_group_get(to_config_group(parent_item));
546 list_add_tail(&item->ci_entry, &item->ci_group->cg_children);
547
548 config_item_get(item);
549}
550
551static void unlink_group(struct config_group *group)
552{
553 int i;
554 struct config_group *new_group;
555
556 if (group->default_groups) {
557 for (i = 0; group->default_groups[i]; i++) {
558 new_group = group->default_groups[i];
559 unlink_group(new_group);
560 }
561 }
562
563 group->cg_subsys = NULL;
564 unlink_obj(&group->cg_item);
565}
566
567static void link_group(struct config_group *parent_group, struct config_group *group)
568{
569 int i;
570 struct config_group *new_group;
571 struct configfs_subsystem *subsys = NULL; /* gcc is a turd */
572
573 link_obj(&parent_group->cg_item, &group->cg_item);
574
575 if (parent_group->cg_subsys)
576 subsys = parent_group->cg_subsys;
577 else if (configfs_is_root(&parent_group->cg_item))
578 subsys = to_configfs_subsystem(group);
579 else
580 BUG();
581 group->cg_subsys = subsys;
582
583 if (group->default_groups) {
584 for (i = 0; group->default_groups[i]; i++) {
585 new_group = group->default_groups[i];
586 link_group(group, new_group);
587 }
588 }
589}
590
591/*
592 * The goal is that configfs_attach_item() (and
593 * configfs_attach_group()) can be called from either the VFS or this
594 * module. That is, they assume that the items have been created,
595 * the dentry allocated, and the dcache is all ready to go.
596 *
597 * If they fail, they must clean up after themselves as if they
598 * had never been called. The caller (VFS or local function) will
599 * handle cleaning up the dcache bits.
600 *
601 * configfs_detach_group() and configfs_detach_item() behave similarly on
602 * the way out. They assume that the proper semaphores are held, they
603 * clean up the configfs items, and they expect their callers will
604 * handle the dcache bits.
605 */
606static int configfs_attach_item(struct config_item *parent_item,
607 struct config_item *item,
608 struct dentry *dentry)
609{
610 int ret;
611
612 ret = configfs_create_dir(item, dentry);
613 if (!ret) {
614 ret = populate_attrs(item);
615 if (ret) {
616 configfs_remove_dir(item);
617 d_delete(dentry);
618 }
619 }
620
621 return ret;
622}
623
624static void configfs_detach_item(struct config_item *item)
625{
626 detach_attrs(item);
627 configfs_remove_dir(item);
628}
629
630static int configfs_attach_group(struct config_item *parent_item,
631 struct config_item *item,
632 struct dentry *dentry)
633{
634 int ret;
635 struct configfs_dirent *sd;
636
637 ret = configfs_attach_item(parent_item, item, dentry);
638 if (!ret) {
639 sd = dentry->d_fsdata;
640 sd->s_type |= CONFIGFS_USET_DIR;
641
642 ret = populate_groups(to_config_group(item));
643 if (ret) {
644 configfs_detach_item(item);
645 d_delete(dentry);
646 }
647 }
648
649 return ret;
650}
651
652static void configfs_detach_group(struct config_item *item)
653{
654 detach_groups(to_config_group(item));
655 configfs_detach_item(item);
656}
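
/*
 * The contract above, from a local caller's viewpoint (sketch only;
 * configfs_register_subsystem() below does exactly this):
 */
#if 0
	err = configfs_attach_group(parent, &group->cg_item, dentry);
	if (err)
		d_delete(dentry);	/* caller cleans up the dcache bits */

	/* ...and on teardown: */
	configfs_detach_group(&group->cg_item);
	d_delete(dentry);
#endif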
657
658/*
659 * Drop the initial reference from make_item()/make_group()
660 * This function assumes that reference is held on item
661 * and that item holds a valid reference to the parent. Also, it
662 * assumes the caller has validated ci_type.
663 */
664static void client_drop_item(struct config_item *parent_item,
665 struct config_item *item)
666{
667 struct config_item_type *type;
668
669 type = parent_item->ci_type;
670 BUG_ON(!type);
671
672 if (type->ct_group_ops && type->ct_group_ops->drop_item)
673 type->ct_group_ops->drop_item(to_config_group(parent_item),
674 item);
675 else
676 config_item_put(item);
677}
678
679
680static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
681{
682 int ret;
683 struct config_group *group;
684 struct config_item *item;
685 struct config_item *parent_item;
686 struct configfs_subsystem *subsys;
687 struct configfs_dirent *sd;
688 struct config_item_type *type;
689 struct module *owner;
690 char *name;
691
692 if (dentry->d_parent == configfs_sb->s_root)
693 return -EPERM;
694
695 sd = dentry->d_parent->d_fsdata;
696 if (!(sd->s_type & CONFIGFS_USET_DIR))
697 return -EPERM;
698
699 parent_item = configfs_get_config_item(dentry->d_parent);
700 type = parent_item->ci_type;
701 subsys = to_config_group(parent_item)->cg_subsys;
702 BUG_ON(!subsys);
703
704 if (!type || !type->ct_group_ops ||
705 (!type->ct_group_ops->make_group &&
706 !type->ct_group_ops->make_item)) {
707 config_item_put(parent_item);
708 return -EPERM; /* What lack-of-mkdir returns */
709 }
710
711 name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
712 if (!name) {
713 config_item_put(parent_item);
714 return -ENOMEM;
715 }
716 snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
717
718 down(&subsys->su_sem);
719 group = NULL;
720 item = NULL;
721 if (type->ct_group_ops->make_group) {
722 group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
723 if (group) {
724 link_group(to_config_group(parent_item), group);
725 item = &group->cg_item;
726 }
727 } else {
728 item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
729 if (item)
730 link_obj(parent_item, item);
731 }
732 up(&subsys->su_sem);
733
734 kfree(name);
735 if (!item) {
736 config_item_put(parent_item);
737 return -ENOMEM;
738 }
739
740 ret = -EINVAL;
741 type = item->ci_type;
742 if (type) {
743 owner = type->ct_owner;
744 if (try_module_get(owner)) {
745 if (group) {
746 ret = configfs_attach_group(parent_item,
747 item,
748 dentry);
749 } else {
750 ret = configfs_attach_item(parent_item,
751 item,
752 dentry);
753 }
754
755 if (ret) {
756 down(&subsys->su_sem);
757 if (group)
758 unlink_group(group);
759 else
760 unlink_obj(item);
761 client_drop_item(parent_item, item);
762 up(&subsys->su_sem);
763
764 config_item_put(parent_item);
765 module_put(owner);
766 }
767 }
768 }
769
770 return ret;
771}
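
/*
 * For context: a mkdir(2) in a configfs directory lands in the
 * client's make_item()/make_group() op. A hedged sketch of such an
 * op (hypothetical names):
 */
#if 0
static struct config_item *my_make_item(struct config_group *group,
					const char *name)
{
	struct my_item *it;

	it = kmalloc(sizeof(*it), GFP_KERNEL);
	if (!it)
		return NULL;
	memset(it, 0, sizeof(*it));

	/* Gives the item its name, type, and an initial reference. */
	config_item_init_type_name(&it->mi_item, name, &my_type);
	return &it->mi_item;
}

static struct configfs_group_operations my_group_ops = {
	.make_item = my_make_item,
};
#endif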
772
773static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
774{
775 struct config_item *parent_item;
776 struct config_item *item;
777 struct configfs_subsystem *subsys;
778 struct configfs_dirent *sd;
779 struct module *owner = NULL;
780 int ret;
781
782 if (dentry->d_parent == configfs_sb->s_root)
783 return -EPERM;
784
785 sd = dentry->d_fsdata;
786 if (sd->s_type & CONFIGFS_USET_DEFAULT)
787 return -EPERM;
788
789 parent_item = configfs_get_config_item(dentry->d_parent);
790 subsys = to_config_group(parent_item)->cg_subsys;
791 BUG_ON(!subsys);
792
793 if (!parent_item->ci_type) {
794 config_item_put(parent_item);
795 return -EINVAL;
796 }
797
798 ret = configfs_detach_prep(dentry);
799 if (ret) {
800 configfs_detach_rollback(dentry);
801 config_item_put(parent_item);
802 return ret;
803 }
804
805 item = configfs_get_config_item(dentry);
806
807 /* Drop reference from above, item already holds one. */
808 config_item_put(parent_item);
809
810 if (item->ci_type)
811 owner = item->ci_type->ct_owner;
812
813 if (sd->s_type & CONFIGFS_USET_DIR) {
814 configfs_detach_group(item);
815
816 down(&subsys->su_sem);
817 unlink_group(to_config_group(item));
818 } else {
819 configfs_detach_item(item);
820
821 down(&subsys->su_sem);
822 unlink_obj(item);
823 }
824
825 client_drop_item(parent_item, item);
826 up(&subsys->su_sem);
827
828 /* Drop our reference from above */
829 config_item_put(item);
830
831 module_put(owner);
832
833 return 0;
834}
835
836struct inode_operations configfs_dir_inode_operations = {
837 .mkdir = configfs_mkdir,
838 .rmdir = configfs_rmdir,
839 .symlink = configfs_symlink,
840 .unlink = configfs_unlink,
841 .lookup = configfs_lookup,
842};
843
844#if 0
845int configfs_rename_dir(struct config_item * item, const char *new_name)
846{
847 int error = 0;
848 struct dentry * new_dentry, * parent;
849
850 if (!strcmp(config_item_name(item), new_name))
851 return -EINVAL;
852
853 if (!item->ci_parent)
854 return -EINVAL;
855
856 down_write(&configfs_rename_sem);
857 parent = item->ci_parent->ci_dentry;
858
859 down(&parent->d_inode->i_sem);
860
861 new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
862 if (!IS_ERR(new_dentry)) {
863 if (!new_dentry->d_inode) {
864 error = config_item_set_name(item, "%s", new_name);
865 if (!error) {
866 d_add(new_dentry, NULL);
867 d_move(item->ci_dentry, new_dentry);
868 }
869 else
870 d_delete(new_dentry);
871 } else
872 error = -EEXIST;
873 dput(new_dentry);
874 }
875 up(&parent->d_inode->i_sem);
876 up_write(&configfs_rename_sem);
877
878 return error;
879}
880#endif
881
882static int configfs_dir_open(struct inode *inode, struct file *file)
883{
884 struct dentry * dentry = file->f_dentry;
885 struct configfs_dirent * parent_sd = dentry->d_fsdata;
886
887 down(&dentry->d_inode->i_sem);
888 file->private_data = configfs_new_dirent(parent_sd, NULL);
889 up(&dentry->d_inode->i_sem);
890
891 return file->private_data ? 0 : -ENOMEM;
892
893}
894
895static int configfs_dir_close(struct inode *inode, struct file *file)
896{
897 struct dentry * dentry = file->f_dentry;
898 struct configfs_dirent * cursor = file->private_data;
899
900 down(&dentry->d_inode->i_sem);
901 list_del_init(&cursor->s_sibling);
902 up(&dentry->d_inode->i_sem);
903
904 release_configfs_dirent(cursor);
905
906 return 0;
907}
908
909/* Relationship between s_mode and the DT_xxx types */
910static inline unsigned char dt_type(struct configfs_dirent *sd)
911{
912 return (sd->s_mode >> 12) & 15;
913}
914
915static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
916{
917 struct dentry *dentry = filp->f_dentry;
918 struct configfs_dirent * parent_sd = dentry->d_fsdata;
919 struct configfs_dirent *cursor = filp->private_data;
920 struct list_head *p, *q = &cursor->s_sibling;
921 ino_t ino;
922 int i = filp->f_pos;
923
924 switch (i) {
925 case 0:
926 ino = dentry->d_inode->i_ino;
927 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
928 break;
929 filp->f_pos++;
930 i++;
931 /* fallthrough */
932 case 1:
933 ino = parent_ino(dentry);
934 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
935 break;
936 filp->f_pos++;
937 i++;
938 /* fallthrough */
939 default:
940 if (filp->f_pos == 2) {
941 list_del(q);
942 list_add(q, &parent_sd->s_children);
943 }
944 for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
945 struct configfs_dirent *next;
946 const char * name;
947 int len;
948
949 next = list_entry(p, struct configfs_dirent,
950 s_sibling);
951 if (!next->s_element)
952 continue;
953
954 name = configfs_get_name(next);
955 len = strlen(name);
956 if (next->s_dentry)
957 ino = next->s_dentry->d_inode->i_ino;
958 else
959 ino = iunique(configfs_sb, 2);
960
961 if (filldir(dirent, name, len, filp->f_pos, ino,
962 dt_type(next)) < 0)
963 return 0;
964
965 list_del(q);
966 list_add(q, p);
967 p = q;
968 filp->f_pos++;
969 }
970 }
971 return 0;
972}
973
974static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
975{
976 struct dentry * dentry = file->f_dentry;
977
978 down(&dentry->d_inode->i_sem);
979 switch (origin) {
980 case 1:
981 offset += file->f_pos;
982 case 0:
983 if (offset >= 0)
984 break;
985 default:
986 up(&file->f_dentry->d_inode->i_sem);
987 return -EINVAL;
988 }
989 if (offset != file->f_pos) {
990 file->f_pos = offset;
991 if (file->f_pos >= 2) {
992 struct configfs_dirent *sd = dentry->d_fsdata;
993 struct configfs_dirent *cursor = file->private_data;
994 struct list_head *p;
995 loff_t n = file->f_pos - 2;
996
997 list_del(&cursor->s_sibling);
998 p = sd->s_children.next;
999 while (n && p != &sd->s_children) {
1000 struct configfs_dirent *next;
1001 next = list_entry(p, struct configfs_dirent,
1002 s_sibling);
1003 if (next->s_element)
1004 n--;
1005 p = p->next;
1006 }
1007 list_add_tail(&cursor->s_sibling, p);
1008 }
1009 }
1010 up(&dentry->d_inode->i_sem);
1011 return offset;
1012}
1013
1014struct file_operations configfs_dir_operations = {
1015 .open = configfs_dir_open,
1016 .release = configfs_dir_close,
1017 .llseek = configfs_dir_lseek,
1018 .read = generic_read_dir,
1019 .readdir = configfs_readdir,
1020};
1021
1022int configfs_register_subsystem(struct configfs_subsystem *subsys)
1023{
1024 int err;
1025 struct config_group *group = &subsys->su_group;
1026 struct qstr name;
1027 struct dentry *dentry;
1028 struct configfs_dirent *sd;
1029
1030 err = configfs_pin_fs();
1031 if (err)
1032 return err;
1033
1034 if (!group->cg_item.ci_name)
1035 group->cg_item.ci_name = group->cg_item.ci_namebuf;
1036
1037 sd = configfs_sb->s_root->d_fsdata;
1038 link_group(to_config_group(sd->s_element), group);
1039
1040 down(&configfs_sb->s_root->d_inode->i_sem);
1041
1042 name.name = group->cg_item.ci_name;
1043 name.len = strlen(name.name);
1044 name.hash = full_name_hash(name.name, name.len);
1045
1046 err = -ENOMEM;
1047 dentry = d_alloc(configfs_sb->s_root, &name);
1048 if (!dentry)
1049 goto out_release;
1050
1051 d_add(dentry, NULL);
1052
1053 err = configfs_attach_group(sd->s_element, &group->cg_item,
1054 dentry);
1055 if (!err)
1056 dentry = NULL;
1057 else
1058 d_delete(dentry);
1059
1060 up(&configfs_sb->s_root->d_inode->i_sem);
1061
1062 if (dentry) {
1063 dput(dentry);
1064out_release:
1065 unlink_group(group);
1066 configfs_release_fs();
1067 }
1068
1069 return err;
1070}
1071
1072void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1073{
1074 struct config_group *group = &subsys->su_group;
1075 struct dentry *dentry = group->cg_item.ci_dentry;
1076
1077 if (dentry->d_parent != configfs_sb->s_root) {
1078 printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
1079 return;
1080 }
1081
1082 down(&configfs_sb->s_root->d_inode->i_sem);
1083 down(&dentry->d_inode->i_sem);
1084 if (configfs_detach_prep(dentry)) {
1085 printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
1086 }
1087 configfs_detach_group(&group->cg_item);
1088 dentry->d_inode->i_flags |= S_DEAD;
1089 up(&dentry->d_inode->i_sem);
1090
1091 d_delete(dentry);
1092
1093 up(&configfs_sb->s_root->d_inode->i_sem);
1094
1095 dput(dentry);
1096
1097 unlink_group(group);
1098 configfs_release_fs();
1099}
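
/*
 * Client-side usage, in sketch form (hypothetical names; see
 * Documentation/filesystems/configfs/configfs.txt for a complete
 * example):
 */
#if 0
static struct configfs_subsystem my_subsys = {
	.su_group = {
		.cg_item = {
			.ci_namebuf = "my_subsys",
			.ci_type = &my_type,
		},
	},
};

static int __init my_init(void)
{
	config_group_init(&my_subsys.su_group);
	init_MUTEX(&my_subsys.su_sem);
	return configfs_register_subsystem(&my_subsys);
}

static void __exit my_exit(void)
{
	configfs_unregister_subsystem(&my_subsys);
}
#endif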
1100
1101EXPORT_SYMBOL(configfs_register_subsystem);
1102EXPORT_SYMBOL(configfs_unregister_subsystem);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
new file mode 100644
index 000000000000..af1ffc9a15c0
--- /dev/null
+++ b/fs/configfs/file.c
@@ -0,0 +1,360 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.c - operations for regular (text) files.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#include <linux/fs.h>
28#include <linux/module.h>
29#include <linux/dnotify.h>
30#include <linux/slab.h>
31#include <asm/uaccess.h>
32#include <asm/semaphore.h>
33
34#include <linux/configfs.h>
35#include "configfs_internal.h"
36
37
38struct configfs_buffer {
39 size_t count;
40 loff_t pos;
41 char * page;
42 struct configfs_item_operations * ops;
43 struct semaphore sem;
44 int needs_read_fill;
45};
46
47
48/**
49 * fill_read_buffer - allocate and fill buffer from item.
50 * @dentry: dentry pointer.
51 * @buffer: data buffer for file.
52 *
53 * Allocate @buffer->page, if it hasn't been already, then call the
54 * config_item's show() method to fill the buffer with this attribute's
55 * data.
56 * This is called only once, on the file's first read.
57 */
58static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
59{
60 struct configfs_attribute * attr = to_attr(dentry);
61 struct config_item * item = to_item(dentry->d_parent);
62 struct configfs_item_operations * ops = buffer->ops;
63 int ret = 0;
64 ssize_t count;
65
66 if (!buffer->page)
67 buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
68 if (!buffer->page)
69 return -ENOMEM;
70
71 count = ops->show_attribute(item,attr,buffer->page);
72 buffer->needs_read_fill = 0;
73 BUG_ON(count > (ssize_t)PAGE_SIZE);
74 if (count >= 0)
75 buffer->count = count;
76 else
77 ret = count;
78 return ret;
79}
80
81
82/**
83 * flush_read_buffer - push buffer to userspace.
84 * @buffer: data buffer for file.
85 * @userbuf: user-passed buffer.
86 * @count: number of bytes requested.
87 * @ppos: file position.
88 *
89 * Copy the buffer we filled in fill_read_buffer() to userspace.
90 * This is done at the reader's leisure, copying and advancing
91 * the amount they specify each time.
92 * This may be called continuously until the buffer is empty.
93 */
94static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
95 size_t count, loff_t * ppos)
96{
97 int error;
98
99 if (*ppos > buffer->count)
100 return 0;
101
102 if (count > (buffer->count - *ppos))
103 count = buffer->count - *ppos;
104
105 error = copy_to_user(buf,buffer->page + *ppos,count);
106 if (!error)
107 *ppos += count;
108 return error ? -EFAULT : count;
109}
110
111/**
112 * configfs_read_file - read an attribute.
113 * @file: file pointer.
114 * @buf: buffer to fill.
115 * @count: number of bytes to read.
116 * @ppos: starting offset in file.
117 *
118 * Userspace wants to read an attribute file. The attribute descriptor
119 * is in the file's ->d_fsdata. The target item is in the directory's
120 * ->d_fsdata.
121 *
122 * We call fill_read_buffer() to allocate and fill the buffer from the
123 * item's show() method exactly once (if the read is happening from
124 * the beginning of the file). That should fill the entire buffer with
125 * all the data the item has to offer for that attribute.
126 * We then call flush_read_buffer() to copy the buffer to userspace
127 * in the increments specified.
128 */
129
130static ssize_t
131configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
132{
133 struct configfs_buffer * buffer = file->private_data;
134 ssize_t retval = 0;
135
136 down(&buffer->sem);
137 if (buffer->needs_read_fill) {
138 if ((retval = fill_read_buffer(file->f_dentry,buffer)))
139 goto out;
140 }
141 pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
142 __FUNCTION__,count,*ppos,buffer->page);
143 retval = flush_read_buffer(buffer,buf,count,ppos);
144out:
145 up(&buffer->sem);
146 return retval;
147}
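
/*
 * The show path ends in the client's show_attribute() op; a minimal
 * sketch (hypothetical names). It gets the single page allocated by
 * fill_read_buffer() and returns the byte count:
 */
#if 0
static ssize_t my_show_attribute(struct config_item *item,
				 struct configfs_attribute *attr,
				 char *page)
{
	struct my_item *it = to_my_item(item);

	return sprintf(page, "%d\n", it->mi_value);	/* must fit in one page */
}

static struct configfs_item_operations my_item_ops = {
	.show_attribute = my_show_attribute,
};
#endif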
148
149
150/**
151 * fill_write_buffer - copy buffer from userspace.
152 * @buffer: data buffer for file.
153 * @userbuf: data from user.
154 * @count: number of bytes in @userbuf.
155 *
156 * Allocate @buffer->page if it hasn't been already, then
157 * copy the user-supplied buffer into it.
158 */
159
160static int
161fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
162{
163 int error;
164
165 if (!buffer->page)
166 buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
167 if (!buffer->page)
168 return -ENOMEM;
169
170 if (count > PAGE_SIZE)
171 count = PAGE_SIZE;
172 error = copy_from_user(buffer->page,buf,count);
173 buffer->needs_read_fill = 1;
174 return error ? -EFAULT : count;
175}
176
177
178/**
179 * flush_write_buffer - push buffer to config_item.
180 * @file: file pointer.
181 * @buffer: data buffer for file.
182 *
183 * Get the correct pointers for the config_item and the attribute we're
184 * dealing with, then call the store() method for the attribute,
185 * passing the buffer that we acquired in fill_write_buffer().
186 */
187
188static int
189flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
190{
191 struct configfs_attribute * attr = to_attr(dentry);
192 struct config_item * item = to_item(dentry->d_parent);
193 struct configfs_item_operations * ops = buffer->ops;
194
195 return ops->store_attribute(item,attr,buffer->page,count);
196}
197
198
199/**
200 * configfs_write_file - write an attribute.
201 * @file: file pointer
202 * @buf: data to write
203 * @count: number of bytes
204 * @ppos: starting offset
205 *
206 * Similar to configfs_read_file(), though working in the opposite direction.
207 * We allocate and fill the data from the user in fill_write_buffer(),
208 * then push it to the config_item in flush_write_buffer().
209 * There is no easy way for us to know if userspace is only doing a partial
210 * write, so we don't support partial writes. We expect the entire buffer
211 * to come on the first write.
212 * Hint: if you're writing a value, first read the file, modify only the
213 * value you're changing, then write the entire buffer back.
214 */
215
216static ssize_t
217configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
218{
219 struct configfs_buffer * buffer = file->private_data;
220
221 down(&buffer->sem);
222 count = fill_write_buffer(buffer,buf,count);
223 if (count > 0)
224 count = flush_write_buffer(file->f_dentry,buffer,count);
225 if (count > 0)
226 *ppos += count;
227 up(&buffer->sem);
228 return count;
229}
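
/*
 * The hint above from userspace, as a sketch (illustrative only):
 * read the whole attribute, edit the value, then write the entire
 * buffer back in a single write().
 */
#if 0
	char buf[4096];
	int fd = open("/config/my_subsys/item/foo", O_RDWR);
	ssize_t len = read(fd, buf, sizeof(buf) - 1);

	if (len >= 0) {
		buf[len] = '\0';
		/* ...modify the value in buf... */
		write(fd, buf, strlen(buf));	/* one complete write */
	}
	close(fd);
#endif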
230
231static int check_perm(struct inode * inode, struct file * file)
232{
233 struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
234 struct configfs_attribute * attr = to_attr(file->f_dentry);
235 struct configfs_buffer * buffer;
236 struct configfs_item_operations * ops = NULL;
237 int error = 0;
238
239 if (!item || !attr)
240 goto Einval;
241
242 /* Grab the module reference for this attribute if we have one */
243 if (!try_module_get(attr->ca_owner)) {
244 error = -ENODEV;
245 goto Done;
246 }
247
248 if (item->ci_type)
249 ops = item->ci_type->ct_item_ops;
250 else
251 goto Eaccess;
252
253 /* File needs write support.
254 * The inode's perms must say it's ok,
255 * and we must have a store method.
256 */
257 if (file->f_mode & FMODE_WRITE) {
258
259 if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
260 goto Eaccess;
261
262 }
263
264 /* File needs read support.
265 * The inode's perms must say it's ok, and there
266 * must be a show method for it.
267 */
268 if (file->f_mode & FMODE_READ) {
269 if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
270 goto Eaccess;
271 }
272
273 /* No error? Great, allocate a buffer for the file, and store
274 * it in file->private_data for easy access.
275 */
276 buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
277 if (buffer) {
278 memset(buffer,0,sizeof(struct configfs_buffer));
279 init_MUTEX(&buffer->sem);
280 buffer->needs_read_fill = 1;
281 buffer->ops = ops;
282 file->private_data = buffer;
283 } else
284 error = -ENOMEM;
285 goto Done;
286
287 Einval:
288 error = -EINVAL;
289 goto Done;
290 Eaccess:
291 error = -EACCES;
292 module_put(attr->ca_owner);
293 Done:
294 if (error && item)
295 config_item_put(item);
296 return error;
297}
298
299static int configfs_open_file(struct inode * inode, struct file * filp)
300{
301 return check_perm(inode,filp);
302}
303
304static int configfs_release(struct inode * inode, struct file * filp)
305{
306 struct config_item * item = to_item(filp->f_dentry->d_parent);
307 struct configfs_attribute * attr = to_attr(filp->f_dentry);
308 struct module * owner = attr->ca_owner;
309 struct configfs_buffer * buffer = filp->private_data;
310
311 if (item)
312 config_item_put(item);
313 /* After this point, attr should not be accessed. */
314 module_put(owner);
315
316 if (buffer) {
317 if (buffer->page)
318 free_page((unsigned long)buffer->page);
319 kfree(buffer);
320 }
321 return 0;
322}
323
324struct file_operations configfs_file_operations = {
325 .read = configfs_read_file,
326 .write = configfs_write_file,
327 .llseek = generic_file_llseek,
328 .open = configfs_open_file,
329 .release = configfs_release,
330};
331
332
333int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
334{
335 struct configfs_dirent * parent_sd = dir->d_fsdata;
336 umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
337 int error = 0;
338
339 down(&dir->d_inode->i_sem);
340 error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
341 up(&dir->d_inode->i_sem);
342
343 return error;
344}
345
346
347/**
348 * configfs_create_file - create an attribute file for an item.
349 * @item: item we're creating for.
350 * @attr: attribute descriptor.
351 */
352
353int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
354{
355 BUG_ON(!item || !item->ci_dentry || !attr);
356
357 return configfs_add_file(item->ci_dentry, attr,
358 CONFIGFS_ITEM_ATTR);
359}
360
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
new file mode 100644
index 000000000000..6b274c6d428f
--- /dev/null
+++ b/fs/configfs/inode.c
@@ -0,0 +1,162 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.c - basic inode and dentry operations.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 *
26 * Please see Documentation/filesystems/configfs.txt for more information.
27 */
28
29#undef DEBUG
30
31#include <linux/pagemap.h>
32#include <linux/namei.h>
33#include <linux/backing-dev.h>
34
35#include <linux/configfs.h>
36#include "configfs_internal.h"
37
38extern struct super_block * configfs_sb;
39
40static struct address_space_operations configfs_aops = {
41 .readpage = simple_readpage,
42 .prepare_write = simple_prepare_write,
43 .commit_write = simple_commit_write
44};
45
46static struct backing_dev_info configfs_backing_dev_info = {
47 .ra_pages = 0, /* No readahead */
48 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
49};
50
51struct inode * configfs_new_inode(mode_t mode)
52{
53 struct inode * inode = new_inode(configfs_sb);
54 if (inode) {
55 inode->i_mode = mode;
56 inode->i_uid = 0;
57 inode->i_gid = 0;
58 inode->i_blksize = PAGE_CACHE_SIZE;
59 inode->i_blocks = 0;
60 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
61 inode->i_mapping->a_ops = &configfs_aops;
62 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
63 }
64 return inode;
65}
66
67int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
68{
69 int error = 0;
70 struct inode * inode = NULL;
71 if (dentry) {
72 if (!dentry->d_inode) {
73 if ((inode = configfs_new_inode(mode))) {
74 if (dentry->d_parent && dentry->d_parent->d_inode) {
75 struct inode *p_inode = dentry->d_parent->d_inode;
76 p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
77 }
78 goto Proceed;
79 }
80 else
81 error = -ENOMEM;
82 } else
83 error = -EEXIST;
84 } else
85 error = -ENOENT;
86 goto Done;
87
88 Proceed:
89 if (init)
90 error = init(inode);
91 if (!error) {
92 d_instantiate(dentry, inode);
93 if (S_ISDIR(mode) || S_ISLNK(mode))
94 dget(dentry); /* pin link and directory dentries in core */
95 } else
96 iput(inode);
97 Done:
98 return error;
99}
100
101/*
102 * Get the name of the element represented by the given configfs_dirent
103 */
104const unsigned char * configfs_get_name(struct configfs_dirent *sd)
105{
106 struct attribute * attr;
107
108 if (!sd || !sd->s_element)
109 BUG();
110
111 /* These always have a dentry, so use that */
112 if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
113 return sd->s_dentry->d_name.name;
114
115 if (sd->s_type & CONFIGFS_ITEM_ATTR) {
116 attr = sd->s_element;
117 return attr->name;
118 }
119 return NULL;
120}
121
122
123/*
124 * Unhashes the dentry corresponding to the given configfs_dirent
125 * Called with parent inode's i_sem held.
126 */
127void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
128{
129 struct dentry * dentry = sd->s_dentry;
130
131 if (dentry) {
132 spin_lock(&dcache_lock);
133 if (!(d_unhashed(dentry) && dentry->d_inode)) {
134 dget_locked(dentry);
135 __d_drop(dentry);
136 spin_unlock(&dcache_lock);
137 simple_unlink(parent->d_inode, dentry);
138 } else
139 spin_unlock(&dcache_lock);
140 }
141}
142
143void configfs_hash_and_remove(struct dentry * dir, const char * name)
144{
145 struct configfs_dirent * sd;
146 struct configfs_dirent * parent_sd = dir->d_fsdata;
147
148 down(&dir->d_inode->i_sem);
149 list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
150 if (!sd->s_element)
151 continue;
152 if (!strcmp(configfs_get_name(sd), name)) {
153 list_del_init(&sd->s_sibling);
154 configfs_drop_dentry(sd, dir);
155 configfs_put(sd);
156 break;
157 }
158 }
159 up(&dir->d_inode->i_sem);
160}
161
162
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
new file mode 100644
index 000000000000..e07485ac50ad
--- /dev/null
+++ b/fs/configfs/item.c
@@ -0,0 +1,227 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * item.c - library routines for handling generic config items
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 *
21 * Based on kobject:
22 * kobject is Copyright (c) 2002-2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 *
26 * Please see the file Documentation/filesystems/configfs.txt for
27 * critical information about using the config_item interface.
28 */
29
30#include <linux/string.h>
31#include <linux/module.h>
32#include <linux/stat.h>
33#include <linux/slab.h>
34
35#include <linux/configfs.h>
36
37
38static inline struct config_item * to_item(struct list_head * entry)
39{
40 return container_of(entry,struct config_item,ci_entry);
41}
42
43/* Evil kernel */
44static void config_item_release(struct kref *kref);
45
46/**
47 * config_item_init - initialize item.
48 * @item: item in question.
49 */
50void config_item_init(struct config_item * item)
51{
52 kref_init(&item->ci_kref);
53 INIT_LIST_HEAD(&item->ci_entry);
54}
55
56/**
57 * config_item_set_name - Set the name of an item
58 * @item: item.
59 * @name: name.
60 *
61 * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
62 * dynamically allocated string that @item->ci_name points to.
63 * Otherwise, use the static @item->ci_namebuf array.
64 */
65
66int config_item_set_name(struct config_item * item, const char * fmt, ...)
67{
68 int error = 0;
69 int limit = CONFIGFS_ITEM_NAME_LEN;
70 int need;
71 va_list args;
72 char * name;
73
74 /*
75 * First, try the static array
76 */
77 va_start(args,fmt);
78 need = vsnprintf(item->ci_namebuf,limit,fmt,args);
79 va_end(args);
80 if (need < limit)
81 name = item->ci_namebuf;
82 else {
83 /*
84 * Need more space? Allocate it and try again
85 */
86 limit = need + 1;
87 name = kmalloc(limit,GFP_KERNEL);
88 if (!name) {
89 error = -ENOMEM;
90 goto Done;
91 }
92 va_start(args,fmt);
93 need = vsnprintf(name,limit,fmt,args);
94 va_end(args);
95
96 /* Still? Give up. */
97 if (need >= limit) {
98 kfree(name);
99 error = -EFAULT;
100 goto Done;
101 }
102 }
103
104 /* Free the old name, if necessary. */
105 if (item->ci_name && item->ci_name != item->ci_namebuf)
106 kfree(item->ci_name);
107
108 /* Now, set the new name */
109 item->ci_name = name;
110 Done:
111 return error;
112}
113
114EXPORT_SYMBOL(config_item_set_name);
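
/*
 * For example (sketch): a short name stays in the static ci_namebuf,
 * while a name of CONFIGFS_ITEM_NAME_LEN or more characters gets a
 * kmalloc()ed buffer that config_item_cleanup() later frees:
 */
#if 0
	config_item_set_name(item, "lun-%d", lun);	/* short: static buffer */
	config_item_set_name(item, "%s", long_name);	/* long: kmalloc()ed */
#endif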
115
116void config_item_init_type_name(struct config_item *item,
117 const char *name,
118 struct config_item_type *type)
119{
120 config_item_set_name(item, name);
121 item->ci_type = type;
122 config_item_init(item);
123}
124EXPORT_SYMBOL(config_item_init_type_name);
125
126void config_group_init_type_name(struct config_group *group, const char *name,
127 struct config_item_type *type)
128{
129 config_item_set_name(&group->cg_item, name);
130 group->cg_item.ci_type = type;
131 config_group_init(group);
132}
133EXPORT_SYMBOL(config_group_init_type_name);
134
135struct config_item * config_item_get(struct config_item * item)
136{
137 if (item)
138 kref_get(&item->ci_kref);
139 return item;
140}
141
142/**
143 * config_item_cleanup - free config_item resources.
144 * @item: item.
145 */
146
147void config_item_cleanup(struct config_item * item)
148{
149 struct config_item_type * t = item->ci_type;
150 struct config_group * s = item->ci_group;
151 struct config_item * parent = item->ci_parent;
152
153 pr_debug("config_item %s: cleaning up\n",config_item_name(item));
154 if (item->ci_name != item->ci_namebuf)
155 kfree(item->ci_name);
156 item->ci_name = NULL;
157 if (t && t->ct_item_ops && t->ct_item_ops->release)
158 t->ct_item_ops->release(item);
159 if (s)
160 config_group_put(s);
161 if (parent)
162 config_item_put(parent);
163}
164
165static void config_item_release(struct kref *kref)
166{
167 config_item_cleanup(container_of(kref, struct config_item, ci_kref));
168}
169
170/**
171 * config_item_put - decrement refcount for item.
172 * @item: item.
173 *
174 * Decrement the refcount, and if 0, call config_item_cleanup().
175 */
176void config_item_put(struct config_item * item)
177{
178 if (item)
179 kref_put(&item->ci_kref, config_item_release);
180}
181
182
183/**
184 * config_group_init - initialize a group for use
185 * @k: group
186 */
187
188void config_group_init(struct config_group *group)
189{
190 config_item_init(&group->cg_item);
191 INIT_LIST_HEAD(&group->cg_children);
192}
193
194
195/**
196 * config_group_find_obj - search for item in group.
197 * @group: group we're looking in.
198 * @name: item's name.
199 *
200 * Lock group via @group->cg_subsys, and iterate over @group->cg_children,
201 * looking for a matching config_item. If a matching item is found,
202 * take a reference and return the item.
203 */
204
205struct config_item * config_group_find_obj(struct config_group * group, const char * name)
206{
207 struct list_head * entry;
208 struct config_item * ret = NULL;
209
210 /* XXX LOCKING! */
211 list_for_each(entry,&group->cg_children) {
212 struct config_item * item = to_item(entry);
213 if (config_item_name(item) &&
214 !strcmp(config_item_name(item), name)) {
215 ret = config_item_get(item);
216 break;
217 }
218 }
219 return ret;
220}
221
222
223EXPORT_SYMBOL(config_item_init);
224EXPORT_SYMBOL(config_group_init);
225EXPORT_SYMBOL(config_item_get);
226EXPORT_SYMBOL(config_item_put);
227
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
new file mode 100644
index 000000000000..1a2f6f6a4d91
--- /dev/null
+++ b/fs/configfs/mount.c
@@ -0,0 +1,159 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * mount.c - operations for initializing and mounting configfs.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#include <linux/fs.h>
28#include <linux/module.h>
29#include <linux/mount.h>
30#include <linux/pagemap.h>
31#include <linux/init.h>
32
33#include <linux/configfs.h>
34#include "configfs_internal.h"
35
36/* Random magic number */
37#define CONFIGFS_MAGIC 0x62656570
38
39struct vfsmount * configfs_mount = NULL;
40struct super_block * configfs_sb = NULL;
41static int configfs_mnt_count = 0;
42
43static struct super_operations configfs_ops = {
44 .statfs = simple_statfs,
45 .drop_inode = generic_delete_inode,
46};
47
48static struct config_group configfs_root_group = {
49 .cg_item = {
50 .ci_namebuf = "root",
51 .ci_name = configfs_root_group.cg_item.ci_namebuf,
52 },
53};
54
55int configfs_is_root(struct config_item *item)
56{
57 return item == &configfs_root_group.cg_item;
58}
59
60static struct configfs_dirent configfs_root = {
61 .s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling),
62 .s_children = LIST_HEAD_INIT(configfs_root.s_children),
63 .s_element = &configfs_root_group.cg_item,
64 .s_type = CONFIGFS_ROOT,
65};
66
67static int configfs_fill_super(struct super_block *sb, void *data, int silent)
68{
69 struct inode *inode;
70 struct dentry *root;
71
72 sb->s_blocksize = PAGE_CACHE_SIZE;
73 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
74 sb->s_magic = CONFIGFS_MAGIC;
75 sb->s_op = &configfs_ops;
76 configfs_sb = sb;
77
78 inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
79 if (inode) {
80 inode->i_op = &configfs_dir_inode_operations;
81 inode->i_fop = &configfs_dir_operations;
82 /* directory inodes start off with i_nlink == 2 (for "." entry) */
83 inode->i_nlink++;
84 } else {
85 pr_debug("configfs: could not get root inode\n");
86 return -ENOMEM;
87 }
88
89 root = d_alloc_root(inode);
90 if (!root) {
91 pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
92 iput(inode);
93 return -ENOMEM;
94 }
95 config_group_init(&configfs_root_group);
96 configfs_root_group.cg_item.ci_dentry = root;
97 root->d_fsdata = &configfs_root;
98 sb->s_root = root;
99 return 0;
100}
101
102static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
103 int flags, const char *dev_name, void *data)
104{
105 return get_sb_single(fs_type, flags, data, configfs_fill_super);
106}
107
108static struct file_system_type configfs_fs_type = {
109 .owner = THIS_MODULE,
110 .name = "configfs",
111 .get_sb = configfs_get_sb,
112 .kill_sb = kill_litter_super,
113};
114
115int configfs_pin_fs(void)
116{
117 return simple_pin_fs("configfs", &configfs_mount,
118 &configfs_mnt_count);
119}
120
121void configfs_release_fs(void)
122{
123 simple_release_fs(&configfs_mount, &configfs_mnt_count);
124}
125
126
127static decl_subsys(config, NULL, NULL);
128
129static int __init configfs_init(void)
130{
131 int err;
132
133 kset_set_kset_s(&config_subsys, kernel_subsys);
134 err = subsystem_register(&config_subsys);
135 if (err)
136 return err;
137
138 err = register_filesystem(&configfs_fs_type);
139 if (err) {
140 printk(KERN_ERR "configfs: Unable to register filesystem!\n");
141 subsystem_unregister(&config_subsys);
142 }
143
144 return err;
145}
146
147static void __exit configfs_exit(void)
148{
149 unregister_filesystem(&configfs_fs_type);
150 subsystem_unregister(&config_subsys);
151}
152
153MODULE_AUTHOR("Oracle");
154MODULE_LICENSE("GPL");
155MODULE_VERSION("0.0.1");
156MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
157
158module_init(configfs_init);
159module_exit(configfs_exit);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
new file mode 100644
index 000000000000..50f5840521a9
--- /dev/null
+++ b/fs/configfs/symlink.c
@@ -0,0 +1,281 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * symlink.c - operations for configfs symlinks.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * configfs Copyright (C) 2005 Oracle. All rights reserved.
25 */
26
27#include <linux/fs.h>
28#include <linux/module.h>
29#include <linux/namei.h>
30
31#include <linux/configfs.h>
32#include "configfs_internal.h"
33
34static int item_depth(struct config_item * item)
35{
36 struct config_item * p = item;
37 int depth = 0;
38 do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
39 return depth;
40}
41
42static int item_path_length(struct config_item * item)
43{
44 struct config_item * p = item;
45 int length = 1;
46 do {
47 length += strlen(config_item_name(p)) + 1;
48 p = p->ci_parent;
49 } while (p && !configfs_is_root(p));
50 return length;
51}
52
53static void fill_item_path(struct config_item * item, char * buffer, int length)
54{
55 struct config_item * p;
56
57 --length;
58 for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
59 int cur = strlen(config_item_name(p));
60
61 /* back up enough to print this path component with '/' */
62 length -= cur;
63 strncpy(buffer + length,config_item_name(p),cur);
64 *(buffer + --length) = '/';
65 }
66}
67
68static int create_link(struct config_item *parent_item,
69 struct config_item *item,
70 struct dentry *dentry)
71{
72 struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
73 struct configfs_symlink *sl;
74 int ret;
75
76 ret = -ENOMEM;
77 sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
78 if (sl) {
79 sl->sl_target = config_item_get(item);
80 /* FIXME: needs a lock, I'd bet */
81 list_add(&sl->sl_list, &target_sd->s_links);
82 ret = configfs_create_link(sl, parent_item->ci_dentry,
83 dentry);
84 if (ret) {
85 list_del_init(&sl->sl_list);
86 config_item_put(item);
87 kfree(sl);
88 }
89 }
90
91 return ret;
92}
93
94
95static int get_target(const char *symname, struct nameidata *nd,
96 struct config_item **target)
97{
98 int ret;
99
100 ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
101 if (!ret) {
102 if (nd->dentry->d_sb == configfs_sb) {
103 *target = configfs_get_config_item(nd->dentry);
104 if (!*target) {
105 ret = -ENOENT;
106 path_release(nd);
107 }
108 } else
109 ret = -EPERM;
110 }
111
112 return ret;
113}
114
115
116int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
117{
118 int ret;
119 struct nameidata nd;
120 struct config_item *parent_item;
121 struct config_item *target_item;
122 struct config_item_type *type;
123
124 ret = -EPERM; /* What lack-of-symlink returns */
125 if (dentry->d_parent == configfs_sb->s_root)
126 goto out;
127
128 parent_item = configfs_get_config_item(dentry->d_parent);
129 type = parent_item->ci_type;
130
131 if (!type || !type->ct_item_ops ||
132 !type->ct_item_ops->allow_link)
133 goto out_put;
134
135 ret = get_target(symname, &nd, &target_item);
136 if (ret)
137 goto out_put;
138
139 ret = type->ct_item_ops->allow_link(parent_item, target_item);
140 if (!ret)
141 ret = create_link(parent_item, target_item, dentry);
142
143 config_item_put(target_item);
144 path_release(&nd);
145
146out_put:
147 config_item_put(parent_item);
148
149out:
150 return ret;
151}
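
/*
 * Whether a given symlink is permitted is the client's decision via
 * ct_item_ops->allow_link(); a hedged sketch (hypothetical names):
 */
#if 0
static int my_allow_link(struct config_item *src,
			 struct config_item *target)
{
	/* e.g. only accept targets of our own item type */
	return (target->ci_type == &my_type) ? 0 : -EPERM;
}

static struct configfs_item_operations my_link_ops = {
	.allow_link = my_allow_link,
	/* .drop_link would undo whatever allow_link set up */
};
#endif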
152
153int configfs_unlink(struct inode *dir, struct dentry *dentry)
154{
155 struct configfs_dirent *sd = dentry->d_fsdata;
156 struct configfs_symlink *sl;
157 struct config_item *parent_item;
158 struct config_item_type *type;
159 int ret;
160
161 ret = -EPERM; /* What lack-of-symlink returns */
162 if (!(sd->s_type & CONFIGFS_ITEM_LINK))
163 goto out;
164
165 if (dentry->d_parent == configfs_sb->s_root)
166 BUG();
167
168 sl = sd->s_element;
169
170 parent_item = configfs_get_config_item(dentry->d_parent);
171 type = parent_item->ci_type;
172
173 list_del_init(&sd->s_sibling);
174 configfs_drop_dentry(sd, dentry->d_parent);
175 dput(dentry);
176 configfs_put(sd);
177
178 /*
179 * drop_link() must be called before
180 * list_del_init(&sl->sl_list), so that the order of
181 * drop_link(this, target) and drop_item(target) is preserved.
182 */
183 if (type && type->ct_item_ops &&
184 type->ct_item_ops->drop_link)
185 type->ct_item_ops->drop_link(parent_item,
186 sl->sl_target);
187
188 /* FIXME: Needs lock */
189 list_del_init(&sl->sl_list);
190
191 /* Put reference from create_link() */
192 config_item_put(sl->sl_target);
193 kfree(sl);
194
195 config_item_put(parent_item);
196
197 ret = 0;
198
199out:
200 return ret;
201}
202
203static int configfs_get_target_path(struct config_item * item, struct config_item * target,
204 char *path)
205{
206 char * s;
207 int depth, size;
208
209 depth = item_depth(item);
210 size = item_path_length(target) + depth * 3 - 1;
211 if (size > PATH_MAX)
212 return -ENAMETOOLONG;
213
214 pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
215
216 for (s = path; depth--; s += 3)
217 strcpy(s,"../");
218
219 fill_item_path(target, path, size);
220 pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
221
222 return 0;
223}
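
/*
 * Worked example (sketch): a link item at depth 2 pointing at target
 * path "a/b" gets depth * 3 == 6 bytes of "../" prefix, so the
 * resulting link body is "../../a/b" (9 characters plus NUL, matching
 * size = item_path_length(target) + depth * 3 - 1 = 10).
 */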
224
225static int configfs_getlink(struct dentry *dentry, char * path)
226{
227 struct config_item *item, *target_item;
228 int error = 0;
229
230 item = configfs_get_config_item(dentry->d_parent);
231 if (!item)
232 return -EINVAL;
233
234 target_item = configfs_get_config_item(dentry);
235 if (!target_item) {
236 config_item_put(item);
237 return -EINVAL;
238 }
239
240 down_read(&configfs_rename_sem);
241 error = configfs_get_target_path(item, target_item, path);
242 up_read(&configfs_rename_sem);
243
244 config_item_put(item);
245 config_item_put(target_item);
246 return error;
247
248}
249
250static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
251{
252 int error = -ENOMEM;
253 unsigned long page = get_zeroed_page(GFP_KERNEL);
254
255 if (page) {
256 error = configfs_getlink(dentry, (char *)page);
257 if (!error) {
258 nd_set_link(nd, (char *)page);
259 return (void *)page;
260 }
261 }
262
263 nd_set_link(nd, ERR_PTR(error));
264 return NULL;
265}
266
267static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
268 void *cookie)
269{
270 if (cookie) {
271 unsigned long page = (unsigned long)cookie;
272 free_page(page);
273 }
274}
275
276struct inode_operations configfs_symlink_inode_operations = {
277 .follow_link = configfs_follow_link,
278 .readlink = generic_readlink,
279 .put_link = configfs_put_link,
280};
281
diff --git a/fs/exec.c b/fs/exec.c
index 22533cce0611..e75a9548da8e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -324,7 +324,7 @@ void install_arg_page(struct vm_area_struct *vma,
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
-	page_add_anon_rmap(page, vma, address);
+	page_add_new_anon_rmap(page, vma, address);
 	pte_unmap_unlock(pte, ptl);

 	/* no need for flush_tlb */
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8f873e621f41..e08ab4702d97 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -148,6 +148,26 @@ void fuse_release_background(struct fuse_req *req)
148 spin_unlock(&fuse_lock); 148 spin_unlock(&fuse_lock);
149} 149}
150 150
151static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
152{
153 int i;
154 struct fuse_init_out *arg = &req->misc.init_out;
155
156 if (arg->major != FUSE_KERNEL_VERSION)
157 fc->conn_error = 1;
158 else {
159 fc->minor = arg->minor;
160 fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
161 }
162
163 /* After INIT reply is received other requests can go
164 out. So do (FUSE_MAX_OUTSTANDING - 1) number of
165 up()s on outstanding_sem. The last up() is done in
166 fuse_putback_request() */
167 for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
168 up(&fc->outstanding_sem);
169}
170
151/* 171/*
152 * This function is called when a request is finished. Either a reply 172 * This function is called when a request is finished. Either a reply
153 * has arrived or it was interrupted (and not yet sent) or some error 173 * has arrived or it was interrupted (and not yet sent) or some error
@@ -172,19 +192,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
172 up_read(&fc->sbput_sem); 192 up_read(&fc->sbput_sem);
173 } 193 }
174 wake_up(&req->waitq); 194 wake_up(&req->waitq);
175 if (req->in.h.opcode == FUSE_INIT) { 195 if (req->in.h.opcode == FUSE_INIT)
176 int i; 196 process_init_reply(fc, req);
177 197 else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
178 if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
179 fc->conn_error = 1;
180
181 /* After INIT reply is received other requests can go
182 out. So do (FUSE_MAX_OUTSTANDING - 1) number of
183 up()s on outstanding_sem. The last up() is done in
184 fuse_putback_request() */
185 for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
186 up(&fc->outstanding_sem);
187 } else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
188 /* Special case for failed iget in CREATE */ 198 /* Special case for failed iget in CREATE */
189 u64 nodeid = req->in.h.nodeid; 199 u64 nodeid = req->in.h.nodeid;
190 __fuse_get_request(req); 200 __fuse_get_request(req);
@@ -357,7 +367,7 @@ void fuse_send_init(struct fuse_conn *fc)
357 /* This is called from fuse_read_super() so there's guaranteed 367 /* This is called from fuse_read_super() so there's guaranteed
358 to be a request available */ 368 to be a request available */
359 struct fuse_req *req = do_get_request(fc); 369 struct fuse_req *req = do_get_request(fc);
360 struct fuse_init_in_out *arg = &req->misc.init_in_out; 370 struct fuse_init_in *arg = &req->misc.init_in;
361 arg->major = FUSE_KERNEL_VERSION; 371 arg->major = FUSE_KERNEL_VERSION;
362 arg->minor = FUSE_KERNEL_MINOR_VERSION; 372 arg->minor = FUSE_KERNEL_MINOR_VERSION;
363 req->in.h.opcode = FUSE_INIT; 373 req->in.h.opcode = FUSE_INIT;
@@ -365,8 +375,12 @@ void fuse_send_init(struct fuse_conn *fc)
365 req->in.args[0].size = sizeof(*arg); 375 req->in.args[0].size = sizeof(*arg);
366 req->in.args[0].value = arg; 376 req->in.args[0].value = arg;
367 req->out.numargs = 1; 377 req->out.numargs = 1;
368 req->out.args[0].size = sizeof(*arg); 378 /* Variable length argument used for backward compatibility
369 req->out.args[0].value = arg; 379 with interface version < 7.5. Rest of init_out is zeroed
380 by do_get_request(), so a short reply is not a problem */
381 req->out.argvar = 1;
382 req->out.args[0].size = sizeof(struct fuse_init_out);
383 req->out.args[0].value = &req->misc.init_out;
370 request_send_background(fc, req); 384 request_send_background(fc, req);
371} 385}
372 386
@@ -615,6 +629,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
615 struct fuse_copy_state cs; 629 struct fuse_copy_state cs;
616 unsigned reqsize; 630 unsigned reqsize;
617 631
632 restart:
618 spin_lock(&fuse_lock); 633 spin_lock(&fuse_lock);
619 fc = file->private_data; 634 fc = file->private_data;
620 err = -EPERM; 635 err = -EPERM;
@@ -630,20 +645,25 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
630 645
631 req = list_entry(fc->pending.next, struct fuse_req, list); 646 req = list_entry(fc->pending.next, struct fuse_req, list);
632 list_del_init(&req->list); 647 list_del_init(&req->list);
633 spin_unlock(&fuse_lock);
634 648
635 in = &req->in; 649 in = &req->in;
636 reqsize = req->in.h.len; 650 reqsize = in->h.len;
637 fuse_copy_init(&cs, 1, req, iov, nr_segs); 651 /* If request is too large, reply with an error and restart the read */
638 err = -EINVAL; 652 if (iov_length(iov, nr_segs) < reqsize) {
639 if (iov_length(iov, nr_segs) >= reqsize) { 653 req->out.h.error = -EIO;
640 err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); 654 /* SETXATTR is special, since it may contain too large data */
641 if (!err) 655 if (in->h.opcode == FUSE_SETXATTR)
642 err = fuse_copy_args(&cs, in->numargs, in->argpages, 656 req->out.h.error = -E2BIG;
643 (struct fuse_arg *) in->args, 0); 657 request_end(fc, req);
658 goto restart;
644 } 659 }
660 spin_unlock(&fuse_lock);
661 fuse_copy_init(&cs, 1, req, iov, nr_segs);
662 err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
663 if (!err)
664 err = fuse_copy_args(&cs, in->numargs, in->argpages,
665 (struct fuse_arg *) in->args, 0);
645 fuse_copy_finish(&cs); 666 fuse_copy_finish(&cs);
646
647 spin_lock(&fuse_lock); 667 spin_lock(&fuse_lock);
648 req->locked = 0; 668 req->locked = 0;
649 if (!err && req->interrupted) 669 if (!err && req->interrupted)
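The INIT handling above is the whole of FUSE's version negotiation: reject a major-version mismatch, record the server's minor version, and cap max_write at 4096 for pre-7.5 servers whose init reply did not carry that field yet. A compilable user-space sketch of the same rule (the struct and constant are simplified stand-ins for the <linux/fuse.h> definitions):

#include <stdio.h>

#define FUSE_KERNEL_VERSION 7           /* stand-in for the header constant */

struct init_out { unsigned major, minor, max_write; };

/* mirrors process_init_reply(): returns 0 on success, -1 on mismatch */
static int negotiate(const struct init_out *arg, unsigned *max_write)
{
        if (arg->major != FUSE_KERNEL_VERSION)
                return -1;              /* fc->conn_error = 1 */
        *max_write = arg->minor < 5 ? 4096 : arg->max_write;
        return 0;
}

int main(void)
{
        struct init_out v74 = { 7, 4, 65536 }, v75 = { 7, 5, 65536 };
        unsigned mw;

        if (!negotiate(&v74, &mw))
                printf("7.4 server: max_write=%u\n", mw);   /* 4096 */
        if (!negotiate(&v75, &mw))
                printf("7.5 server: max_write=%u\n", mw);   /* 65536 */
        return 0;
}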
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 51f5da652771..417bcee466f6 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -13,8 +13,16 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/namei.h> 15#include <linux/namei.h>
16#include <linux/mount.h>
17 16
17/*
18 * FUSE caches dentries and attributes with separate timeout. The
19 * time in jiffies until the dentry/attributes are valid is stored in
20 * dentry->d_time and fuse_inode->i_time respectively.
21 */
22
23/*
24 * Calculate the time in jiffies until a dentry/attributes are valid
25 */
18static inline unsigned long time_to_jiffies(unsigned long sec, 26static inline unsigned long time_to_jiffies(unsigned long sec,
19 unsigned long nsec) 27 unsigned long nsec)
20{ 28{
@@ -22,6 +30,50 @@ static inline unsigned long time_to_jiffies(unsigned long sec,
22 return jiffies + timespec_to_jiffies(&ts); 30 return jiffies + timespec_to_jiffies(&ts);
23} 31}
24 32
33/*
34 * Set dentry and possibly attribute timeouts from the lookup/mk*
35 * replies
36 */
37static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o)
38{
39 entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec);
40 if (entry->d_inode)
41 get_fuse_inode(entry->d_inode)->i_time =
42 time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
43}
44
45/*
46 * Mark the attributes as stale, so that at the next call to
47 * ->getattr() they will be fetched from userspace
48 */
49void fuse_invalidate_attr(struct inode *inode)
50{
51 get_fuse_inode(inode)->i_time = jiffies - 1;
52}
53
54/*
55 * Just mark the entry as stale, so that a next attempt to look it up
56 * will result in a new lookup call to userspace
57 *
58 * This is called when a dentry is about to become negative and the
59 * timeout is unknown (unlink, rmdir, rename and in some cases
60 * lookup)
61 */
62static void fuse_invalidate_entry_cache(struct dentry *entry)
63{
64 entry->d_time = jiffies - 1;
65}
66
67/*
68 * Same as fuse_invalidate_entry_cache(), but also try to remove the
69 * dentry from the hash
70 */
71static void fuse_invalidate_entry(struct dentry *entry)
72{
73 d_invalidate(entry);
74 fuse_invalidate_entry_cache(entry);
75}
76
25static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, 77static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
26 struct dentry *entry, 78 struct dentry *entry,
27 struct fuse_entry_out *outarg) 79 struct fuse_entry_out *outarg)
@@ -37,17 +89,34 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
37 req->out.args[0].value = outarg; 89 req->out.args[0].value = outarg;
38} 90}
39 91
92/*
93 * Check whether the dentry is still valid
94 *
95 * If the entry validity timeout has expired and the dentry is
96 * positive, try to redo the lookup. If the lookup results in a
97 * different inode, then let the VFS invalidate the dentry and redo
98 * the lookup once more. If the lookup results in the same inode,
99 * then refresh the attributes, timeouts and mark the dentry valid.
100 */
40static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) 101static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
41{ 102{
42 if (!entry->d_inode || is_bad_inode(entry->d_inode)) 103 struct inode *inode = entry->d_inode;
104
105 if (inode && is_bad_inode(inode))
43 return 0; 106 return 0;
44 else if (time_after(jiffies, entry->d_time)) { 107 else if (time_after(jiffies, entry->d_time)) {
45 int err; 108 int err;
46 struct fuse_entry_out outarg; 109 struct fuse_entry_out outarg;
47 struct inode *inode = entry->d_inode; 110 struct fuse_conn *fc;
48 struct fuse_inode *fi = get_fuse_inode(inode); 111 struct fuse_req *req;
49 struct fuse_conn *fc = get_fuse_conn(inode); 112
50 struct fuse_req *req = fuse_get_request(fc); 113 /* Doesn't hurt to "reset" the validity timeout */
114 fuse_invalidate_entry_cache(entry);
115 if (!inode)
116 return 0;
117
118 fc = get_fuse_conn(inode);
119 req = fuse_get_request(fc);
51 if (!req) 120 if (!req)
52 return 0; 121 return 0;
53 122
@@ -55,6 +124,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
55 request_send(fc, req); 124 request_send(fc, req);
56 err = req->out.h.error; 125 err = req->out.h.error;
57 if (!err) { 126 if (!err) {
127 struct fuse_inode *fi = get_fuse_inode(inode);
58 if (outarg.nodeid != get_node_id(inode)) { 128 if (outarg.nodeid != get_node_id(inode)) {
59 fuse_send_forget(fc, req, outarg.nodeid, 1); 129 fuse_send_forget(fc, req, outarg.nodeid, 1);
60 return 0; 130 return 0;
@@ -66,18 +136,18 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
66 return 0; 136 return 0;
67 137
68 fuse_change_attributes(inode, &outarg.attr); 138 fuse_change_attributes(inode, &outarg.attr);
69 entry->d_time = time_to_jiffies(outarg.entry_valid, 139 fuse_change_timeout(entry, &outarg);
70 outarg.entry_valid_nsec);
71 fi->i_time = time_to_jiffies(outarg.attr_valid,
72 outarg.attr_valid_nsec);
73 } 140 }
74 return 1; 141 return 1;
75} 142}
76 143
144/*
145 * Check if there's already a hashed alias of this directory inode.
146 * If yes, then lookup and mkdir must not create a new alias.
147 */
77static int dir_alias(struct inode *inode) 148static int dir_alias(struct inode *inode)
78{ 149{
79 if (S_ISDIR(inode->i_mode)) { 150 if (S_ISDIR(inode->i_mode)) {
80 /* Don't allow creating an alias to a directory */
81 struct dentry *alias = d_find_alias(inode); 151 struct dentry *alias = d_find_alias(inode);
82 if (alias) { 152 if (alias) {
83 dput(alias); 153 dput(alias);
@@ -96,8 +166,14 @@ static struct dentry_operations fuse_dentry_operations = {
96 .d_revalidate = fuse_dentry_revalidate, 166 .d_revalidate = fuse_dentry_revalidate,
97}; 167};
98 168
99static int fuse_lookup_iget(struct inode *dir, struct dentry *entry, 169static inline int valid_mode(int m)
100 struct inode **inodep) 170{
171 return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
172 S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
173}
174
175static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
176 struct nameidata *nd)
101{ 177{
102 int err; 178 int err;
103 struct fuse_entry_out outarg; 179 struct fuse_entry_out outarg;
@@ -106,53 +182,49 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
106 struct fuse_req *req; 182 struct fuse_req *req;
107 183
108 if (entry->d_name.len > FUSE_NAME_MAX) 184 if (entry->d_name.len > FUSE_NAME_MAX)
109 return -ENAMETOOLONG; 185 return ERR_PTR(-ENAMETOOLONG);
110 186
111 req = fuse_get_request(fc); 187 req = fuse_get_request(fc);
112 if (!req) 188 if (!req)
113 return -EINTR; 189 return ERR_PTR(-EINTR);
114 190
115 fuse_lookup_init(req, dir, entry, &outarg); 191 fuse_lookup_init(req, dir, entry, &outarg);
116 request_send(fc, req); 192 request_send(fc, req);
117 err = req->out.h.error; 193 err = req->out.h.error;
118 if (!err && invalid_nodeid(outarg.nodeid)) 194 if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) ||
195 !valid_mode(outarg.attr.mode)))
119 err = -EIO; 196 err = -EIO;
120 if (!err) { 197 if (!err && outarg.nodeid) {
121 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 198 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
122 &outarg.attr); 199 &outarg.attr);
123 if (!inode) { 200 if (!inode) {
124 fuse_send_forget(fc, req, outarg.nodeid, 1); 201 fuse_send_forget(fc, req, outarg.nodeid, 1);
125 return -ENOMEM; 202 return ERR_PTR(-ENOMEM);
126 } 203 }
127 } 204 }
128 fuse_put_request(fc, req); 205 fuse_put_request(fc, req);
129 if (err && err != -ENOENT) 206 if (err && err != -ENOENT)
130 return err; 207 return ERR_PTR(err);
131 208
132 if (inode) { 209 if (inode && dir_alias(inode)) {
133 struct fuse_inode *fi = get_fuse_inode(inode); 210 iput(inode);
134 entry->d_time = time_to_jiffies(outarg.entry_valid, 211 return ERR_PTR(-EIO);
135 outarg.entry_valid_nsec);
136 fi->i_time = time_to_jiffies(outarg.attr_valid,
137 outarg.attr_valid_nsec);
138 } 212 }
139 213 d_add(entry, inode);
140 entry->d_op = &fuse_dentry_operations; 214 entry->d_op = &fuse_dentry_operations;
141 *inodep = inode; 215 if (!err)
142 return 0; 216 fuse_change_timeout(entry, &outarg);
143} 217 else
144 218 fuse_invalidate_entry_cache(entry);
145void fuse_invalidate_attr(struct inode *inode) 219 return NULL;
146{
147 get_fuse_inode(inode)->i_time = jiffies - 1;
148}
149
150static void fuse_invalidate_entry(struct dentry *entry)
151{
152 d_invalidate(entry);
153 entry->d_time = jiffies - 1;
154} 220}
155 221
222/*
223 * Atomic create+open operation
224 *
225 * If the filesystem doesn't support this, then fall back to separate
226 * 'mknod' + 'open' requests.
227 */
156static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, 228static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
157 struct nameidata *nd) 229 struct nameidata *nd)
158{ 230{
@@ -163,7 +235,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
163 struct fuse_open_in inarg; 235 struct fuse_open_in inarg;
164 struct fuse_open_out outopen; 236 struct fuse_open_out outopen;
165 struct fuse_entry_out outentry; 237 struct fuse_entry_out outentry;
166 struct fuse_inode *fi;
167 struct fuse_file *ff; 238 struct fuse_file *ff;
168 struct file *file; 239 struct file *file;
169 int flags = nd->intent.open.flags - 1; 240 int flags = nd->intent.open.flags - 1;
@@ -172,10 +243,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
172 if (fc->no_create) 243 if (fc->no_create)
173 goto out; 244 goto out;
174 245
175 err = -ENAMETOOLONG;
176 if (entry->d_name.len > FUSE_NAME_MAX)
177 goto out;
178
179 err = -EINTR; 246 err = -EINTR;
180 req = fuse_get_request(fc); 247 req = fuse_get_request(fc);
181 if (!req) 248 if (!req)
@@ -220,17 +287,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
220 if (!inode) { 287 if (!inode) {
221 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 288 flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
222 ff->fh = outopen.fh; 289 ff->fh = outopen.fh;
290 /* Special release, with inode = NULL, this will
291 trigger a 'forget' request when the release is
292 complete */
223 fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0); 293 fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
224 goto out_put_request; 294 goto out_put_request;
225 } 295 }
226 fuse_put_request(fc, req); 296 fuse_put_request(fc, req);
227 entry->d_time = time_to_jiffies(outentry.entry_valid,
228 outentry.entry_valid_nsec);
229 fi = get_fuse_inode(inode);
230 fi->i_time = time_to_jiffies(outentry.attr_valid,
231 outentry.attr_valid_nsec);
232
233 d_instantiate(entry, inode); 297 d_instantiate(entry, inode);
298 fuse_change_timeout(entry, &outentry);
234 file = lookup_instantiate_filp(nd, entry, generic_file_open); 299 file = lookup_instantiate_filp(nd, entry, generic_file_open);
235 if (IS_ERR(file)) { 300 if (IS_ERR(file)) {
236 ff->fh = outopen.fh; 301 ff->fh = outopen.fh;
@@ -248,13 +313,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
248 return err; 313 return err;
249} 314}
250 315
316/*
317 * Code shared between mknod, mkdir, symlink and link
318 */
251static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, 319static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
252 struct inode *dir, struct dentry *entry, 320 struct inode *dir, struct dentry *entry,
253 int mode) 321 int mode)
254{ 322{
255 struct fuse_entry_out outarg; 323 struct fuse_entry_out outarg;
256 struct inode *inode; 324 struct inode *inode;
257 struct fuse_inode *fi;
258 int err; 325 int err;
259 326
260 req->in.h.nodeid = get_node_id(dir); 327 req->in.h.nodeid = get_node_id(dir);
@@ -268,10 +335,13 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
268 fuse_put_request(fc, req); 335 fuse_put_request(fc, req);
269 return err; 336 return err;
270 } 337 }
271 if (invalid_nodeid(outarg.nodeid)) { 338 err = -EIO;
272 fuse_put_request(fc, req); 339 if (invalid_nodeid(outarg.nodeid))
273 return -EIO; 340 goto out_put_request;
274 } 341
342 if ((outarg.attr.mode ^ mode) & S_IFMT)
343 goto out_put_request;
344
275 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 345 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
276 &outarg.attr); 346 &outarg.attr);
277 if (!inode) { 347 if (!inode) {
@@ -280,22 +350,19 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
280 } 350 }
281 fuse_put_request(fc, req); 351 fuse_put_request(fc, req);
282 352
283 /* Don't allow userspace to do really stupid things... */ 353 if (dir_alias(inode)) {
284 if (((inode->i_mode ^ mode) & S_IFMT) || dir_alias(inode)) {
285 iput(inode); 354 iput(inode);
286 return -EIO; 355 return -EIO;
287 } 356 }
288 357
289 entry->d_time = time_to_jiffies(outarg.entry_valid,
290 outarg.entry_valid_nsec);
291
292 fi = get_fuse_inode(inode);
293 fi->i_time = time_to_jiffies(outarg.attr_valid,
294 outarg.attr_valid_nsec);
295
296 d_instantiate(entry, inode); 358 d_instantiate(entry, inode);
359 fuse_change_timeout(entry, &outarg);
297 fuse_invalidate_attr(dir); 360 fuse_invalidate_attr(dir);
298 return 0; 361 return 0;
362
363 out_put_request:
364 fuse_put_request(fc, req);
365 return err;
299} 366}
300 367
301static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, 368static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
@@ -355,12 +422,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
355{ 422{
356 struct fuse_conn *fc = get_fuse_conn(dir); 423 struct fuse_conn *fc = get_fuse_conn(dir);
357 unsigned len = strlen(link) + 1; 424 unsigned len = strlen(link) + 1;
358 struct fuse_req *req; 425 struct fuse_req *req = fuse_get_request(fc);
359
360 if (len > FUSE_SYMLINK_MAX)
361 return -ENAMETOOLONG;
362
363 req = fuse_get_request(fc);
364 if (!req) 426 if (!req)
365 return -EINTR; 427 return -EINTR;
366 428
@@ -399,6 +461,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
399 inode->i_nlink = 0; 461 inode->i_nlink = 0;
400 fuse_invalidate_attr(inode); 462 fuse_invalidate_attr(inode);
401 fuse_invalidate_attr(dir); 463 fuse_invalidate_attr(dir);
464 fuse_invalidate_entry_cache(entry);
402 } else if (err == -EINTR) 465 } else if (err == -EINTR)
403 fuse_invalidate_entry(entry); 466 fuse_invalidate_entry(entry);
404 return err; 467 return err;
@@ -424,6 +487,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
424 if (!err) { 487 if (!err) {
425 entry->d_inode->i_nlink = 0; 488 entry->d_inode->i_nlink = 0;
426 fuse_invalidate_attr(dir); 489 fuse_invalidate_attr(dir);
490 fuse_invalidate_entry_cache(entry);
427 } else if (err == -EINTR) 491 } else if (err == -EINTR)
428 fuse_invalidate_entry(entry); 492 fuse_invalidate_entry(entry);
429 return err; 493 return err;
@@ -459,6 +523,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
459 fuse_invalidate_attr(olddir); 523 fuse_invalidate_attr(olddir);
460 if (olddir != newdir) 524 if (olddir != newdir)
461 fuse_invalidate_attr(newdir); 525 fuse_invalidate_attr(newdir);
526
527 /* newent will end up negative */
528 if (newent->d_inode)
529 fuse_invalidate_entry_cache(newent);
462 } else if (err == -EINTR) { 530 } else if (err == -EINTR) {
463 /* If request was interrupted, DEITY only knows if the 531 /* If request was interrupted, DEITY only knows if the
464 rename actually took place. If the invalidation 532 rename actually took place. If the invalidation
@@ -566,6 +634,15 @@ static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
566 return 0; 634 return 0;
567} 635}
568 636
637/*
638 * Check whether the inode attributes are still valid
639 *
640 * If the attribute validity timeout has expired, then fetch the fresh
641 * attributes with a 'getattr' request
642 *
643 * I'm not sure why cached attributes are never returned for the root
644 * inode; this is probably being too cautious.
645 */
569static int fuse_revalidate(struct dentry *entry) 646static int fuse_revalidate(struct dentry *entry)
570{ 647{
571 struct inode *inode = entry->d_inode; 648 struct inode *inode = entry->d_inode;
@@ -613,6 +690,19 @@ static int fuse_access(struct inode *inode, int mask)
613 return err; 690 return err;
614} 691}
615 692
693/*
694 * Check permission. The two basic access models of FUSE are:
695 *
696 * 1) Local access checking ('default_permissions' mount option) based
697 * on file mode. This is the plain old disk filesystem permission
698 * model.
699 *
700 * 2) "Remote" access checking, where server is responsible for
701 * checking permission in each inode operation. An exception to this
702 * is if ->permission() was invoked from sys_access() in which case an
703 * access request is sent. Execute permission is still checked
704 * locally based on file mode.
705 */
616static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) 706static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
617{ 707{
618 struct fuse_conn *fc = get_fuse_conn(inode); 708 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -631,14 +721,10 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
631 err = generic_permission(inode, mask, NULL); 721 err = generic_permission(inode, mask, NULL);
632 } 722 }
633 723
634 /* FIXME: Need some mechanism to revoke permissions: 724 /* Note: the opposite of the above test does not
635 currently if the filesystem suddenly changes the 725 exist. So if permissions are revoked this won't be
636 file mode, we will not be informed about it, and 726 noticed immediately, only after the attribute
637 continue to allow access to the file/directory. 727 timeout has expired */
638
639 This is actually not so grave, since the user can
640 simply keep access to the file/directory anyway by
641 keeping it open... */
642 728
643 return err; 729 return err;
644 } else { 730 } else {
@@ -691,7 +777,12 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
691 struct page *page; 777 struct page *page;
692 struct inode *inode = file->f_dentry->d_inode; 778 struct inode *inode = file->f_dentry->d_inode;
693 struct fuse_conn *fc = get_fuse_conn(inode); 779 struct fuse_conn *fc = get_fuse_conn(inode);
694 struct fuse_req *req = fuse_get_request(fc); 780 struct fuse_req *req;
781
782 if (is_bad_inode(inode))
783 return -EIO;
784
785 req = fuse_get_request(fc);
695 if (!req) 786 if (!req)
696 return -EINTR; 787 return -EINTR;
697 788
@@ -806,6 +897,15 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
806 } 897 }
807} 898}
808 899
900/*
901 * Set attributes, and at the same time refresh them.
902 *
903 * Truncation is slightly complicated, because the 'truncate' request
904 * may fail, in which case we don't want to touch the mapping.
905 * vmtruncate() doesn't allow for this case. So do the rlimit
906 * checking by hand and call vmtruncate() only after the file has
907 * actually been truncated.
908 */
809static int fuse_setattr(struct dentry *entry, struct iattr *attr) 909static int fuse_setattr(struct dentry *entry, struct iattr *attr)
810{ 910{
811 struct inode *inode = entry->d_inode; 911 struct inode *inode = entry->d_inode;
@@ -883,23 +983,6 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
883 return err; 983 return err;
884} 984}
885 985
886static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
887 struct nameidata *nd)
888{
889 struct inode *inode;
890 int err;
891
892 err = fuse_lookup_iget(dir, entry, &inode);
893 if (err)
894 return ERR_PTR(err);
895 if (inode && dir_alias(inode)) {
896 iput(inode);
897 return ERR_PTR(-EIO);
898 }
899 d_add(entry, inode);
900 return NULL;
901}
902
903static int fuse_setxattr(struct dentry *entry, const char *name, 986static int fuse_setxattr(struct dentry *entry, const char *name,
904 const void *value, size_t size, int flags) 987 const void *value, size_t size, int flags)
905{ 988{
@@ -909,9 +992,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
909 struct fuse_setxattr_in inarg; 992 struct fuse_setxattr_in inarg;
910 int err; 993 int err;
911 994
912 if (size > FUSE_XATTR_SIZE_MAX)
913 return -E2BIG;
914
915 if (fc->no_setxattr) 995 if (fc->no_setxattr)
916 return -EOPNOTSUPP; 996 return -EOPNOTSUPP;
917 997
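All of the timeout plumbing added to dir.c above reduces to one idea: store an absolute deadline (in jiffies) per dentry and per inode, and invalidate by backdating the deadline to jiffies - 1. A rough user-space model of the scheme (tick counter and HZ are stand-ins; the kernel's time_after() is additionally overflow-safe, which a plain compare is not):

#include <stdio.h>

#define HZ 100UL
static unsigned long jiffies;           /* stand-in for the kernel tick */

/* mirrors time_to_jiffies(): validity period -> absolute deadline */
static unsigned long time_to_ticks(unsigned long sec, unsigned long nsec)
{
        return jiffies + sec * HZ + nsec / (1000000000UL / HZ);
}

int main(void)
{
        unsigned long d_time = time_to_ticks(1, 0);     /* valid for 1s */

        jiffies += 50;
        printf("+0.5s: expired=%d\n", jiffies > d_time);        /* 0 */
        jiffies += 100;
        printf("+1.5s: expired=%d\n", jiffies > d_time);        /* 1 */

        d_time = jiffies - 1;   /* fuse_invalidate_entry_cache() trick */
        printf("after invalidate: expired=%d\n", jiffies > d_time); /* 1 */
        return 0;
}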
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2ca86141d13a..05dedddf4289 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -163,6 +163,9 @@ static int fuse_flush(struct file *file)
163 struct fuse_flush_in inarg; 163 struct fuse_flush_in inarg;
164 int err; 164 int err;
165 165
166 if (is_bad_inode(inode))
167 return -EIO;
168
166 if (fc->no_flush) 169 if (fc->no_flush)
167 return 0; 170 return 0;
168 171
@@ -199,6 +202,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
199 struct fuse_fsync_in inarg; 202 struct fuse_fsync_in inarg;
200 int err; 203 int err;
201 204
205 if (is_bad_inode(inode))
206 return -EIO;
207
202 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) 208 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
203 return 0; 209 return 0;
204 210
@@ -272,16 +278,22 @@ static int fuse_readpage(struct file *file, struct page *page)
272{ 278{
273 struct inode *inode = page->mapping->host; 279 struct inode *inode = page->mapping->host;
274 struct fuse_conn *fc = get_fuse_conn(inode); 280 struct fuse_conn *fc = get_fuse_conn(inode);
275 loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT; 281 struct fuse_req *req;
276 struct fuse_req *req = fuse_get_request(fc); 282 int err;
277 int err = -EINTR; 283
284 err = -EIO;
285 if (is_bad_inode(inode))
286 goto out;
287
288 err = -EINTR;
289 req = fuse_get_request(fc);
278 if (!req) 290 if (!req)
279 goto out; 291 goto out;
280 292
281 req->out.page_zeroing = 1; 293 req->out.page_zeroing = 1;
282 req->num_pages = 1; 294 req->num_pages = 1;
283 req->pages[0] = page; 295 req->pages[0] = page;
284 fuse_send_read(req, file, inode, pos, PAGE_CACHE_SIZE); 296 fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE);
285 err = req->out.h.error; 297 err = req->out.h.error;
286 fuse_put_request(fc, req); 298 fuse_put_request(fc, req);
287 if (!err) 299 if (!err)
@@ -295,7 +307,7 @@ static int fuse_readpage(struct file *file, struct page *page)
295static int fuse_send_readpages(struct fuse_req *req, struct file *file, 307static int fuse_send_readpages(struct fuse_req *req, struct file *file,
296 struct inode *inode) 308 struct inode *inode)
297{ 309{
298 loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT; 310 loff_t pos = page_offset(req->pages[0]);
299 size_t count = req->num_pages << PAGE_CACHE_SHIFT; 311 size_t count = req->num_pages << PAGE_CACHE_SHIFT;
300 unsigned i; 312 unsigned i;
301 req->out.page_zeroing = 1; 313 req->out.page_zeroing = 1;
@@ -345,6 +357,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
345 struct fuse_conn *fc = get_fuse_conn(inode); 357 struct fuse_conn *fc = get_fuse_conn(inode);
346 struct fuse_readpages_data data; 358 struct fuse_readpages_data data;
347 int err; 359 int err;
360
361 if (is_bad_inode(inode))
362 return -EIO;
363
348 data.file = file; 364 data.file = file;
349 data.inode = inode; 365 data.inode = inode;
350 data.req = fuse_get_request(fc); 366 data.req = fuse_get_request(fc);
@@ -402,8 +418,13 @@ static int fuse_commit_write(struct file *file, struct page *page,
402 unsigned count = to - offset; 418 unsigned count = to - offset;
403 struct inode *inode = page->mapping->host; 419 struct inode *inode = page->mapping->host;
404 struct fuse_conn *fc = get_fuse_conn(inode); 420 struct fuse_conn *fc = get_fuse_conn(inode);
405 loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset; 421 loff_t pos = page_offset(page) + offset;
406 struct fuse_req *req = fuse_get_request(fc); 422 struct fuse_req *req;
423
424 if (is_bad_inode(inode))
425 return -EIO;
426
427 req = fuse_get_request(fc);
407 if (!req) 428 if (!req)
408 return -EINTR; 429 return -EINTR;
409 430
@@ -454,7 +475,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
454 475
455 nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 476 nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
456 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 477 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
457 npages = min(npages, FUSE_MAX_PAGES_PER_REQ); 478 npages = min(max(npages, 1), FUSE_MAX_PAGES_PER_REQ);
458 down_read(&current->mm->mmap_sem); 479 down_read(&current->mm->mmap_sem);
459 npages = get_user_pages(current, current->mm, user_addr, npages, write, 480 npages = get_user_pages(current, current->mm, user_addr, npages, write,
460 0, req->pages, NULL); 481 0, req->pages, NULL);
@@ -475,12 +496,16 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
475 size_t nmax = write ? fc->max_write : fc->max_read; 496 size_t nmax = write ? fc->max_write : fc->max_read;
476 loff_t pos = *ppos; 497 loff_t pos = *ppos;
477 ssize_t res = 0; 498 ssize_t res = 0;
478 struct fuse_req *req = fuse_get_request(fc); 499 struct fuse_req *req;
500
501 if (is_bad_inode(inode))
502 return -EIO;
503
504 req = fuse_get_request(fc);
479 if (!req) 505 if (!req)
480 return -EINTR; 506 return -EINTR;
481 507
482 while (count) { 508 while (count) {
483 size_t tmp;
484 size_t nres; 509 size_t nres;
485 size_t nbytes = min(count, nmax); 510 size_t nbytes = min(count, nmax);
486 int err = fuse_get_user_pages(req, buf, nbytes, !write); 511 int err = fuse_get_user_pages(req, buf, nbytes, !write);
@@ -488,8 +513,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
488 res = err; 513 res = err;
489 break; 514 break;
490 } 515 }
491 tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset; 516 nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
492 nbytes = min(nbytes, tmp); 517 nbytes = min(count, nbytes);
493 if (write) 518 if (write)
494 nres = fuse_send_write(req, file, inode, pos, nbytes); 519 nres = fuse_send_write(req, file, inode, pos, nbytes);
495 else 520 else
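Two small arithmetic points from the file.c hunks above. page_offset(page) is exactly the open-coded (loff_t) page->index << PAGE_CACHE_SHIFT it replaces, and the new max(npages, 1) in fuse_get_user_pages() keeps a zero-length transfer from computing zero pages. Both checked in user space (constants chosen to match the common 4 KiB configuration; illustrative only):

#include <stdio.h>

#define PAGE_SHIFT 12                   /* 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define FUSE_MAX_PAGES_PER_REQ 32

static long min_l(long a, long b) { return a < b ? a : b; }
static long max_l(long a, long b) { return a > b ? a : b; }

int main(void)
{
        unsigned long index = 3;
        printf("page_offset(3) = %lld\n", (long long)index << PAGE_SHIFT);

        unsigned long nbytes = 0, offset = 0;   /* zero-length request */
        long npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
        printf("raw npages = %ld\n", npages);                   /* 0 */
        npages = min_l(max_l(npages, 1), FUSE_MAX_PAGES_PER_REQ);
        printf("clamped npages = %ld\n", npages);               /* 1 */
        return 0;
}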
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0ea5301f86be..74c8d098a14a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,9 @@
21/** If more requests are outstanding, then the operation will block */ 21/** If more requests are outstanding, then the operation will block */
22#define FUSE_MAX_OUTSTANDING 10 22#define FUSE_MAX_OUTSTANDING 10
23 23
24/** It could be as large as PATH_MAX, but would that have any uses? */
25#define FUSE_NAME_MAX 1024
26
24/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem 27/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
25 module will check permissions based on the file mode. Otherwise no 28 module will check permissions based on the file mode. Otherwise no
26 permission checking is done in the kernel */ 29 permission checking is done in the kernel */
@@ -108,9 +111,6 @@ struct fuse_out {
108 struct fuse_arg args[3]; 111 struct fuse_arg args[3];
109}; 112};
110 113
111struct fuse_req;
112struct fuse_conn;
113
114/** 114/**
115 * A request to the client 115 * A request to the client
116 */ 116 */
@@ -159,7 +159,8 @@ struct fuse_req {
159 union { 159 union {
160 struct fuse_forget_in forget_in; 160 struct fuse_forget_in forget_in;
161 struct fuse_release_in release_in; 161 struct fuse_release_in release_in;
162 struct fuse_init_in_out init_in_out; 162 struct fuse_init_in init_in;
163 struct fuse_init_out init_out;
163 } misc; 164 } misc;
164 165
165 /** page vector */ 166 /** page vector */
@@ -272,6 +273,9 @@ struct fuse_conn {
272 /** Is create not implemented by fs? */ 273 /** Is create not implemented by fs? */
273 unsigned no_create : 1; 274 unsigned no_create : 1;
274 275
276 /** Negotiated minor version */
277 unsigned minor;
278
275 /** Backing dev info */ 279 /** Backing dev info */
276 struct backing_dev_info bdi; 280 struct backing_dev_info bdi;
277}; 281};
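Splitting init_in_out into separate init_in and init_out members of the same union deserves a second look: the INIT request's input and output now overlap in memory. That is presumably safe only because the input has already been copied out to the server before the reply is copied back into the same bytes. A toy model of the union layout (simplified structs, not the real ABI):

#include <stdio.h>

struct init_in  { unsigned major, minor; };
struct init_out { unsigned major, minor, max_write; };

/* toy mirror of struct fuse_req: one payload kind per request,
 * so the variants can share storage */
struct req {
        union {
                struct init_in  init_in;
                struct init_out init_out;
        } misc;
};

int main(void)
{
        struct req r = { .misc.init_in = { 7, 5 } };

        /* the request body is read out first... */
        printf("sent major=%u minor=%u\n",
               r.misc.init_in.major, r.misc.init_in.minor);
        /* ...and only then does the reply overwrite the same bytes */
        r.misc.init_out = (struct init_out){ 7, 5, 65536 };
        printf("reply max_write=%u\n", r.misc.init_out.max_write);
        return 0;
}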
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e69a546844d0..04c80cc957a3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -135,12 +135,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
135 fuse_init_common(inode); 135 fuse_init_common(inode);
136 init_special_inode(inode, inode->i_mode, 136 init_special_inode(inode, inode->i_mode,
137 new_decode_dev(attr->rdev)); 137 new_decode_dev(attr->rdev));
138 } else { 138 } else
139 /* Don't let user create weird files */ 139 BUG();
140 inode->i_mode = S_IFREG;
141 fuse_init_common(inode);
142 fuse_init_file_inode(inode);
143 }
144} 140}
145 141
146static int fuse_inode_eq(struct inode *inode, void *_nodeidp) 142static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
@@ -218,6 +214,7 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
218{ 214{
219 stbuf->f_type = FUSE_SUPER_MAGIC; 215 stbuf->f_type = FUSE_SUPER_MAGIC;
220 stbuf->f_bsize = attr->bsize; 216 stbuf->f_bsize = attr->bsize;
217 stbuf->f_frsize = attr->frsize;
221 stbuf->f_blocks = attr->blocks; 218 stbuf->f_blocks = attr->blocks;
222 stbuf->f_bfree = attr->bfree; 219 stbuf->f_bfree = attr->bfree;
223 stbuf->f_bavail = attr->bavail; 220 stbuf->f_bavail = attr->bavail;
@@ -238,10 +235,12 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
238 if (!req) 235 if (!req)
239 return -EINTR; 236 return -EINTR;
240 237
238 memset(&outarg, 0, sizeof(outarg));
241 req->in.numargs = 0; 239 req->in.numargs = 0;
242 req->in.h.opcode = FUSE_STATFS; 240 req->in.h.opcode = FUSE_STATFS;
243 req->out.numargs = 1; 241 req->out.numargs = 1;
244 req->out.args[0].size = sizeof(outarg); 242 req->out.args[0].size =
243 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
245 req->out.args[0].value = &outarg; 244 req->out.args[0].value = &outarg;
246 request_send(fc, req); 245 request_send(fc, req);
247 err = req->out.h.error; 246 err = req->out.h.error;
@@ -482,7 +481,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
482 fc->max_read = d.max_read; 481 fc->max_read = d.max_read;
483 if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages) 482 if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages)
484 fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE; 483 fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE;
485 fc->max_write = FUSE_MAX_IN / 2;
486 484
487 err = -ENOMEM; 485 err = -ENOMEM;
488 root = get_root_inode(sb, d.rootmode); 486 root = get_root_inode(sb, d.rootmode);
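The statfs change is the first consumer of the negotiated minor version: a pre-7.4 server sends a shorter statfs body, so the kernel accepts only the compat size and relies on the newly added memset to zero the remaining fields (including the new f_frsize). A sketch of the size selection, with the two sizes as stated assumptions rather than quoted header values:

#include <stdio.h>

#define FUSE_COMPAT_STATFS_SIZE 48      /* assumed pre-7.4 body length */
#define FUSE_STATFS_OUT_SIZE    80      /* assumed full body length */

static unsigned reply_size(unsigned minor)
{
        return minor < 4 ? FUSE_COMPAT_STATFS_SIZE : FUSE_STATFS_OUT_SIZE;
}

int main(void)
{
        printf("minor 3 accepts %u bytes\n", reply_size(3));
        printf("minor 5 accepts %u bytes\n", reply_size(5));
        return 0;
}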
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 8093351bd7c3..6daaf7c755a6 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -320,7 +320,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
320 /* temporarily use utf8 to correctly find the hidden dir below */ 320 /* temporarily use utf8 to correctly find the hidden dir below */
321 nls = sbi->nls; 321 nls = sbi->nls;
322 sbi->nls = load_nls("utf8"); 322 sbi->nls = load_nls("utf8");
323 if (!nls) { 323 if (!sbi->nls) {
324 printk("HFS+: unable to load nls for utf8\n"); 324 printk("HFS+: unable to load nls for utf8\n");
325 err = -EINVAL; 325 err = -EINVAL;
326 goto cleanup; 326 goto cleanup;
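The hfsplus hunk is a classic check-the-wrong-variable fix: the old table is saved in nls before load_nls() is called, and the error path then tested the saved copy, which is almost always non-NULL, instead of the fresh result. Reduced to its essentials (load_nls() faked here for illustration):

#include <stdio.h>
#include <string.h>

/* fake load_nls(): succeeds only for "utf8" */
static const char *load_nls(const char *name)
{
        return strcmp(name, "utf8") ? NULL : name;
}

int main(void)
{
        const char *nls = "iso8859-1";          /* saved old table */
        const char *cur = load_nls("utf8");     /* freshly loaded table */

        if (!nls)       /* the buggy test: never fires here */
                printf("buggy check caught the failure\n");
        if (!cur)       /* the fixed test: checks what was just loaded */
                printf("fixed check caught the failure\n");
        else
                printf("loaded %s (old table %s still saved)\n", cur, nls);
        return 0;
}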
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c1cef3bb677..8c41315a6e42 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
100 loff_t len, vma_len; 100 loff_t len, vma_len;
101 int ret; 101 int ret;
102 102
103 if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
104 return -EINVAL;
105
106 if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1)) 103 if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
107 return -EINVAL; 104 return -EINVAL;
108 105
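The deleted test matched mappings that are writable but not shared, i.e. private copy-on-write mappings, which hugetlbfs used to refuse outright; its removal presumably corresponds to hugetlb pages gaining COW support in this merge window. The mask logic, checked in user space (flag values as in this era's <linux/mm.h>, used purely for illustration):

#include <stdio.h>

#define VM_WRITE    0x00000002UL
#define VM_MAYSHARE 0x00000080UL

/* the removed check: true only for private writable mappings */
static int old_reject(unsigned long vm_flags)
{
        return (vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE;
}

int main(void)
{
        printf("private+write rejected: %d\n", old_reject(VM_WRITE));
        printf("shared+write  rejected: %d\n",
               old_reject(VM_WRITE | VM_MAYSHARE));
        printf("private read-only rejected: %d\n", old_reject(0));
        return 0;
}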
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 014a51fd00d7..cb3cef525c3b 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,75 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25 25
26/* 26/*
27 * Unlink a buffer from a transaction. 27 * Unlink a buffer from a transaction checkpoint list.
28 * 28 *
29 * Called with j_list_lock held. 29 * Called with j_list_lock held.
30 */ 30 */
31 31
32static inline void __buffer_unlink(struct journal_head *jh) 32static void __buffer_unlink_first(struct journal_head *jh)
33{ 33{
34 transaction_t *transaction; 34 transaction_t *transaction;
35 35
36 transaction = jh->b_cp_transaction; 36 transaction = jh->b_cp_transaction;
37 jh->b_cp_transaction = NULL;
38 37
39 jh->b_cpnext->b_cpprev = jh->b_cpprev; 38 jh->b_cpnext->b_cpprev = jh->b_cpprev;
40 jh->b_cpprev->b_cpnext = jh->b_cpnext; 39 jh->b_cpprev->b_cpnext = jh->b_cpnext;
41 if (transaction->t_checkpoint_list == jh) 40 if (transaction->t_checkpoint_list == jh) {
42 transaction->t_checkpoint_list = jh->b_cpnext; 41 transaction->t_checkpoint_list = jh->b_cpnext;
43 if (transaction->t_checkpoint_list == jh) 42 if (transaction->t_checkpoint_list == jh)
44 transaction->t_checkpoint_list = NULL; 43 transaction->t_checkpoint_list = NULL;
44 }
45}
46
47/*
48 * Unlink a buffer from a transaction checkpoint(io) list.
49 *
50 * Called with j_list_lock held.
51 */
52
53static inline void __buffer_unlink(struct journal_head *jh)
54{
55 transaction_t *transaction;
56
57 transaction = jh->b_cp_transaction;
58
59 __buffer_unlink_first(jh);
60 if (transaction->t_checkpoint_io_list == jh) {
61 transaction->t_checkpoint_io_list = jh->b_cpnext;
62 if (transaction->t_checkpoint_io_list == jh)
63 transaction->t_checkpoint_io_list = NULL;
64 }
65}
66
67/*
68 * Move a buffer from the checkpoint list to the checkpoint io list
69 *
70 * Called with j_list_lock held
71 */
72
73static inline void __buffer_relink_io(struct journal_head *jh)
74{
75 transaction_t *transaction;
76
77 transaction = jh->b_cp_transaction;
78 __buffer_unlink_first(jh);
79
80 if (!transaction->t_checkpoint_io_list) {
81 jh->b_cpnext = jh->b_cpprev = jh;
82 } else {
83 jh->b_cpnext = transaction->t_checkpoint_io_list;
84 jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
85 jh->b_cpprev->b_cpnext = jh;
86 jh->b_cpnext->b_cpprev = jh;
87 }
88 transaction->t_checkpoint_io_list = jh;
45} 89}
46 90
47/* 91/*
48 * Try to release a checkpointed buffer from its transaction. 92 * Try to release a checkpointed buffer from its transaction.
49 * Returns 1 if we released it. 93 * Returns 1 if we released it and 2 if we also released the
94 * whole transaction.
95 *
50 * Requires j_list_lock 96 * Requires j_list_lock
51 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 97 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
52 */ 98 */
@@ -57,12 +103,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
57 103
58 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { 104 if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
59 JBUFFER_TRACE(jh, "remove from checkpoint list"); 105 JBUFFER_TRACE(jh, "remove from checkpoint list");
60 __journal_remove_checkpoint(jh); 106 ret = __journal_remove_checkpoint(jh) + 1;
61 jbd_unlock_bh_state(bh); 107 jbd_unlock_bh_state(bh);
62 journal_remove_journal_head(bh); 108 journal_remove_journal_head(bh);
63 BUFFER_TRACE(bh, "release"); 109 BUFFER_TRACE(bh, "release");
64 __brelse(bh); 110 __brelse(bh);
65 ret = 1;
66 } else { 111 } else {
67 jbd_unlock_bh_state(bh); 112 jbd_unlock_bh_state(bh);
68 } 113 }
@@ -117,83 +162,53 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
117} 162}
118 163
119/* 164/*
120 * Clean up a transaction's checkpoint list. 165 * Clean up transaction's list of buffers submitted for io.
121 * 166 * We wait for any pending IO to complete and remove any clean
122 * We wait for any pending IO to complete and make sure any clean 167 * buffers. Note that we take the buffers in the opposite ordering
123 * buffers are removed from the transaction. 168 * from the one in which they were submitted for IO.
124 *
125 * Return 1 if we performed any actions which might have destroyed the
126 * checkpoint. (journal_remove_checkpoint() deletes the transaction when
127 * the last checkpoint buffer is cleansed)
128 * 169 *
129 * Called with j_list_lock held. 170 * Called with j_list_lock held.
130 */ 171 */
131static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) 172
173static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
132{ 174{
133 struct journal_head *jh, *next_jh, *last_jh; 175 struct journal_head *jh;
134 struct buffer_head *bh; 176 struct buffer_head *bh;
135 int ret = 0; 177 tid_t this_tid;
136 178 int released = 0;
137 assert_spin_locked(&journal->j_list_lock); 179
138 jh = transaction->t_checkpoint_list; 180 this_tid = transaction->t_tid;
139 if (!jh) 181restart:
140 return 0; 182 /* Did somebody clean up the transaction in the meantime? */
141 183 if (journal->j_checkpoint_transactions != transaction ||
142 last_jh = jh->b_cpprev; 184 transaction->t_tid != this_tid)
143 next_jh = jh; 185 return;
144 do { 186 while (!released && transaction->t_checkpoint_io_list) {
145 jh = next_jh; 187 jh = transaction->t_checkpoint_io_list;
146 bh = jh2bh(jh); 188 bh = jh2bh(jh);
189 if (!jbd_trylock_bh_state(bh)) {
190 jbd_sync_bh(journal, bh);
191 spin_lock(&journal->j_list_lock);
192 goto restart;
193 }
147 if (buffer_locked(bh)) { 194 if (buffer_locked(bh)) {
148 atomic_inc(&bh->b_count); 195 atomic_inc(&bh->b_count);
149 spin_unlock(&journal->j_list_lock); 196 spin_unlock(&journal->j_list_lock);
197 jbd_unlock_bh_state(bh);
150 wait_on_buffer(bh); 198 wait_on_buffer(bh);
151 /* the journal_head may have gone by now */ 199 /* the journal_head may have gone by now */
152 BUFFER_TRACE(bh, "brelse"); 200 BUFFER_TRACE(bh, "brelse");
153 __brelse(bh); 201 __brelse(bh);
154 goto out_return_1; 202 spin_lock(&journal->j_list_lock);
155 } 203 goto restart;
156
157 /*
158 * This is foul
159 */
160 if (!jbd_trylock_bh_state(bh)) {
161 jbd_sync_bh(journal, bh);
162 goto out_return_1;
163 } 204 }
164
165 if (jh->b_transaction != NULL) {
166 transaction_t *t = jh->b_transaction;
167 tid_t tid = t->t_tid;
168
169 spin_unlock(&journal->j_list_lock);
170 jbd_unlock_bh_state(bh);
171 log_start_commit(journal, tid);
172 log_wait_commit(journal, tid);
173 goto out_return_1;
174 }
175
176 /* 205 /*
177 * AKPM: I think the buffer_jbddirty test is redundant - it 206 * Now in whatever state the buffer currently is, we know that
178 * shouldn't have NULL b_transaction? 207 * it has been written out and so we can drop it from the list
179 */ 208 */
180 next_jh = jh->b_cpnext; 209 released = __journal_remove_checkpoint(jh);
181 if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { 210 jbd_unlock_bh_state(bh);
182 BUFFER_TRACE(bh, "remove from checkpoint"); 211 }
183 __journal_remove_checkpoint(jh);
184 jbd_unlock_bh_state(bh);
185 journal_remove_journal_head(bh);
186 __brelse(bh);
187 ret = 1;
188 } else {
189 jbd_unlock_bh_state(bh);
190 }
191 } while (jh != last_jh);
192
193 return ret;
194out_return_1:
195 spin_lock(&journal->j_list_lock);
196 return 1;
197} 212}
198 213
199#define NR_BATCH 64 214#define NR_BATCH 64
@@ -203,9 +218,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
203{ 218{
204 int i; 219 int i;
205 220
206 spin_unlock(&journal->j_list_lock);
207 ll_rw_block(SWRITE, *batch_count, bhs); 221 ll_rw_block(SWRITE, *batch_count, bhs);
208 spin_lock(&journal->j_list_lock);
209 for (i = 0; i < *batch_count; i++) { 222 for (i = 0; i < *batch_count; i++) {
210 struct buffer_head *bh = bhs[i]; 223 struct buffer_head *bh = bhs[i];
211 clear_buffer_jwrite(bh); 224 clear_buffer_jwrite(bh);
@@ -221,19 +234,46 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
221 * Return 1 if something happened which requires us to abort the current 234 * Return 1 if something happened which requires us to abort the current
222 * scan of the checkpoint list. 235 * scan of the checkpoint list.
223 * 236 *
224 * Called with j_list_lock held. 237 * Called with j_list_lock held and drops it if 1 is returned
225 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 238 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
226 */ 239 */
227static int __flush_buffer(journal_t *journal, struct journal_head *jh, 240static int __process_buffer(journal_t *journal, struct journal_head *jh,
228 struct buffer_head **bhs, int *batch_count, 241 struct buffer_head **bhs, int *batch_count)
229 int *drop_count)
230{ 242{
231 struct buffer_head *bh = jh2bh(jh); 243 struct buffer_head *bh = jh2bh(jh);
232 int ret = 0; 244 int ret = 0;
233 245
234 if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { 246 if (buffer_locked(bh)) {
235 J_ASSERT_JH(jh, jh->b_transaction == NULL); 247 get_bh(bh);
248 spin_unlock(&journal->j_list_lock);
249 jbd_unlock_bh_state(bh);
250 wait_on_buffer(bh);
251 /* the journal_head may have gone by now */
252 BUFFER_TRACE(bh, "brelse");
253 put_bh(bh);
254 ret = 1;
255 }
256 else if (jh->b_transaction != NULL) {
257 transaction_t *t = jh->b_transaction;
258 tid_t tid = t->t_tid;
236 259
260 spin_unlock(&journal->j_list_lock);
261 jbd_unlock_bh_state(bh);
262 log_start_commit(journal, tid);
263 log_wait_commit(journal, tid);
264 ret = 1;
265 }
266 else if (!buffer_dirty(bh)) {
267 J_ASSERT_JH(jh, !buffer_jbddirty(bh));
268 BUFFER_TRACE(bh, "remove from checkpoint");
269 __journal_remove_checkpoint(jh);
270 spin_unlock(&journal->j_list_lock);
271 jbd_unlock_bh_state(bh);
272 journal_remove_journal_head(bh);
273 put_bh(bh);
274 ret = 1;
275 }
276 else {
237 /* 277 /*
238 * Important: we are about to write the buffer, and 278 * Important: we are about to write the buffer, and
239 * possibly block, while still holding the journal lock. 279 * possibly block, while still holding the journal lock.
@@ -246,45 +286,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
246 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 286 J_ASSERT_BH(bh, !buffer_jwrite(bh));
247 set_buffer_jwrite(bh); 287 set_buffer_jwrite(bh);
248 bhs[*batch_count] = bh; 288 bhs[*batch_count] = bh;
289 __buffer_relink_io(jh);
249 jbd_unlock_bh_state(bh); 290 jbd_unlock_bh_state(bh);
250 (*batch_count)++; 291 (*batch_count)++;
251 if (*batch_count == NR_BATCH) { 292 if (*batch_count == NR_BATCH) {
293 spin_unlock(&journal->j_list_lock);
252 __flush_batch(journal, bhs, batch_count); 294 __flush_batch(journal, bhs, batch_count);
253 ret = 1; 295 ret = 1;
254 } 296 }
255 } else {
256 int last_buffer = 0;
257 if (jh->b_cpnext == jh) {
258 /* We may be about to drop the transaction. Tell the
259 * caller that the lists have changed.
260 */
261 last_buffer = 1;
262 }
263 if (__try_to_free_cp_buf(jh)) {
264 (*drop_count)++;
265 ret = last_buffer;
266 }
267 } 297 }
268 return ret; 298 return ret;
269} 299}
270 300
271/* 301/*
272 * Perform an actual checkpoint. We don't write out only enough to 302 * Perform an actual checkpoint. We take the first transaction on the
273 * satisfy the current blocked requests: rather we submit a reasonably 303 * list of transactions to be checkpointed and send all its buffers
274 * sized chunk of the outstanding data to disk at once for 304 * to disk. We submit larger chunks of data at once.
275 * efficiency. __log_wait_for_space() will retry if we didn't free enough.
276 * 305 *
277 * However, we _do_ take into account the amount requested so that once
278 * the IO has been queued, we can return as soon as enough of it has
279 * completed to disk.
280 *
281 * The journal should be locked before calling this function. 306 * The journal should be locked before calling this function.
282 */ 307 */
283int log_do_checkpoint(journal_t *journal) 308int log_do_checkpoint(journal_t *journal)
284{ 309{
310 transaction_t *transaction;
311 tid_t this_tid;
285 int result; 312 int result;
286 int batch_count = 0;
287 struct buffer_head *bhs[NR_BATCH];
288 313
289 jbd_debug(1, "Start checkpoint\n"); 314 jbd_debug(1, "Start checkpoint\n");
290 315
@@ -299,79 +324,70 @@ int log_do_checkpoint(journal_t *journal)
299 return result; 324 return result;
300 325
301 /* 326 /*
302 * OK, we need to start writing disk blocks. Try to free up a 327 * OK, we need to start writing disk blocks. Take one transaction
303 * quarter of the log in a single checkpoint if we can. 328 * and write it.
304 */ 329 */
330 spin_lock(&journal->j_list_lock);
331 if (!journal->j_checkpoint_transactions)
332 goto out;
333 transaction = journal->j_checkpoint_transactions;
334 this_tid = transaction->t_tid;
335restart:
305 /* 336 /*
306 * AKPM: check this code. I had a feeling a while back that it 337 * If someone cleaned up this transaction while we slept, we're
307 * degenerates into a busy loop at unmount time. 338 * done (maybe it's a new transaction, but it fell at the same
339 * address).
308 */ 340 */
309 spin_lock(&journal->j_list_lock); 341 if (journal->j_checkpoint_transactions == transaction &&
310 while (journal->j_checkpoint_transactions) { 342 transaction->t_tid == this_tid) {
311 transaction_t *transaction; 343 int batch_count = 0;
312 struct journal_head *jh, *last_jh, *next_jh; 344 struct buffer_head *bhs[NR_BATCH];
313 int drop_count = 0; 345 struct journal_head *jh;
314 int cleanup_ret, retry = 0; 346 int retry = 0;
315 tid_t this_tid; 347
316 348 while (!retry && transaction->t_checkpoint_list) {
317 transaction = journal->j_checkpoint_transactions;
318 this_tid = transaction->t_tid;
319 jh = transaction->t_checkpoint_list;
320 last_jh = jh->b_cpprev;
321 next_jh = jh;
322 do {
323 struct buffer_head *bh; 349 struct buffer_head *bh;
324 350
325 jh = next_jh; 351 jh = transaction->t_checkpoint_list;
326 next_jh = jh->b_cpnext;
327 bh = jh2bh(jh); 352 bh = jh2bh(jh);
328 if (!jbd_trylock_bh_state(bh)) { 353 if (!jbd_trylock_bh_state(bh)) {
329 jbd_sync_bh(journal, bh); 354 jbd_sync_bh(journal, bh);
330 spin_lock(&journal->j_list_lock);
331 retry = 1; 355 retry = 1;
332 break; 356 break;
333 } 357 }
334 retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); 358 retry = __process_buffer(journal, jh, bhs,
335 if (cond_resched_lock(&journal->j_list_lock)) { 359 &batch_count);
360 if (!retry &&
361 lock_need_resched(&journal->j_list_lock)) {
362 spin_unlock(&journal->j_list_lock);
336 retry = 1; 363 retry = 1;
337 break; 364 break;
338 } 365 }
339 } while (jh != last_jh && !retry); 366 }
340 367
341 if (batch_count) { 368 if (batch_count) {
369 if (!retry) {
370 spin_unlock(&journal->j_list_lock);
371 retry = 1;
372 }
342 __flush_batch(journal, bhs, &batch_count); 373 __flush_batch(journal, bhs, &batch_count);
343 retry = 1;
344 } 374 }
345 375
376 if (retry) {
377 spin_lock(&journal->j_list_lock);
378 goto restart;
379 }
346 /* 380 /*
347 * If someone cleaned up this transaction while we slept, we're 381 * Now we have cleaned up the first transaction's checkpoint
348 * done 382 * list. Let's clean up the second one.
349 */
350 if (journal->j_checkpoint_transactions != transaction)
351 break;
352 if (retry)
353 continue;
354 /*
355 * Maybe it's a new transaction, but it fell at the same
356 * address
357 */
358 if (transaction->t_tid != this_tid)
359 continue;
360 /*
361 * We have walked the whole transaction list without
362 * finding anything to write to disk. We had better be
363 * able to make some progress or we are in trouble.
364 */ 383 */
365 cleanup_ret = __cleanup_transaction(journal, transaction); 384 __wait_cp_io(journal, transaction);
366 J_ASSERT(drop_count != 0 || cleanup_ret != 0);
367 if (journal->j_checkpoint_transactions != transaction)
368 break;
369 } 385 }
386out:
370 spin_unlock(&journal->j_list_lock); 387 spin_unlock(&journal->j_list_lock);
371 result = cleanup_journal_tail(journal); 388 result = cleanup_journal_tail(journal);
372 if (result < 0) 389 if (result < 0)
373 return result; 390 return result;
374
375 return 0; 391 return 0;
376} 392}
377 393
@@ -456,52 +472,91 @@ int cleanup_journal_tail(journal_t *journal)
456/* Checkpoint list management */ 472/* Checkpoint list management */
457 473
458/* 474/*
475 * journal_clean_one_cp_list
476 *
477 * Find all the written-back checkpoint buffers in the given list and release them.
478 *
479 * Called with the journal locked.
480 * Called with j_list_lock held.
481 * Returns number of buffers reaped (for debug)
482 */
483
484static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
485{
486 struct journal_head *last_jh;
487 struct journal_head *next_jh = jh;
488 int ret, freed = 0;
489
490 *released = 0;
491 if (!jh)
492 return 0;
493
494 last_jh = jh->b_cpprev;
495 do {
496 jh = next_jh;
497 next_jh = jh->b_cpnext;
498 /* Use trylock because of the ranking */
499 if (jbd_trylock_bh_state(jh2bh(jh))) {
500 ret = __try_to_free_cp_buf(jh);
501 if (ret) {
502 freed++;
503 if (ret == 2) {
504 *released = 1;
505 return freed;
506 }
507 }
508 }
509 /*
510 * This function only frees up some memory if possible so we
511 * don't have an obligation to finish processing. Bail out if
512 * preemption requested:
513 */
514 if (need_resched())
515 return freed;
516 } while (jh != last_jh);
517
518 return freed;
519}
520
521/*
459 * journal_clean_checkpoint_list 522 * journal_clean_checkpoint_list
460 * 523 *
461 * Find all the written-back checkpoint buffers in the journal and release them. 524 * Find all the written-back checkpoint buffers in the journal and release them.
462 * 525 *
463 * Called with the journal locked. 526 * Called with the journal locked.
464 * Called with j_list_lock held. 527 * Called with j_list_lock held.
465 * Returns number of bufers reaped (for debug) 528 * Returns number of buffers reaped (for debug)
466 */ 529 */
467 530
468int __journal_clean_checkpoint_list(journal_t *journal) 531int __journal_clean_checkpoint_list(journal_t *journal)
469{ 532{
470 transaction_t *transaction, *last_transaction, *next_transaction; 533 transaction_t *transaction, *last_transaction, *next_transaction;
471 int ret = 0; 534 int ret = 0, released;
472 535
473 transaction = journal->j_checkpoint_transactions; 536 transaction = journal->j_checkpoint_transactions;
474 if (transaction == 0) 537 if (!transaction)
475 goto out; 538 goto out;
476 539
477 last_transaction = transaction->t_cpprev; 540 last_transaction = transaction->t_cpprev;
478 next_transaction = transaction; 541 next_transaction = transaction;
479 do { 542 do {
480 struct journal_head *jh;
481
482 transaction = next_transaction; 543 transaction = next_transaction;
483 next_transaction = transaction->t_cpnext; 544 next_transaction = transaction->t_cpnext;
484 jh = transaction->t_checkpoint_list; 545 ret += journal_clean_one_cp_list(transaction->
485 if (jh) { 546 t_checkpoint_list, &released);
486 struct journal_head *last_jh = jh->b_cpprev; 547 if (need_resched())
487 struct journal_head *next_jh = jh; 548 goto out;
488 549 if (released)
489 do { 550 continue;
490 jh = next_jh; 551 /*
491 next_jh = jh->b_cpnext; 552 * It is essential that we are as careful as in the case of
492 /* Use trylock because of the ranknig */ 553 * t_checkpoint_list with removing the buffer from the list as
493 if (jbd_trylock_bh_state(jh2bh(jh))) 554 * we can possibly see not yet submitted buffers on io_list
494 ret += __try_to_free_cp_buf(jh); 555 */
495 /* 556 ret += journal_clean_one_cp_list(transaction->
496 * This function only frees up some memory 557 t_checkpoint_io_list, &released);
497 * if possible so we dont have an obligation 558 if (need_resched())
498 * to finish processing. Bail out if preemption 559 goto out;
499 * requested:
500 */
501 if (need_resched())
502 goto out;
503 } while (jh != last_jh);
504 }
505 } while (transaction != last_transaction); 560 } while (transaction != last_transaction);
506out: 561out:
507 return ret; 562 return ret;
@@ -516,18 +571,22 @@ out:
516 * buffer updates committed in that transaction have safely been stored 571 * buffer updates committed in that transaction have safely been stored
517 * elsewhere on disk. To achieve this, all of the buffers in a 572 * elsewhere on disk. To achieve this, all of the buffers in a
518 * transaction need to be maintained on the transaction's checkpoint 573 * transaction need to be maintained on the transaction's checkpoint
519 * list until they have been rewritten, at which point this function is 574 * lists until they have been rewritten, at which point this function is
520 * called to remove the buffer from the existing transaction's 575 * called to remove the buffer from the existing transaction's
521 * checkpoint list. 576 * checkpoint lists.
577 *
578 * The function returns 1 if it frees the transaction, 0 otherwise.
522 * 579 *
523 * This function is called with the journal locked. 580 * This function is called with the journal locked.
524 * This function is called with j_list_lock held. 581 * This function is called with j_list_lock held.
582 * This function is called with jbd_lock_bh_state(jh2bh(jh))
525 */ 583 */
526 584
527void __journal_remove_checkpoint(struct journal_head *jh) 585int __journal_remove_checkpoint(struct journal_head *jh)
528{ 586{
529 transaction_t *transaction; 587 transaction_t *transaction;
530 journal_t *journal; 588 journal_t *journal;
589 int ret = 0;
531 590
532 JBUFFER_TRACE(jh, "entry"); 591 JBUFFER_TRACE(jh, "entry");
533 592
@@ -538,8 +597,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
538 journal = transaction->t_journal; 597 journal = transaction->t_journal;
539 598
540 __buffer_unlink(jh); 599 __buffer_unlink(jh);
600 jh->b_cp_transaction = NULL;
541 601
542 if (transaction->t_checkpoint_list != NULL) 602 if (transaction->t_checkpoint_list != NULL ||
603 transaction->t_checkpoint_io_list != NULL)
543 goto out; 604 goto out;
544 JBUFFER_TRACE(jh, "transaction has no more buffers"); 605 JBUFFER_TRACE(jh, "transaction has no more buffers");
545 606
@@ -565,8 +626,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
565 /* Just in case anybody was waiting for more transactions to be 626 /* Just in case anybody was waiting for more transactions to be
566 checkpointed... */ 627 checkpointed... */
567 wake_up(&journal->j_wait_logspace); 628 wake_up(&journal->j_wait_logspace);
629 ret = 1;
568out: 630out:
569 JBUFFER_TRACE(jh, "exit"); 631 JBUFFER_TRACE(jh, "exit");
632 return ret;
570} 633}
571 634
572/* 635/*
@@ -628,6 +691,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
628 J_ASSERT(transaction->t_shadow_list == NULL); 691 J_ASSERT(transaction->t_shadow_list == NULL);
629 J_ASSERT(transaction->t_log_list == NULL); 692 J_ASSERT(transaction->t_log_list == NULL);
630 J_ASSERT(transaction->t_checkpoint_list == NULL); 693 J_ASSERT(transaction->t_checkpoint_list == NULL);
694 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
631 J_ASSERT(transaction->t_updates == 0); 695 J_ASSERT(transaction->t_updates == 0);
632 J_ASSERT(journal->j_committing_transaction != transaction); 696 J_ASSERT(journal->j_committing_transaction != transaction);
633 J_ASSERT(journal->j_running_transaction != transaction); 697 J_ASSERT(journal->j_running_transaction != transaction);
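
With __journal_remove_checkpoint() now returning an int, a caller can tell whether dropping one buffer also emptied the checkpoint lists and freed the whole transaction. A minimal sketch of the calling pattern under the locking rules listed in the comment above (the helper name is hypothetical, not part of this patch):

	/* Called with the journal locked, j_list_lock held, and
	 * jbd_lock_bh_state(jh2bh(jh)) taken, per the comment above. */
	static int example_drop_cp_buffer(struct journal_head *jh)
	{
		/* 1: checkpoint lists went empty, transaction freed;
		 * 0: other buffers still hold the transaction. */
		return __journal_remove_checkpoint(jh);
	}
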
diff --git a/fs/mpage.c b/fs/mpage.c
index c5adcdddf3cc..f1d2d02bd4c8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -721,7 +721,7 @@ retry:
721 &last_block_in_bio, &ret, wbc, 721 &last_block_in_bio, &ret, wbc,
722 page->mapping->a_ops->writepage); 722 page->mapping->a_ops->writepage);
723 } 723 }
724 if (unlikely(ret == WRITEPAGE_ACTIVATE)) 724 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
725 unlock_page(page); 725 unlock_page(page);
726 if (ret || (--(wbc->nr_to_write) <= 0)) 726 if (ret || (--(wbc->nr_to_write) <= 0))
727 done = 1; 727 done = 1;
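
A ->writepage() that cannot make progress may return AOP_WRITEPAGE_ACTIVATE with the page still locked; as the hunk shows, the mpage writeback loop then unlocks it on the implementation's behalf. A minimal sketch of that convention from the filesystem side (examplefs_writepage is hypothetical):

	static int examplefs_writepage(struct page *page,
				       struct writeback_control *wbc)
	{
		if (wbc->for_reclaim)
			/* Punt: the page stays locked, the caller unlocks
			 * it and keeps it on the active list. */
			return AOP_WRITEPAGE_ACTIVATE;
		/* ... normal writeout would happen here ... */
		unlock_page(page);
		return 0;
	}
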
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 041380fe667b..6d2dfed1de08 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -56,13 +56,20 @@ static int
56nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, 56nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
57 struct nfsd3_attrstat *resp) 57 struct nfsd3_attrstat *resp)
58{ 58{
59 int nfserr; 59 int err, nfserr;
60 60
61 dprintk("nfsd: GETATTR(3) %s\n", 61 dprintk("nfsd: GETATTR(3) %s\n",
62 SVCFH_fmt(&argp->fh)); 62 SVCFH_fmt(&argp->fh));
63 63
64 fh_copy(&resp->fh, &argp->fh); 64 fh_copy(&resp->fh, &argp->fh);
65 nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); 65 nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
66 if (nfserr)
67 RETURN_STATUS(nfserr);
68
69 err = vfs_getattr(resp->fh.fh_export->ex_mnt,
70 resp->fh.fh_dentry, &resp->stat);
71 nfserr = nfserrno(err);
72
66 RETURN_STATUS(nfserr); 73 RETURN_STATUS(nfserr);
67} 74}
68 75
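
The shape of this fix generalizes across the nfsd proc handlers: run the VFS operation, then fold its errno into an NFS status with nfserrno(), which maps 0 to nfs_ok. A condensed sketch (example_getattr_status is a hypothetical helper):

	static int example_getattr_status(struct svc_fh *fhp, struct kstat *stat)
	{
		int err = vfs_getattr(fhp->fh_export->ex_mnt,
				      fhp->fh_dentry, stat);
		return nfserrno(err);	/* 0 -> nfs_ok */
	}
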
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9147b8524d05..243d94b9653a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -154,37 +154,34 @@ decode_sattr3(u32 *p, struct iattr *iap)
154} 154}
155 155
156static inline u32 * 156static inline u32 *
157encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) 157encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
158 struct kstat *stat)
158{ 159{
159 struct vfsmount *mnt = fhp->fh_export->ex_mnt;
160 struct dentry *dentry = fhp->fh_dentry; 160 struct dentry *dentry = fhp->fh_dentry;
161 struct kstat stat;
162 struct timespec time; 161 struct timespec time;
163 162
164 vfs_getattr(mnt, dentry, &stat); 163 *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
165 164 *p++ = htonl((u32) stat->mode);
166 *p++ = htonl(nfs3_ftypes[(stat.mode & S_IFMT) >> 12]); 165 *p++ = htonl((u32) stat->nlink);
167 *p++ = htonl((u32) stat.mode); 166 *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
168 *p++ = htonl((u32) stat.nlink); 167 *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
169 *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid)); 168 if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
170 *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid));
171 if (S_ISLNK(stat.mode) && stat.size > NFS3_MAXPATHLEN) {
172 p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); 169 p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
173 } else { 170 } else {
174 p = xdr_encode_hyper(p, (u64) stat.size); 171 p = xdr_encode_hyper(p, (u64) stat->size);
175 } 172 }
176 p = xdr_encode_hyper(p, ((u64)stat.blocks) << 9); 173 p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
177 *p++ = htonl((u32) MAJOR(stat.rdev)); 174 *p++ = htonl((u32) MAJOR(stat->rdev));
178 *p++ = htonl((u32) MINOR(stat.rdev)); 175 *p++ = htonl((u32) MINOR(stat->rdev));
179 if (is_fsid(fhp, rqstp->rq_reffh)) 176 if (is_fsid(fhp, rqstp->rq_reffh))
180 p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); 177 p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
181 else 178 else
182 p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat.dev)); 179 p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev));
183 p = xdr_encode_hyper(p, (u64) stat.ino); 180 p = xdr_encode_hyper(p, (u64) stat->ino);
184 p = encode_time3(p, &stat.atime); 181 p = encode_time3(p, &stat->atime);
185 lease_get_mtime(dentry->d_inode, &time); 182 lease_get_mtime(dentry->d_inode, &time);
186 p = encode_time3(p, &time); 183 p = encode_time3(p, &time);
187 p = encode_time3(p, &stat.ctime); 184 p = encode_time3(p, &stat->ctime);
188 185
189 return p; 186 return p;
190} 187}
@@ -232,8 +229,14 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
232{ 229{
233 struct dentry *dentry = fhp->fh_dentry; 230 struct dentry *dentry = fhp->fh_dentry;
234 if (dentry && dentry->d_inode != NULL) { 231 if (dentry && dentry->d_inode != NULL) {
235 *p++ = xdr_one; /* attributes follow */ 232 int err;
236 return encode_fattr3(rqstp, p, fhp); 233 struct kstat stat;
234
235 err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat);
236 if (!err) {
237 *p++ = xdr_one; /* attributes follow */
238 return encode_fattr3(rqstp, p, fhp, &stat);
239 }
237 } 240 }
238 *p++ = xdr_zero; 241 *p++ = xdr_zero;
239 return p; 242 return p;
@@ -616,7 +619,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
616 struct nfsd3_attrstat *resp) 619 struct nfsd3_attrstat *resp)
617{ 620{
618 if (resp->status == 0) 621 if (resp->status == 0)
619 p = encode_fattr3(rqstp, p, &resp->fh); 622 p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
620 return xdr_ressize_check(rqstp, p); 623 return xdr_ressize_check(rqstp, p);
621} 624}
622 625
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b45999ff33e6..aa7bb41b293d 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,46 +152,44 @@ decode_sattr(u32 *p, struct iattr *iap)
152} 152}
153 153
154static inline u32 * 154static inline u32 *
155encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) 155encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
156 struct kstat *stat)
156{ 157{
157 struct vfsmount *mnt = fhp->fh_export->ex_mnt;
158 struct dentry *dentry = fhp->fh_dentry; 158 struct dentry *dentry = fhp->fh_dentry;
159 struct kstat stat;
160 int type; 159 int type;
161 struct timespec time; 160 struct timespec time;
162 161
163 vfs_getattr(mnt, dentry, &stat); 162 type = (stat->mode & S_IFMT);
164 type = (stat.mode & S_IFMT);
165 163
166 *p++ = htonl(nfs_ftypes[type >> 12]); 164 *p++ = htonl(nfs_ftypes[type >> 12]);
167 *p++ = htonl((u32) stat.mode); 165 *p++ = htonl((u32) stat->mode);
168 *p++ = htonl((u32) stat.nlink); 166 *p++ = htonl((u32) stat->nlink);
169 *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid)); 167 *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid));
170 *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid)); 168 *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid));
171 169
172 if (S_ISLNK(type) && stat.size > NFS_MAXPATHLEN) { 170 if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
173 *p++ = htonl(NFS_MAXPATHLEN); 171 *p++ = htonl(NFS_MAXPATHLEN);
174 } else { 172 } else {
175 *p++ = htonl((u32) stat.size); 173 *p++ = htonl((u32) stat->size);
176 } 174 }
177 *p++ = htonl((u32) stat.blksize); 175 *p++ = htonl((u32) stat->blksize);
178 if (S_ISCHR(type) || S_ISBLK(type)) 176 if (S_ISCHR(type) || S_ISBLK(type))
179 *p++ = htonl(new_encode_dev(stat.rdev)); 177 *p++ = htonl(new_encode_dev(stat->rdev));
180 else 178 else
181 *p++ = htonl(0xffffffff); 179 *p++ = htonl(0xffffffff);
182 *p++ = htonl((u32) stat.blocks); 180 *p++ = htonl((u32) stat->blocks);
183 if (is_fsid(fhp, rqstp->rq_reffh)) 181 if (is_fsid(fhp, rqstp->rq_reffh))
184 *p++ = htonl((u32) fhp->fh_export->ex_fsid); 182 *p++ = htonl((u32) fhp->fh_export->ex_fsid);
185 else 183 else
186 *p++ = htonl(new_encode_dev(stat.dev)); 184 *p++ = htonl(new_encode_dev(stat->dev));
187 *p++ = htonl((u32) stat.ino); 185 *p++ = htonl((u32) stat->ino);
188 *p++ = htonl((u32) stat.atime.tv_sec); 186 *p++ = htonl((u32) stat->atime.tv_sec);
189 *p++ = htonl(stat.atime.tv_nsec ? stat.atime.tv_nsec / 1000 : 0); 187 *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
190 lease_get_mtime(dentry->d_inode, &time); 188 lease_get_mtime(dentry->d_inode, &time);
191 *p++ = htonl((u32) time.tv_sec); 189 *p++ = htonl((u32) time.tv_sec);
192 *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); 190 *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0);
193 *p++ = htonl((u32) stat.ctime.tv_sec); 191 *p++ = htonl((u32) stat->ctime.tv_sec);
194 *p++ = htonl(stat.ctime.tv_nsec ? stat.ctime.tv_nsec / 1000 : 0); 192 *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
195 193
196 return p; 194 return p;
197} 195}
@@ -199,7 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
199/* Helper function for NFSv2 ACL code */ 197/* Helper function for NFSv2 ACL code */
200u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) 198u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
201{ 199{
202 return encode_fattr(rqstp, p, fhp); 200 struct kstat stat;
201 vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
202 return encode_fattr(rqstp, p, fhp, &stat);
203} 203}
204 204
205/* 205/*
@@ -394,7 +394,7 @@ int
394nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p, 394nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
395 struct nfsd_attrstat *resp) 395 struct nfsd_attrstat *resp)
396{ 396{
397 p = encode_fattr(rqstp, p, &resp->fh); 397 p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
398 return xdr_ressize_check(rqstp, p); 398 return xdr_ressize_check(rqstp, p);
399} 399}
400 400
@@ -403,7 +403,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
403 struct nfsd_diropres *resp) 403 struct nfsd_diropres *resp)
404{ 404{
405 p = encode_fh(p, &resp->fh); 405 p = encode_fh(p, &resp->fh);
406 p = encode_fattr(rqstp, p, &resp->fh); 406 p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
407 return xdr_ressize_check(rqstp, p); 407 return xdr_ressize_check(rqstp, p);
408} 408}
409 409
@@ -428,7 +428,7 @@ int
428nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p, 428nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
429 struct nfsd_readres *resp) 429 struct nfsd_readres *resp)
430{ 430{
431 p = encode_fattr(rqstp, p, &resp->fh); 431 p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
432 *p++ = htonl(resp->count); 432 *p++ = htonl(resp->count);
433 xdr_ressize_check(rqstp, p); 433 xdr_ressize_check(rqstp, p);
434 434
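
Net effect of the xdr refactor: encode_fattr() and encode_fattr3() are now pure encoders, and the owner of the reply decides when (and whether) to stat. The calling sequence the nfsd paths converge on looks roughly like this (a sketch of the sequence, not a literal excerpt):

	/* proc handler: stat once into the reply */
	err = vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &resp->stat);
	nfserr = nfserrno(err);
	...
	/* xdr encoder: encode from the saved kstat, no second stat */
	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
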
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index af7c3c3074b0..df4019f04560 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -717,27 +717,33 @@ nfsd_close(struct file *filp)
717 * As this calls fsync (not fdatasync) there is no need for a write_inode 717 * As this calls fsync (not fdatasync) there is no need for a write_inode
718 * after it. 718 * after it.
719 */ 719 */
720static inline void nfsd_dosync(struct file *filp, struct dentry *dp, 720static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
721 struct file_operations *fop) 721 struct file_operations *fop)
722{ 722{
723 struct inode *inode = dp->d_inode; 723 struct inode *inode = dp->d_inode;
724 int (*fsync) (struct file *, struct dentry *, int); 724 int (*fsync) (struct file *, struct dentry *, int);
725 int err = nfs_ok;
725 726
726 filemap_fdatawrite(inode->i_mapping); 727 filemap_fdatawrite(inode->i_mapping);
727 if (fop && (fsync = fop->fsync)) 728 if (fop && (fsync = fop->fsync))
728 fsync(filp, dp, 0); 729 err = fsync(filp, dp, 0);
729 filemap_fdatawait(inode->i_mapping); 730 filemap_fdatawait(inode->i_mapping);
731
732 return nfserrno(err);
730} 733}
731 734
732 735
733static void 736static int
734nfsd_sync(struct file *filp) 737nfsd_sync(struct file *filp)
735{ 738{
739 int err;
736 struct inode *inode = filp->f_dentry->d_inode; 740 struct inode *inode = filp->f_dentry->d_inode;
737 dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name); 741 dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name);
738 down(&inode->i_sem); 742 down(&inode->i_sem);
739 nfsd_dosync(filp, filp->f_dentry, filp->f_op); 743 err = nfsd_dosync(filp, filp->f_dentry, filp->f_op);
740 up(&inode->i_sem); 744 up(&inode->i_sem);
745
746 return err;
741} 747}
742 748
743void 749void
@@ -874,6 +880,16 @@ out:
874 return err; 880 return err;
875} 881}
876 882
883static void kill_suid(struct dentry *dentry)
884{
885 struct iattr ia;
886 ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
887
888 down(&dentry->d_inode->i_sem);
889 notify_change(dentry, &ia);
890 up(&dentry->d_inode->i_sem);
891}
892
877static inline int 893static inline int
878nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 894nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
879 loff_t offset, struct kvec *vec, int vlen, 895 loff_t offset, struct kvec *vec, int vlen,
@@ -927,14 +943,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
927 } 943 }
928 944
929 /* clear setuid/setgid flag after write */ 945 /* clear setuid/setgid flag after write */
930 if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) { 946 if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
931 struct iattr ia; 947 kill_suid(dentry);
932 ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
933
934 down(&inode->i_sem);
935 notify_change(dentry, &ia);
936 up(&inode->i_sem);
937 }
938 948
939 if (err >= 0 && stable) { 949 if (err >= 0 && stable) {
940 static ino_t last_ino; 950 static ino_t last_ino;
@@ -962,7 +972,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 972
963 if (inode->i_state & I_DIRTY) { 973 if (inode->i_state & I_DIRTY) {
964 dprintk("nfsd: write sync %d\n", current->pid); 974 dprintk("nfsd: write sync %d\n", current->pid);
965 nfsd_sync(file); 975 err = nfsd_sync(file);
966 } 976 }
967#if 0 977#if 0
968 wake_up(&inode->i_wait); 978 wake_up(&inode->i_wait);
@@ -1066,7 +1076,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1066 return err; 1076 return err;
1067 if (EX_ISSYNC(fhp->fh_export)) { 1077 if (EX_ISSYNC(fhp->fh_export)) {
1068 if (file->f_op && file->f_op->fsync) { 1078 if (file->f_op && file->f_op->fsync) {
1069 nfsd_sync(file); 1079 err = nfsd_sync(file);
1070 } else { 1080 } else {
1071 err = nfserr_notsupp; 1081 err = nfserr_notsupp;
1072 } 1082 }
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644
index 000000000000..7d3be845a614
--- /dev/null
+++ b/fs/ocfs2/Makefile
@@ -0,0 +1,33 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
4
5obj-$(CONFIG_OCFS2_FS) += ocfs2.o
6
7ocfs2-objs := \
8 alloc.o \
9 aops.o \
10 buffer_head_io.o \
11 dcache.o \
12 dir.o \
13 dlmglue.o \
14 export.o \
15 extent_map.o \
16 file.o \
17 heartbeat.o \
18 inode.o \
19 journal.o \
20 localalloc.o \
21 mmap.o \
22 namei.o \
23 slot_map.o \
24 suballoc.o \
25 super.o \
26 symlink.o \
27 sysfile.o \
28 uptodate.o \
29 ver.o \
30 vote.o
31
32obj-$(CONFIG_OCFS2_FS) += cluster/
33obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 000000000000..465f797451ee
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.c
5 *
6 * Extent allocs and frees
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#define MLOG_MASK_PREFIX ML_DISK_ALLOC
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "dlmglue.h"
38#include "extent_map.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "sysfile.h"
44#include "file.h"
45#include "super.h"
46#include "uptodate.h"
47
48#include "buffer_head_io.h"
49
50static int ocfs2_extent_contig(struct inode *inode,
51 struct ocfs2_extent_rec *ext,
52 u64 blkno);
53
54static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
55 struct ocfs2_journal_handle *handle,
56 struct inode *inode,
57 int wanted,
58 struct ocfs2_alloc_context *meta_ac,
59 struct buffer_head *bhs[]);
60
61static int ocfs2_add_branch(struct ocfs2_super *osb,
62 struct ocfs2_journal_handle *handle,
63 struct inode *inode,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
68
69static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
70 struct ocfs2_journal_handle *handle,
71 struct inode *inode,
72 struct buffer_head *fe_bh,
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
75
76static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
77 struct ocfs2_journal_handle *handle,
78 struct inode *inode,
79 struct buffer_head *fe_bh,
80 u64 blkno,
81 u32 new_clusters);
82
83static int ocfs2_find_branch_target(struct ocfs2_super *osb,
84 struct inode *inode,
85 struct buffer_head *fe_bh,
86 struct buffer_head **target_bh);
87
88static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
89 struct inode *inode,
90 struct ocfs2_dinode *fe,
91 unsigned int new_i_clusters,
92 struct buffer_head *old_last_eb,
93 struct buffer_head **new_last_eb);
94
95static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
96
97static int ocfs2_extent_contig(struct inode *inode,
98 struct ocfs2_extent_rec *ext,
99 u64 blkno)
100{
101 return blkno == (le64_to_cpu(ext->e_blkno) +
102 ocfs2_clusters_to_blocks(inode->i_sb,
103 le32_to_cpu(ext->e_clusters)));
104}
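/*
 * Worked example, assuming eight blocks per cluster: an extent with
 * e_blkno == 100 and e_clusters == 2 spans blocks 100..115, so
 *
 *	ocfs2_extent_contig(inode, ext, 116) == 1
 *
 * and every other blkno is non-contiguous.
 */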
105
106/*
107 * How many free extents have we got before we need more metadata?
108 */
109int ocfs2_num_free_extents(struct ocfs2_super *osb,
110 struct inode *inode,
111 struct ocfs2_dinode *fe)
112{
113 int retval;
114 struct ocfs2_extent_list *el;
115 struct ocfs2_extent_block *eb;
116 struct buffer_head *eb_bh = NULL;
117
118 mlog_entry_void();
119
120 if (!OCFS2_IS_VALID_DINODE(fe)) {
121 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
122 retval = -EIO;
123 goto bail;
124 }
125
126 if (fe->i_last_eb_blk) {
127 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
128 &eb_bh, OCFS2_BH_CACHED, inode);
129 if (retval < 0) {
130 mlog_errno(retval);
131 goto bail;
132 }
133 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
134 el = &eb->h_list;
135 } else
136 el = &fe->id2.i_list;
137
138 BUG_ON(el->l_tree_depth != 0);
139
140 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
141bail:
142 if (eb_bh)
143 brelse(eb_bh);
144
145 mlog_exit(retval);
146 return retval;
147}
148
149/* expects array to already be allocated
150 *
151 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
152 * l_count for you
153 */
154static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
155 struct ocfs2_journal_handle *handle,
156 struct inode *inode,
157 int wanted,
158 struct ocfs2_alloc_context *meta_ac,
159 struct buffer_head *bhs[])
160{
161 int count, status, i;
162 u16 suballoc_bit_start;
163 u32 num_got;
164 u64 first_blkno;
165 struct ocfs2_extent_block *eb;
166
167 mlog_entry_void();
168
169 count = 0;
170 while (count < wanted) {
171 status = ocfs2_claim_metadata(osb,
172 handle,
173 meta_ac,
174 wanted - count,
175 &suballoc_bit_start,
176 &num_got,
177 &first_blkno);
178 if (status < 0) {
179 mlog_errno(status);
180 goto bail;
181 }
182
183 for(i = count; i < (num_got + count); i++) {
184 bhs[i] = sb_getblk(osb->sb, first_blkno);
185 if (bhs[i] == NULL) {
186 status = -EIO;
187 mlog_errno(status);
188 goto bail;
189 }
190 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
191
192 status = ocfs2_journal_access(handle, inode, bhs[i],
193 OCFS2_JOURNAL_ACCESS_CREATE);
194 if (status < 0) {
195 mlog_errno(status);
196 goto bail;
197 }
198
199 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
200 eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
201 /* Ok, setup the minimal stuff here. */
202 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
203 eb->h_blkno = cpu_to_le64(first_blkno);
204 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
205
206#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
207 /* we always use slot zero's suballocator */
208 eb->h_suballoc_slot = 0;
209#else
210 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
211#endif
212 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
213 eb->h_list.l_count =
214 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
215
216 suballoc_bit_start++;
217 first_blkno++;
218
219 /* We'll also be dirtied by the caller, so
220 * this isn't absolutely necessary. */
221 status = ocfs2_journal_dirty(handle, bhs[i]);
222 if (status < 0) {
223 mlog_errno(status);
224 goto bail;
225 }
226 }
227
228 count += num_got;
229 }
230
231 status = 0;
232bail:
233 if (status < 0) {
234 for(i = 0; i < wanted; i++) {
235 if (bhs[i])
236 brelse(bhs[i]);
237 bhs[i] = NULL;
238 }
239 }
240 mlog_exit(status);
241 return status;
242}
243
244/*
245 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode
247 * structure.
248 *
249 * last_eb_bh is required as we have to update its next_leaf pointer
250 * for the new last extent block.
251 *
252 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0.
254 */
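/*
 * Sketch of the branch built below for new_blocks == 2:
 *
 *	el (depth 2) -> new_eb_bhs[1] (depth 1) -> new_eb_bhs[0] (leaf)
 *
 * next_blkno ends up pointing at new_eb_bhs[1], the topmost new
 * block, and new_last_eb_blk at new_eb_bhs[0], the new last leaf.
 */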
255static int ocfs2_add_branch(struct ocfs2_super *osb,
256 struct ocfs2_journal_handle *handle,
257 struct inode *inode,
258 struct buffer_head *fe_bh,
259 struct buffer_head *eb_bh,
260 struct buffer_head *last_eb_bh,
261 struct ocfs2_alloc_context *meta_ac)
262{
263 int status, new_blocks, i;
264 u64 next_blkno, new_last_eb_blk;
265 struct buffer_head *bh;
266 struct buffer_head **new_eb_bhs = NULL;
267 struct ocfs2_dinode *fe;
268 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el;
271
272 mlog_entry_void();
273
274 BUG_ON(!last_eb_bh);
275
276 fe = (struct ocfs2_dinode *) fe_bh->b_data;
277
278 if (eb_bh) {
279 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
280 el = &eb->h_list;
281 } else
282 el = &fe->id2.i_list;
283
284 /* we never add a branch to a leaf. */
285 BUG_ON(!el->l_tree_depth);
286
287 new_blocks = le16_to_cpu(el->l_tree_depth);
288
289 /* allocate the number of new eb blocks we need */
290 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
291 GFP_KERNEL);
292 if (!new_eb_bhs) {
293 status = -ENOMEM;
294 mlog_errno(status);
295 goto bail;
296 }
297
298 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
299 meta_ac, new_eb_bhs);
300 if (status < 0) {
301 mlog_errno(status);
302 goto bail;
303 }
304
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree.
307 * conversely, new_eb_bhs[0] is the new bottommost leaf.
308 *
309 * when we leave the loop, new_last_eb_blk will point to the
310 * newest leaf, and next_blkno will point to the topmost extent
311 * block. */
312 next_blkno = new_last_eb_blk = 0;
313 for(i = 0; i < new_blocks; i++) {
314 bh = new_eb_bhs[i];
315 eb = (struct ocfs2_extent_block *) bh->b_data;
316 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
317 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
318 status = -EIO;
319 goto bail;
320 }
321 eb_el = &eb->h_list;
322
323 status = ocfs2_journal_access(handle, inode, bh,
324 OCFS2_JOURNAL_ACCESS_CREATE);
325 if (status < 0) {
326 mlog_errno(status);
327 goto bail;
328 }
329
330 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters;
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
338
339 status = ocfs2_journal_dirty(handle, bh);
340 if (status < 0) {
341 mlog_errno(status);
342 goto bail;
343 }
344
345 next_blkno = le64_to_cpu(eb->h_blkno);
346 }
347
348 /* This is a bit hairy. We want to update up to three blocks
349 * here without leaving any of them in an inconsistent state
350 * in case of error. We don't have to worry about
351 * journal_dirty erroring as it won't unless we've aborted the
352 * handle (in which case we would never be here) so reserving
353 * the write with journal_access is all we need to do. */
354 status = ocfs2_journal_access(handle, inode, last_eb_bh,
355 OCFS2_JOURNAL_ACCESS_WRITE);
356 if (status < 0) {
357 mlog_errno(status);
358 goto bail;
359 }
360 status = ocfs2_journal_access(handle, inode, fe_bh,
361 OCFS2_JOURNAL_ACCESS_WRITE);
362 if (status < 0) {
363 mlog_errno(status);
364 goto bail;
365 }
366 if (eb_bh) {
367 status = ocfs2_journal_access(handle, inode, eb_bh,
368 OCFS2_JOURNAL_ACCESS_WRITE);
369 if (status < 0) {
370 mlog_errno(status);
371 goto bail;
372 }
373 }
374
375 /* Link the new branch into the rest of the tree (el will
376 * either be on the fe, or the extent block passed in). */
377 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters;
380 el->l_recs[i].e_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1);
382
383 /* fe needs a new last extent block pointer, as does the
384 * next_leaf on the previously last-extent-block. */
385 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
386
387 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
388 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
389
390 status = ocfs2_journal_dirty(handle, last_eb_bh);
391 if (status < 0)
392 mlog_errno(status);
393 status = ocfs2_journal_dirty(handle, fe_bh);
394 if (status < 0)
395 mlog_errno(status);
396 if (eb_bh) {
397 status = ocfs2_journal_dirty(handle, eb_bh);
398 if (status < 0)
399 mlog_errno(status);
400 }
401
402 status = 0;
403bail:
404 if (new_eb_bhs) {
405 for (i = 0; i < new_blocks; i++)
406 if (new_eb_bhs[i])
407 brelse(new_eb_bhs[i]);
408 kfree(new_eb_bhs);
409 }
410
411 mlog_exit(status);
412 return status;
413}
414
415/*
416 * adds another level to the allocation tree.
417 * returns back the new extent block so you can add a branch to it
418 * after this call.
419 */
420static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
421 struct ocfs2_journal_handle *handle,
422 struct inode *inode,
423 struct buffer_head *fe_bh,
424 struct ocfs2_alloc_context *meta_ac,
425 struct buffer_head **ret_new_eb_bh)
426{
427 int status, i;
428 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb;
431 struct ocfs2_extent_list *fe_el;
432 struct ocfs2_extent_list *eb_el;
433
434 mlog_entry_void();
435
436 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
437 &new_eb_bh);
438 if (status < 0) {
439 mlog_errno(status);
440 goto bail;
441 }
442
443 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
444 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
445 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
446 status = -EIO;
447 goto bail;
448 }
449
450 eb_el = &eb->h_list;
451 fe = (struct ocfs2_dinode *) fe_bh->b_data;
452 fe_el = &fe->id2.i_list;
453
454 status = ocfs2_journal_access(handle, inode, new_eb_bh,
455 OCFS2_JOURNAL_ACCESS_CREATE);
456 if (status < 0) {
457 mlog_errno(status);
458 goto bail;
459 }
460
461 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
468 }
469
470 status = ocfs2_journal_dirty(handle, new_eb_bh);
471 if (status < 0) {
472 mlog_errno(status);
473 goto bail;
474 }
475
476 status = ocfs2_journal_access(handle, inode, fe_bh,
477 OCFS2_JOURNAL_ACCESS_WRITE);
478 if (status < 0) {
479 mlog_errno(status);
480 goto bail;
481 }
482
483 /* update fe now */
484 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters;
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 fe_el->l_recs[i].e_cpos = 0;
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
492 }
493 fe_el->l_next_free_rec = cpu_to_le16(1);
494
495 /* If this is our 1st tree depth shift, then last_eb_blk
496 * becomes the allocated extent block */
497 if (fe_el->l_tree_depth == cpu_to_le16(1))
498 fe->i_last_eb_blk = eb->h_blkno;
499
500 status = ocfs2_journal_dirty(handle, fe_bh);
501 if (status < 0) {
502 mlog_errno(status);
503 goto bail;
504 }
505
506 *ret_new_eb_bh = new_eb_bh;
507 new_eb_bh = NULL;
508 status = 0;
509bail:
510 if (new_eb_bh)
511 brelse(new_eb_bh);
512
513 mlog_exit(status);
514 return status;
515}
516
517/*
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
520 * down.
521 */
522static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 struct ocfs2_journal_handle *handle,
524 struct inode *inode,
525 struct buffer_head *fe_bh,
526 u64 start_blk,
527 u32 new_clusters)
528{
529 int status, i, num_bhs = 0;
530 u64 next_blkno;
531 u16 next_free;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
536
537 mlog_entry_void();
538
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
541 if (status < 0) {
542 mlog_errno(status);
543 goto bail;
544 }
545
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
557 GFP_KERNEL);
558 if (!eb_bhs) {
559 status = -ENOMEM;
560 mlog_errno(status);
561 goto bail;
562 }
563
564 i = 0;
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %"MLFu64" has a bad "
570 "extent list",
571 OCFS2_I(inode)->ip_blkno);
572 status = -EIO;
573 goto bail;
574 }
575 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
576
577 BUG_ON(i >= num_bhs);
578 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
579 OCFS2_BH_CACHED, inode);
580 if (status < 0) {
581 mlog_errno(status);
582 goto bail;
583 }
584 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
585 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
586 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
587 eb);
588 status = -EIO;
589 goto bail;
590 }
591
592 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
593 OCFS2_JOURNAL_ACCESS_WRITE);
594 if (status < 0) {
595 mlog_errno(status);
596 goto bail;
597 }
598
599 el = &eb->h_list;
600 i++;
601 /* When we leave this loop, eb_bhs[num_bhs - 1] will
602 * hold the bottom-most leaf extent block. */
603 }
604 BUG_ON(el->l_tree_depth);
605
606 el = &fe->id2.i_list;
607 /* If we have tree depth, then the fe update is
608 * trivial, and we want to switch el out for the
609 * bottom-most leaf in order to update it with the
610 * actual extent data below. */
611 next_free = le16_to_cpu(el->l_next_free_rec);
612 if (next_free == 0) {
613 ocfs2_error(inode->i_sb,
614 "Dinode %"MLFu64" has a bad "
615 "extent list",
616 OCFS2_I(inode)->ip_blkno);
617 status = -EIO;
618 goto bail;
619 }
620 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
621 new_clusters);
622 /* (num_bhs - 1) to avoid the leaf */
623 for(i = 0; i < (num_bhs - 1); i++) {
624 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
625 el = &eb->h_list;
626
627 /* finally, make our actual change to the
628 * intermediate extent blocks. */
629 next_free = le16_to_cpu(el->l_next_free_rec);
630 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
631 new_clusters);
632
633 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
634 if (status < 0)
635 mlog_errno(status);
636 }
637 BUG_ON(i != (num_bhs - 1));
638 /* note that the leaf block wasn't touched in
639 * the loop above */
640 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
641 el = &eb->h_list;
642 BUG_ON(el->l_tree_depth);
643 }
644
645 /* yay, we can finally add the actual extent now! */
646 i = le16_to_cpu(el->l_next_free_rec) - 1;
647 if (le16_to_cpu(el->l_next_free_rec) &&
648 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
649 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
650 } else if (le16_to_cpu(el->l_next_free_rec) &&
651 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
652 /* having an empty extent at eof is legal. */
653 if (el->l_recs[i].e_cpos != fe->i_clusters) {
654 ocfs2_error(inode->i_sb,
655 "Dinode %"MLFu64" trailing extent is bad: "
656 "cpos (%u) != number of clusters (%u)",
657 OCFS2_I(inode)->ip_blkno, le32_to_cpu(el->l_recs[i].e_cpos),
658 le32_to_cpu(fe->i_clusters));
659 status = -EIO;
660 goto bail;
661 }
662 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
663 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
664 } else {
665 /* No contiguous record, or no empty record at eof, so
666 * we add a new one. */
667
668 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
669 le16_to_cpu(el->l_count));
670 i = le16_to_cpu(el->l_next_free_rec);
671
672 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
673 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
674 el->l_recs[i].e_cpos = fe->i_clusters;
675 le16_add_cpu(&el->l_next_free_rec, 1);
676 }
677
678 /*
679 * extent_map errors are not fatal, so they are ignored outside
680 * of flushing the thing.
681 */
682 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
683 new_clusters);
684 if (status) {
685 mlog_errno(status);
686 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
687 }
688
689 status = ocfs2_journal_dirty(handle, fe_bh);
690 if (status < 0)
691 mlog_errno(status);
692 if (fe->id2.i_list.l_tree_depth) {
693 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
694 if (status < 0)
695 mlog_errno(status);
696 }
697
698 status = 0;
699bail:
700 if (eb_bhs) {
701 for (i = 0; i < num_bhs; i++)
702 if (eb_bhs[i])
703 brelse(eb_bhs[i]);
704 kfree(eb_bhs);
705 }
706
707 mlog_exit(status);
708 return status;
709}
710
711/*
712 * Should only be called when there is no space left in any of the
713 * leaf nodes. What we want to do is find the lowest tree depth
714 * non-leaf extent block with room for new records. There are three
715 * valid results of this search:
716 *
717 * 1) a lowest extent block is found, then we pass it back in
718 * *lowest_eb_bh and return '0'
719 *
720 * 2) the search fails to find anything, but the dinode has room. We
721 * pass NULL back in *lowest_eb_bh, but still return '0'
722 *
723 * 3) the search fails to find anything AND the dinode is full, in
724 * which case we return > 0
725 *
726 * return status < 0 indicates an error.
727 */
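/*
 * Caller's view of that contract, as ocfs2_insert_extent() below
 * applies it (a sketch):
 *
 *	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
 *	if (shift < 0)		error
 *	else if (shift > 0)	everything full: shift tree depth first
 *	else if (!bh)		the dinode itself still has room
 *	else			add the new branch starting at bh
 */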
728static int ocfs2_find_branch_target(struct ocfs2_super *osb,
729 struct inode *inode,
730 struct buffer_head *fe_bh,
731 struct buffer_head **target_bh)
732{
733 int status = 0, i;
734 u64 blkno;
735 struct ocfs2_dinode *fe;
736 struct ocfs2_extent_block *eb;
737 struct ocfs2_extent_list *el;
738 struct buffer_head *bh = NULL;
739 struct buffer_head *lowest_bh = NULL;
740
741 mlog_entry_void();
742
743 *target_bh = NULL;
744
745 fe = (struct ocfs2_dinode *) fe_bh->b_data;
746 el = &fe->id2.i_list;
747
748 while(le16_to_cpu(el->l_tree_depth) > 1) {
749 if (le16_to_cpu(el->l_next_free_rec) == 0) {
750 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
751 "extent list (next_free_rec == 0)",
752 OCFS2_I(inode)->ip_blkno);
753 status = -EIO;
754 goto bail;
755 }
756 i = le16_to_cpu(el->l_next_free_rec) - 1;
757 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
758 if (!blkno) {
759 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
760 "list where extent # %d has no physical "
761 "block start",
762 OCFS2_I(inode)->ip_blkno, i);
763 status = -EIO;
764 goto bail;
765 }
766
767 if (bh) {
768 brelse(bh);
769 bh = NULL;
770 }
771
772 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
773 inode);
774 if (status < 0) {
775 mlog_errno(status);
776 goto bail;
777 }
778
779 eb = (struct ocfs2_extent_block *) bh->b_data;
780 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
781 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
782 status = -EIO;
783 goto bail;
784 }
785 el = &eb->h_list;
786
787 if (le16_to_cpu(el->l_next_free_rec) <
788 le16_to_cpu(el->l_count)) {
789 if (lowest_bh)
790 brelse(lowest_bh);
791 lowest_bh = bh;
792 get_bh(lowest_bh);
793 }
794 }
795
796 /* If we didn't find one and the fe doesn't have any room,
797 * then return '1' */
798 if (!lowest_bh
799 && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
800 status = 1;
801
802 *target_bh = lowest_bh;
803bail:
804 if (bh)
805 brelse(bh);
806
807 mlog_exit(status);
808 return status;
809}
810
811/* the caller needs to update fe->i_clusters */
812int ocfs2_insert_extent(struct ocfs2_super *osb,
813 struct ocfs2_journal_handle *handle,
814 struct inode *inode,
815 struct buffer_head *fe_bh,
816 u64 start_blk,
817 u32 new_clusters,
818 struct ocfs2_alloc_context *meta_ac)
819{
820 int status, i, shift;
821 struct buffer_head *last_eb_bh = NULL;
822 struct buffer_head *bh = NULL;
823 struct ocfs2_dinode *fe;
824 struct ocfs2_extent_block *eb;
825 struct ocfs2_extent_list *el;
826
827 mlog_entry_void();
828
829 mlog(0, "add %u clusters starting at block %"MLFu64" to "
830 "inode %"MLFu64"\n",
831 new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
832
833 fe = (struct ocfs2_dinode *) fe_bh->b_data;
834 el = &fe->id2.i_list;
835
836 if (el->l_tree_depth) {
837 /* jump to end of tree */
838 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
839 &last_eb_bh, OCFS2_BH_CACHED, inode);
840 if (status < 0) {
841 mlog_exit(status);
842 goto bail;
843 }
844 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
845 el = &eb->h_list;
846 }
847
848 /* Can we allocate without adding/shifting tree bits? */
849 i = le16_to_cpu(el->l_next_free_rec) - 1;
850 if (le16_to_cpu(el->l_next_free_rec) == 0
851 || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
852 || le32_to_cpu(el->l_recs[i].e_clusters) == 0
853 || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
854 goto out_add;
855
856 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
857 "tree now.\n");
858
859 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
860 if (shift < 0) {
861 status = shift;
862 mlog_errno(status);
863 goto bail;
864 }
865
866 /* We traveled all the way to the bottom of the allocation tree
867 * and didn't find room for any more extents - we need to add
868 * another tree level */
869 if (shift) {
870 /* if we hit a leaf, we'd better be empty :) */
871 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
872 le16_to_cpu(el->l_count));
873 BUG_ON(bh);
874 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
875 "(current = %u)\n",
876 le16_to_cpu(fe->id2.i_list.l_tree_depth));
877
878 /* ocfs2_shift_tree_depth will return us a buffer with
879 * the new extent block (so we can pass that to
880 * ocfs2_add_branch). */
881 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
882 meta_ac, &bh);
883 if (status < 0) {
884 mlog_errno(status);
885 goto bail;
886 }
887 /* Special case: we have room now if we shifted from
888 * tree_depth 0 */
889 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
890 goto out_add;
891 }
892
893 /* call ocfs2_add_branch to add the final part of the tree with
894 * the new data. */
895 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
896 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
897 meta_ac);
898 if (status < 0) {
899 mlog_errno(status);
900 goto bail;
901 }
902
903out_add:
904 /* Finally, we can add clusters. */
905 status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
906 start_blk, new_clusters);
907 if (status < 0)
908 mlog_errno(status);
909
910bail:
911 if (bh)
912 brelse(bh);
913
914 if (last_eb_bh)
915 brelse(last_eb_bh);
916
917 mlog_exit(status);
918 return status;
919}
920
921static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
922{
923 struct buffer_head *tl_bh = osb->osb_tl_bh;
924 struct ocfs2_dinode *di;
925 struct ocfs2_truncate_log *tl;
926
927 di = (struct ocfs2_dinode *) tl_bh->b_data;
928 tl = &di->id2.i_dealloc;
929
930 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
931 "slot %d, invalid truncate log parameters: used = "
932 "%u, count = %u\n", osb->slot_num,
933 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
934 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
935}
936
937static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
938 unsigned int new_start)
939{
940 unsigned int tail_index;
941 unsigned int current_tail;
942
943 /* No records, nothing to coalesce */
944 if (!le16_to_cpu(tl->tl_used))
945 return 0;
946
947 tail_index = le16_to_cpu(tl->tl_used) - 1;
948 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
949 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
950
951 return current_tail == new_start;
952}
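/*
 * Worked example: a tail record with t_start == 100 and
 * t_clusters == 4 covers clusters 100..103, so current_tail == 104
 * and only new_start == 104 can be coalesced onto it.
 */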
953
954static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
955 struct ocfs2_journal_handle *handle,
956 u64 start_blk,
957 unsigned int num_clusters)
958{
959 int status, index;
960 unsigned int start_cluster, tl_count;
961 struct inode *tl_inode = osb->osb_tl_inode;
962 struct buffer_head *tl_bh = osb->osb_tl_bh;
963 struct ocfs2_dinode *di;
964 struct ocfs2_truncate_log *tl;
965
966 mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
967 num_clusters);
968
969 BUG_ON(!down_trylock(&tl_inode->i_sem));
970
971 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
972
973 di = (struct ocfs2_dinode *) tl_bh->b_data;
974 tl = &di->id2.i_dealloc;
975 if (!OCFS2_IS_VALID_DINODE(di)) {
976 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
977 status = -EIO;
978 goto bail;
979 }
980
981 tl_count = le16_to_cpu(tl->tl_count);
982 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
983 tl_count == 0,
984 "Truncate record count on #%"MLFu64" invalid ("
985 "wanted %u, actual %u)\n", OCFS2_I(tl_inode)->ip_blkno,
986 ocfs2_truncate_recs_per_inode(osb->sb),
987 le16_to_cpu(tl->tl_count));
988
989 /* Caller should have known to flush before calling us. */
990 index = le16_to_cpu(tl->tl_used);
991 if (index >= tl_count) {
992 status = -ENOSPC;
993 mlog_errno(status);
994 goto bail;
995 }
996
997 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
998 OCFS2_JOURNAL_ACCESS_WRITE);
999 if (status < 0) {
1000 mlog_errno(status);
1001 goto bail;
1002 }
1003
1004 mlog(0, "Log truncate of %u clusters starting at cluster %u to "
1005 "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
1006 OCFS2_I(tl_inode)->ip_blkno, index);
1007
1008 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
1009 /*
1010 * Move index back to the record we are coalescing with.
1011 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
1012 */
1013 index--;
1014
1015 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
1016 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
1017 index, le32_to_cpu(tl->tl_recs[index].t_start),
1018 num_clusters);
1019 } else {
1020 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
1021 tl->tl_used = cpu_to_le16(index + 1);
1022 }
1023 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
1024
1025 status = ocfs2_journal_dirty(handle, tl_bh);
1026 if (status < 0) {
1027 mlog_errno(status);
1028 goto bail;
1029 }
1030
1031bail:
1032 mlog_exit(status);
1033 return status;
1034}
1035
1036static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
1037 struct ocfs2_journal_handle *handle,
1038 struct inode *data_alloc_inode,
1039 struct buffer_head *data_alloc_bh)
1040{
1041 int status = 0;
1042 int i;
1043 unsigned int num_clusters;
1044 u64 start_blk;
1045 struct ocfs2_truncate_rec rec;
1046 struct ocfs2_dinode *di;
1047 struct ocfs2_truncate_log *tl;
1048 struct inode *tl_inode = osb->osb_tl_inode;
1049 struct buffer_head *tl_bh = osb->osb_tl_bh;
1050
1051 mlog_entry_void();
1052
1053 di = (struct ocfs2_dinode *) tl_bh->b_data;
1054 tl = &di->id2.i_dealloc;
1055 i = le16_to_cpu(tl->tl_used) - 1;
1056 while (i >= 0) {
1057 /* Caller has given us at least enough credits to
1058 * update the truncate log dinode */
1059 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
1060 OCFS2_JOURNAL_ACCESS_WRITE);
1061 if (status < 0) {
1062 mlog_errno(status);
1063 goto bail;
1064 }
1065
1066 tl->tl_used = cpu_to_le16(i);
1067
1068 status = ocfs2_journal_dirty(handle, tl_bh);
1069 if (status < 0) {
1070 mlog_errno(status);
1071 goto bail;
1072 }
1073
1074 /* TODO: Perhaps we can calculate the bulk of the
1075 * credits up front rather than extending like
1076 * this. */
1077 status = ocfs2_extend_trans(handle,
1078 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 rec = tl->tl_recs[i];
1085 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
1086 le32_to_cpu(rec.t_start));
1087 num_clusters = le32_to_cpu(rec.t_clusters);
1088
1089 /* if start_blk is not set, we ignore the record as
1090 * invalid. */
1091 if (start_blk) {
1092 mlog(0, "free record %d, start = %u, clusters = %u\n",
1093 i, le32_to_cpu(rec.t_start), num_clusters);
1094
1095 status = ocfs2_free_clusters(handle, data_alloc_inode,
1096 data_alloc_bh, start_blk,
1097 num_clusters);
1098 if (status < 0) {
1099 mlog_errno(status);
1100 goto bail;
1101 }
1102 }
1103 i--;
1104 }
1105
1106bail:
1107 mlog_exit(status);
1108 return status;
1109}
1110
1111/* Expects you to already be holding tl_inode->i_sem */
1112static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1113{
1114 int status;
1115 unsigned int num_to_flush;
1116 struct ocfs2_journal_handle *handle = NULL;
1117 struct inode *tl_inode = osb->osb_tl_inode;
1118 struct inode *data_alloc_inode = NULL;
1119 struct buffer_head *tl_bh = osb->osb_tl_bh;
1120 struct buffer_head *data_alloc_bh = NULL;
1121 struct ocfs2_dinode *di;
1122 struct ocfs2_truncate_log *tl;
1123
1124 mlog_entry_void();
1125
1126 BUG_ON(!down_trylock(&tl_inode->i_sem));
1127
1128 di = (struct ocfs2_dinode *) tl_bh->b_data;
1129 tl = &di->id2.i_dealloc;
1130 if (!OCFS2_IS_VALID_DINODE(di)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
1132 status = -EIO;
1133 goto bail;
1134 }
1135
1136 num_to_flush = le16_to_cpu(tl->tl_used);
1137 mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
1138 num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
1139 if (!num_to_flush) {
1140 status = 0;
1141 goto bail;
1142 }
1143
1144 handle = ocfs2_alloc_handle(osb);
1145 if (!handle) {
1146 status = -ENOMEM;
1147 mlog_errno(status);
1148 goto bail;
1149 }
1150
1151 data_alloc_inode = ocfs2_get_system_file_inode(osb,
1152 GLOBAL_BITMAP_SYSTEM_INODE,
1153 OCFS2_INVALID_SLOT);
1154 if (!data_alloc_inode) {
1155 status = -EINVAL;
1156 mlog(ML_ERROR, "Could not get bitmap inode!\n");
1157 goto bail;
1158 }
1159
1160 ocfs2_handle_add_inode(handle, data_alloc_inode);
1161 status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
1162 if (status < 0) {
1163 mlog_errno(status);
1164 goto bail;
1165 }
1166
1167 handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
1168 if (IS_ERR(handle)) {
1169 status = PTR_ERR(handle);
1170 handle = NULL;
1171 mlog_errno(status);
1172 goto bail;
1173 }
1174
1175 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
1176 data_alloc_bh);
1177 if (status < 0) {
1178 mlog_errno(status);
1179 goto bail;
1180 }
1181
1182bail:
1183 if (handle)
1184 ocfs2_commit_trans(handle);
1185
1186 if (data_alloc_inode)
1187 iput(data_alloc_inode);
1188
1189 if (data_alloc_bh)
1190 brelse(data_alloc_bh);
1191
1192 mlog_exit(status);
1193 return status;
1194}
1195
1196int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1197{
1198 int status;
1199 struct inode *tl_inode = osb->osb_tl_inode;
1200
1201 down(&tl_inode->i_sem);
1202 status = __ocfs2_flush_truncate_log(osb);
1203 up(&tl_inode->i_sem);
1204
1205 return status;
1206}
1207
1208static void ocfs2_truncate_log_worker(void *data)
1209{
1210 int status;
1211 struct ocfs2_super *osb = data;
1212
1213 mlog_entry_void();
1214
1215 status = ocfs2_flush_truncate_log(osb);
1216 if (status < 0)
1217 mlog_errno(status);
1218
1219 mlog_exit(status);
1220}
1221
1222#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
1223void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
1224 int cancel)
1225{
1226 if (osb->osb_tl_inode) {
1227 /* We want to push off log flushes while truncates are
1228 * still running. */
1229 if (cancel)
1230 cancel_delayed_work(&osb->osb_truncate_log_wq);
1231
1232 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
1233 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
1234 }
1235}
1236
1237static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
1238 int slot_num,
1239 struct inode **tl_inode,
1240 struct buffer_head **tl_bh)
1241{
1242 int status;
1243 struct inode *inode = NULL;
1244 struct buffer_head *bh = NULL;
1245
1246 inode = ocfs2_get_system_file_inode(osb,
1247 TRUNCATE_LOG_SYSTEM_INODE,
1248 slot_num);
1249 if (!inode) {
1250 status = -EINVAL;
1251 mlog(ML_ERROR, "Could not get truncate log inode!\n");
1252 goto bail;
1253 }
1254
1255 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
1256 OCFS2_BH_CACHED, inode);
1257 if (status < 0) {
1258 iput(inode);
1259 mlog_errno(status);
1260 goto bail;
1261 }
1262
1263 *tl_inode = inode;
1264 *tl_bh = bh;
1265bail:
1266 mlog_exit(status);
1267 return status;
1268}
1269
1270/* called during the 1st stage of node recovery. we stamp a clean
1271 * truncate log and pass back a copy for processing later. if the
1272 * truncate log does not require processing, *tl_copy is set to
1273 * NULL. */
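/*
 * Expected use from recovery (a sketch; it assumes the caller owns
 * and eventually kfree()s the copy):
 *
 *	status = ocfs2_begin_truncate_log_recovery(osb, slot, &tl_copy);
 *	...
 *	if (tl_copy) {
 *		ocfs2_complete_truncate_log_recovery(osb, tl_copy);
 *		kfree(tl_copy);
 *	}
 */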
1274int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
1275 int slot_num,
1276 struct ocfs2_dinode **tl_copy)
1277{
1278 int status;
1279 struct inode *tl_inode = NULL;
1280 struct buffer_head *tl_bh = NULL;
1281 struct ocfs2_dinode *di;
1282 struct ocfs2_truncate_log *tl;
1283
1284 *tl_copy = NULL;
1285
1286 mlog(0, "recover truncate log from slot %d\n", slot_num);
1287
1288 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
1289 if (status < 0) {
1290 mlog_errno(status);
1291 goto bail;
1292 }
1293
1294 di = (struct ocfs2_dinode *) tl_bh->b_data;
1295 tl = &di->id2.i_dealloc;
1296 if (!OCFS2_IS_VALID_DINODE(di)) {
1297 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
1298 status = -EIO;
1299 goto bail;
1300 }
1301
1302 if (le16_to_cpu(tl->tl_used)) {
1303 mlog(0, "We'll have %u logs to recover\n",
1304 le16_to_cpu(tl->tl_used));
1305
1306 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
1307 if (!(*tl_copy)) {
1308 status = -ENOMEM;
1309 mlog_errno(status);
1310 goto bail;
1311 }
1312
1313 /* Assuming the write-out below goes well, this copy
1314 * will be passed back to recovery for processing. */
1315 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
1316
1317 /* All we need to do to clear the truncate log is set
1318 * tl_used. */
1319 tl->tl_used = 0;
1320
1321 status = ocfs2_write_block(osb, tl_bh, tl_inode);
1322 if (status < 0) {
1323 mlog_errno(status);
1324 goto bail;
1325 }
1326 }
1327
1328bail:
1329 if (tl_inode)
1330 iput(tl_inode);
1331 if (tl_bh)
1332 brelse(tl_bh);
1333
1334 if (status < 0 && (*tl_copy)) {
1335 kfree(*tl_copy);
1336 *tl_copy = NULL;
1337 }
1338
1339 mlog_exit(status);
1340 return status;
1341}
1342
1343int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1344 struct ocfs2_dinode *tl_copy)
1345{
1346 int status = 0;
1347 int i;
1348 unsigned int clusters, num_recs, start_cluster;
1349 u64 start_blk;
1350 struct ocfs2_journal_handle *handle;
1351 struct inode *tl_inode = osb->osb_tl_inode;
1352 struct ocfs2_truncate_log *tl;
1353
1354 mlog_entry_void();
1355
1356 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
1357 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
1358 return -EINVAL;
1359 }
1360
1361 tl = &tl_copy->id2.i_dealloc;
1362 num_recs = le16_to_cpu(tl->tl_used);
1363 mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs,
1364 le64_to_cpu(tl_copy->i_blkno));
1365
1366 down(&tl_inode->i_sem);
1367 for(i = 0; i < num_recs; i++) {
1368 if (ocfs2_truncate_log_needs_flush(osb)) {
1369 status = __ocfs2_flush_truncate_log(osb);
1370 if (status < 0) {
1371 mlog_errno(status);
1372 goto bail_up;
1373 }
1374 }
1375
1376 handle = ocfs2_start_trans(osb, NULL,
1377 OCFS2_TRUNCATE_LOG_UPDATE);
1378 if (IS_ERR(handle)) {
1379 status = PTR_ERR(handle);
1380 mlog_errno(status);
1381 goto bail_up;
1382 }
1383
1384 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
1385 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
1386 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
1387
1388 status = ocfs2_truncate_log_append(osb, handle,
1389 start_blk, clusters);
1390 ocfs2_commit_trans(handle);
1391 if (status < 0) {
1392 mlog_errno(status);
1393 goto bail_up;
1394 }
1395 }
1396
1397bail_up:
1398 up(&tl_inode->i_sem);
1399
1400 mlog_exit(status);
1401 return status;
1402}
1403
1404void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
1405{
1406 int status;
1407 struct inode *tl_inode = osb->osb_tl_inode;
1408
1409 mlog_entry_void();
1410
1411 if (tl_inode) {
1412 cancel_delayed_work(&osb->osb_truncate_log_wq);
1413 flush_workqueue(ocfs2_wq);
1414
1415 status = ocfs2_flush_truncate_log(osb);
1416 if (status < 0)
1417 mlog_errno(status);
1418
1419 brelse(osb->osb_tl_bh);
1420 iput(osb->osb_tl_inode);
1421 }
1422
1423 mlog_exit_void();
1424}
1425
1426int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1427{
1428 int status;
1429 struct inode *tl_inode = NULL;
1430 struct buffer_head *tl_bh = NULL;
1431
1432 mlog_entry_void();
1433
1434 status = ocfs2_get_truncate_log_info(osb,
1435 osb->slot_num,
1436 &tl_inode,
1437 &tl_bh);
1438 if (status < 0)
1439 mlog_errno(status);
1440
1441 /* ocfs2_truncate_log_shutdown keys on the existence of
1442 * osb->osb_tl_inode so we don't set any of the osb variables
1443 * until we're sure all is well. */
1444 INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb);
1445 osb->osb_tl_bh = tl_bh;
1446 osb->osb_tl_inode = tl_inode;
1447
1448 mlog_exit(status);
1449 return status;
1450}
1451
1452/* This function will figure out whether the currently last extent
1453 * block will be deleted, and if it will, what the new last extent
1454 * block will be so we can update its h_next_leaf_blk field, as well
1455 * as the dinode's i_last_eb_blk */
1456static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
1457 struct inode *inode,
1458 struct ocfs2_dinode *fe,
1459 u32 new_i_clusters,
1460 struct buffer_head *old_last_eb,
1461 struct buffer_head **new_last_eb)
1462{
1463 int i, status = 0;
1464 u64 block = 0;
1465 struct ocfs2_extent_block *eb;
1466 struct ocfs2_extent_list *el;
1467 struct buffer_head *bh = NULL;
1468
1469 *new_last_eb = NULL;
1470
1471 if (!OCFS2_IS_VALID_DINODE(fe)) {
1472 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1473 status = -EIO;
1474 goto bail;
1475 }
1476
1477 /* we have no tree, so of course, no last_eb. */
1478 if (!fe->id2.i_list.l_tree_depth)
1479 goto bail;
1480
1481 /* trunc to zero special case - this makes tree_depth = 0
1482 * regardless of what it is. */
1483 if (!new_i_clusters)
1484 goto bail;
1485
1486 eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
1487 el = &(eb->h_list);
1488 BUG_ON(!el->l_next_free_rec);
1489
1490 /* Make sure that this guy will actually be empty after we
1491 * clear away the data. */
1492 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1493 goto bail;
1494
1495 /* Ok, at this point, we know that last_eb will definitely
1496	 * change, so let's traverse the tree and find the second to
1497 * last extent block. */
1498 el = &(fe->id2.i_list);
1499	/* go down the tree */
1500 do {
1501 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1502 if (le32_to_cpu(el->l_recs[i].e_cpos) <
1503 new_i_clusters) {
1504 block = le64_to_cpu(el->l_recs[i].e_blkno);
1505 break;
1506 }
1507 }
1508 BUG_ON(i < 0);
1509
1510 if (bh) {
1511 brelse(bh);
1512 bh = NULL;
1513 }
1514
1515 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
1516 inode);
1517 if (status < 0) {
1518 mlog_errno(status);
1519 goto bail;
1520 }
1521 eb = (struct ocfs2_extent_block *) bh->b_data;
1522 el = &eb->h_list;
1523 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1524 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1525 status = -EIO;
1526 goto bail;
1527 }
1528 } while (el->l_tree_depth);
1529
1530 *new_last_eb = bh;
1531 get_bh(*new_last_eb);
1532 mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno));
1533bail:
1534 if (bh)
1535 brelse(bh);
1536
1537 return status;
1538}
1539
1540static int ocfs2_do_truncate(struct ocfs2_super *osb,
1541 unsigned int clusters_to_del,
1542 struct inode *inode,
1543 struct buffer_head *fe_bh,
1544 struct buffer_head *old_last_eb_bh,
1545 struct ocfs2_journal_handle *handle,
1546 struct ocfs2_truncate_context *tc)
1547{
1548 int status, i, depth;
1549 struct ocfs2_dinode *fe;
1550 struct ocfs2_extent_block *eb;
1551 struct ocfs2_extent_block *last_eb = NULL;
1552 struct ocfs2_extent_list *el;
1553 struct buffer_head *eb_bh = NULL;
1554 struct buffer_head *last_eb_bh = NULL;
1555 u64 next_eb = 0;
1556 u64 delete_blk = 0;
1557
1558 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1559
1560 status = ocfs2_find_new_last_ext_blk(osb,
1561 inode,
1562 fe,
1563 le32_to_cpu(fe->i_clusters) -
1564 clusters_to_del,
1565 old_last_eb_bh,
1566 &last_eb_bh);
1567 if (status < 0) {
1568 mlog_errno(status);
1569 goto bail;
1570 }
1571 if (last_eb_bh)
1572 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1573
1574 status = ocfs2_journal_access(handle, inode, fe_bh,
1575 OCFS2_JOURNAL_ACCESS_WRITE);
1576 if (status < 0) {
1577 mlog_errno(status);
1578 goto bail;
1579 }
1580 el = &(fe->id2.i_list);
1581
1582 spin_lock(&OCFS2_I(inode)->ip_lock);
1583 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1584 clusters_to_del;
1585 spin_unlock(&OCFS2_I(inode)->ip_lock);
1586 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1587 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1588 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
1589
1590 i = le16_to_cpu(el->l_next_free_rec) - 1;
1591
1592 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1593 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1594 /* tree depth zero, we can just delete the clusters, otherwise
1595 * we need to record the offset of the next level extent block
1596 * as we may overwrite it. */
1597 if (!el->l_tree_depth)
1598 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1599 + ocfs2_clusters_to_blocks(osb->sb,
1600 le32_to_cpu(el->l_recs[i].e_clusters));
1601 else
1602 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1603
1604 if (!el->l_recs[i].e_clusters) {
1605 /* if we deleted the whole extent record, then clear
1606 * out the other fields and update the extent
1607 * list. For depth > 0 trees, we've already recorded
1608 * the extent block in 'next_eb' */
1609 el->l_recs[i].e_cpos = 0;
1610 el->l_recs[i].e_blkno = 0;
1611 BUG_ON(!el->l_next_free_rec);
1612 le16_add_cpu(&el->l_next_free_rec, -1);
1613 }
1614
1615 depth = le16_to_cpu(el->l_tree_depth);
1616 if (!fe->i_clusters) {
1617 /* trunc to zero is a special case. */
1618 el->l_tree_depth = 0;
1619 fe->i_last_eb_blk = 0;
1620 } else if (last_eb)
1621 fe->i_last_eb_blk = last_eb->h_blkno;
1622
1623 status = ocfs2_journal_dirty(handle, fe_bh);
1624 if (status < 0) {
1625 mlog_errno(status);
1626 goto bail;
1627 }
1628
1629 if (last_eb) {
1630 /* If there will be a new last extent block, then by
1631 * definition, there cannot be any leaves to the right of
1632	 * it. */
1633 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1634 OCFS2_JOURNAL_ACCESS_WRITE);
1635 if (status < 0) {
1636 mlog_errno(status);
1637 goto bail;
1638 }
1639 last_eb->h_next_leaf_blk = 0;
1640 status = ocfs2_journal_dirty(handle, last_eb_bh);
1641 if (status < 0) {
1642 mlog_errno(status);
1643 goto bail;
1644 }
1645 }
1646
1647 /* if our tree depth > 0, update all the tree blocks below us. */
1648 while (depth) {
1649 mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n",
1650 depth, next_eb);
1651 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1652 OCFS2_BH_CACHED, inode);
1653 if (status < 0) {
1654 mlog_errno(status);
1655 goto bail;
1656 }
1657 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1658 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1659 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1660 status = -EIO;
1661 goto bail;
1662 }
1663 el = &(eb->h_list);
1664
1665 status = ocfs2_journal_access(handle, inode, eb_bh,
1666 OCFS2_JOURNAL_ACCESS_WRITE);
1667 if (status < 0) {
1668 mlog_errno(status);
1669 goto bail;
1670 }
1671
1672 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1673 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
1674
1675 i = le16_to_cpu(el->l_next_free_rec) - 1;
1676
1677 mlog(0, "extent block %"MLFu64", before: record %d: "
1678 "(%u, %u, %"MLFu64"), next = %u\n",
1679 le64_to_cpu(eb->h_blkno), i,
1680 le32_to_cpu(el->l_recs[i].e_cpos),
1681 le32_to_cpu(el->l_recs[i].e_clusters),
1682 le64_to_cpu(el->l_recs[i].e_blkno),
1683 le16_to_cpu(el->l_next_free_rec));
1684
1685 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1686 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1687
1688 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1689 /* bottom-most block requires us to delete data.*/
1690 if (!el->l_tree_depth)
1691 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1692 + ocfs2_clusters_to_blocks(osb->sb,
1693 le32_to_cpu(el->l_recs[i].e_clusters));
1694 if (!el->l_recs[i].e_clusters) {
1695 el->l_recs[i].e_cpos = 0;
1696 el->l_recs[i].e_blkno = 0;
1697 BUG_ON(!el->l_next_free_rec);
1698 le16_add_cpu(&el->l_next_free_rec, -1);
1699 }
1700 mlog(0, "extent block %"MLFu64", after: record %d: "
1701 "(%u, %u, %"MLFu64"), next = %u\n",
1702 le64_to_cpu(eb->h_blkno), i,
1703 le32_to_cpu(el->l_recs[i].e_cpos),
1704 le32_to_cpu(el->l_recs[i].e_clusters),
1705 le64_to_cpu(el->l_recs[i].e_blkno),
1706 le16_to_cpu(el->l_next_free_rec));
1707
1708 status = ocfs2_journal_dirty(handle, eb_bh);
1709 if (status < 0) {
1710 mlog_errno(status);
1711 goto bail;
1712 }
1713
1714 if (!el->l_next_free_rec) {
1715 mlog(0, "deleting this extent block.\n");
1716
1717 ocfs2_remove_from_cache(inode, eb_bh);
1718
1719 BUG_ON(eb->h_suballoc_slot);
1720 BUG_ON(el->l_recs[0].e_clusters);
1721 BUG_ON(el->l_recs[0].e_cpos);
1722 BUG_ON(el->l_recs[0].e_blkno);
1723 status = ocfs2_free_extent_block(handle,
1724 tc->tc_ext_alloc_inode,
1725 tc->tc_ext_alloc_bh,
1726 eb);
1727 if (status < 0) {
1728 mlog_errno(status);
1729 goto bail;
1730 }
1731 }
1732 brelse(eb_bh);
1733 eb_bh = NULL;
1734 depth--;
1735 }
1736
1737 BUG_ON(!delete_blk);
1738 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1739 clusters_to_del);
1740 if (status < 0) {
1741 mlog_errno(status);
1742 goto bail;
1743 }
1744 status = 0;
1745bail:
1746 if (!status)
1747 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1748 else
1749 ocfs2_extent_map_drop(inode, 0);
1750 mlog_exit(status);
1751 return status;
1752}
1753
1754/*
1755 * It is expected that, by the time you call this function,
1756 * inode->i_size and fe->i_size have been adjusted.
1757 *
1758 * WARNING: This will kfree the truncate context
1759 */
1760int ocfs2_commit_truncate(struct ocfs2_super *osb,
1761 struct inode *inode,
1762 struct buffer_head *fe_bh,
1763 struct ocfs2_truncate_context *tc)
1764{
1765 int status, i, credits, tl_sem = 0;
1766 u32 clusters_to_del, target_i_clusters;
1767 u64 last_eb = 0;
1768 struct ocfs2_dinode *fe;
1769 struct ocfs2_extent_block *eb;
1770 struct ocfs2_extent_list *el;
1771 struct buffer_head *last_eb_bh;
1772 struct ocfs2_journal_handle *handle = NULL;
1773 struct inode *tl_inode = osb->osb_tl_inode;
1774
1775 mlog_entry_void();
1776
1777 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1778
1779 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1780 i_size_read(inode));
1781
1782 last_eb_bh = tc->tc_last_eb_bh;
1783 tc->tc_last_eb_bh = NULL;
1784
1785 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1786
1787 if (fe->id2.i_list.l_tree_depth) {
1788 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1789 el = &eb->h_list;
1790 } else
1791 el = &fe->id2.i_list;
1792 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1793start:
1794 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
1795 "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", "
1796 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
1797 le32_to_cpu(fe->i_clusters), last_eb,
1798 le64_to_cpu(fe->i_last_eb_blk),
1799 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
1800
1801 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1802 mlog(0, "last_eb changed!\n");
1803 BUG_ON(!fe->id2.i_list.l_tree_depth);
1804 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1805 /* i_last_eb_blk may have changed, read it if
1806 * necessary. We don't have to worry about the
1807	 * truncate to zero case here (where there is no longer a
1808 * last_eb) because we never loop back after our work
1809 * is done. */
1810 if (last_eb_bh) {
1811 brelse(last_eb_bh);
1812 last_eb_bh = NULL;
1813 }
1814
1815 status = ocfs2_read_block(osb, last_eb,
1816 &last_eb_bh, OCFS2_BH_CACHED,
1817 inode);
1818 if (status < 0) {
1819 mlog_errno(status);
1820 goto bail;
1821 }
1822 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1823 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1824 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1825 status = -EIO;
1826 goto bail;
1827 }
1828 el = &(eb->h_list);
1829 }
1830
1831 /* by now, el will point to the extent list on the bottom most
1832 * portion of this tree. */
1833 i = le16_to_cpu(el->l_next_free_rec) - 1;
1834 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
1835 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1836 else
1837 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1838 le32_to_cpu(el->l_recs[i].e_cpos)) -
1839 target_i_clusters;
1840
1841 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
1842
1843 down(&tl_inode->i_sem);
1844 tl_sem = 1;
1845	/* ocfs2_truncate_log_needs_flush tells us whether at least one
1846	 * record is still free for use; if there isn't any, we flush to
1847	 * get an empty truncate log. */
1848 if (ocfs2_truncate_log_needs_flush(osb)) {
1849 status = __ocfs2_flush_truncate_log(osb);
1850 if (status < 0) {
1851 mlog_errno(status);
1852 goto bail;
1853 }
1854 }
1855
1856 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1857 fe, el);
1858 handle = ocfs2_start_trans(osb, NULL, credits);
1859 if (IS_ERR(handle)) {
1860 status = PTR_ERR(handle);
1861 handle = NULL;
1862 mlog_errno(status);
1863 goto bail;
1864 }
1865
1866 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1867 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1868 if (status < 0)
1869 mlog_errno(status);
1870
1871 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1872 last_eb_bh, handle, tc);
1873 if (status < 0) {
1874 mlog_errno(status);
1875 goto bail;
1876 }
1877
1878 up(&tl_inode->i_sem);
1879 tl_sem = 0;
1880
1881 ocfs2_commit_trans(handle);
1882 handle = NULL;
1883
1884 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
1885 if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
1886 goto start;
1887bail:
1888 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1889
1890 ocfs2_schedule_truncate_log_flush(osb, 1);
1891
1892 if (tl_sem)
1893 up(&tl_inode->i_sem);
1894
1895 if (handle)
1896 ocfs2_commit_trans(handle);
1897
1898 if (last_eb_bh)
1899 brelse(last_eb_bh);
1900
1901 /* This will drop the ext_alloc cluster lock for us */
1902 ocfs2_free_truncate_context(tc);
1903
1904 mlog_exit(status);
1905 return status;
1906}
1907
1908
1909/*
1910 * Expects the inode to already be locked. This will figure out which
1911 * inodes need to be locked and will put them on the returned truncate
1912 * context.
1913 */
1914int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1915 struct inode *inode,
1916 struct buffer_head *fe_bh,
1917 struct ocfs2_truncate_context **tc)
1918{
1919 int status, metadata_delete;
1920 unsigned int new_i_clusters;
1921 struct ocfs2_dinode *fe;
1922 struct ocfs2_extent_block *eb;
1923 struct ocfs2_extent_list *el;
1924 struct buffer_head *last_eb_bh = NULL;
1925 struct inode *ext_alloc_inode = NULL;
1926 struct buffer_head *ext_alloc_bh = NULL;
1927
1928 mlog_entry_void();
1929
1930 *tc = NULL;
1931
1932 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1933 i_size_read(inode));
1934 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1935
1936	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = %"MLFu64"\n",
1937	     le32_to_cpu(fe->i_clusters), new_i_clusters, le64_to_cpu(fe->i_size));
1938
1939 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1940 ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count "
1941 "%u and size %"MLFu64" whereas struct inode has "
1942 "cluster count %u and size %llu which caused an "
1943 "invalid truncate to %u clusters.",
1944 le64_to_cpu(fe->i_blkno),
1945 le32_to_cpu(fe->i_clusters),
1946 le64_to_cpu(fe->i_size),
1947 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1948 new_i_clusters);
1949 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
1950 status = -EIO;
1951 goto bail;
1952 }
1953
1954	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1955 if (!(*tc)) {
1956 status = -ENOMEM;
1957 mlog_errno(status);
1958 goto bail;
1959 }
1960
1961 metadata_delete = 0;
1962 if (fe->id2.i_list.l_tree_depth) {
1963 /* If we have a tree, then the truncate may result in
1964 * metadata deletes. Figure this out from the
1965 * rightmost leaf block.*/
1966 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
1967 &last_eb_bh, OCFS2_BH_CACHED, inode);
1968 if (status < 0) {
1969 mlog_errno(status);
1970 goto bail;
1971 }
1972 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1973 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1974 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1975
1976 brelse(last_eb_bh);
1977 status = -EIO;
1978 goto bail;
1979 }
1980 el = &(eb->h_list);
1981 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
1982 metadata_delete = 1;
1983 }
1984
1985 (*tc)->tc_last_eb_bh = last_eb_bh;
1986
1987 if (metadata_delete) {
1988 mlog(0, "Will have to delete metadata for this trunc. "
1989 "locking allocator.\n");
1990 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
1991 if (!ext_alloc_inode) {
1992 status = -ENOMEM;
1993 mlog_errno(status);
1994 goto bail;
1995 }
1996
1997 down(&ext_alloc_inode->i_sem);
1998 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
1999
2000 status = ocfs2_meta_lock(ext_alloc_inode,
2001 NULL,
2002 &ext_alloc_bh,
2003 1);
2004 if (status < 0) {
2005 mlog_errno(status);
2006 goto bail;
2007 }
2008 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
2009 (*tc)->tc_ext_alloc_locked = 1;
2010 }
2011
2012 status = 0;
2013bail:
2014 if (status < 0) {
2015 if (*tc)
2016 ocfs2_free_truncate_context(*tc);
2017 *tc = NULL;
2018 }
2019 mlog_exit_void();
2020 return status;
2021}
2022
2023static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
2024{
2025 if (tc->tc_ext_alloc_inode) {
2026 if (tc->tc_ext_alloc_locked)
2027 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
2028
2029 up(&tc->tc_ext_alloc_inode->i_sem);
2030 iput(tc->tc_ext_alloc_inode);
2031 }
2032
2033 if (tc->tc_ext_alloc_bh)
2034 brelse(tc->tc_ext_alloc_bh);
2035
2036 if (tc->tc_last_eb_bh)
2037 brelse(tc->tc_last_eb_bh);
2038
2039 kfree(tc);
2040}
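
(A minimal illustrative sketch, not part of the patch: how the two truncate
entry points above compose. The caller name example_truncate() is invented;
it assumes, as ocfs2_prepare_truncate() requires, that the inode is already
cluster locked.)

	static int example_truncate(struct ocfs2_super *osb, struct inode *inode,
				    struct buffer_head *fe_bh)
	{
		struct ocfs2_truncate_context *tc = NULL;
		int status;

		/* figures out which allocator inodes need locking and
		 * takes those locks */
		status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
		if (status < 0)
			return status;

		/* removes extents one transaction at a time until
		 * i_clusters hits the target; frees 'tc' (and drops the
		 * ext_alloc cluster lock) on both success and error */
		return ocfs2_commit_truncate(osb, inode, fe_bh, tc);
	}
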
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
new file mode 100644
index 000000000000..12ba897743f4
--- /dev/null
+++ b/fs/ocfs2/alloc.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_ALLOC_H
27#define OCFS2_ALLOC_H
28
29struct ocfs2_alloc_context;
30int ocfs2_insert_extent(struct ocfs2_super *osb,
31 struct ocfs2_journal_handle *handle,
32 struct inode *inode,
33 struct buffer_head *fe_bh,
34 u64 blkno,
35 u32 new_clusters,
36 struct ocfs2_alloc_context *meta_ac);
37int ocfs2_num_free_extents(struct ocfs2_super *osb,
38 struct inode *inode,
39 struct ocfs2_dinode *fe);
40/* how many new metadata chunks would an allocation need at maximum? */
41static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
42{
43 /*
44 * Rather than do all the work of determining how much we need
45 * (involves a ton of reads and locks), just ask for the
46	 * maximal limit. That's a tree depth shift. So, one block for
47	 * each level of the tree (current l_tree_depth), one block for
48	 * the new tree_depth==0 extent_block, and one block at the new
49	 * top of the tree.
50 */
51 return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
52}
53
54int ocfs2_truncate_log_init(struct ocfs2_super *osb);
55void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
56void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
57 int cancel);
58int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
59int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
60 int slot_num,
61 struct ocfs2_dinode **tl_copy);
62int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
63 struct ocfs2_dinode *tl_copy);
64
65struct ocfs2_truncate_context {
66 struct inode *tc_ext_alloc_inode;
67 struct buffer_head *tc_ext_alloc_bh;
68 int tc_ext_alloc_locked; /* is it cluster locked? */
69 /* these get destroyed once it's passed to ocfs2_commit_truncate. */
70 struct buffer_head *tc_last_eb_bh;
71};
72
73int ocfs2_prepare_truncate(struct ocfs2_super *osb,
74 struct inode *inode,
75 struct buffer_head *fe_bh,
76 struct ocfs2_truncate_context **tc);
77int ocfs2_commit_truncate(struct ocfs2_super *osb,
78 struct inode *inode,
79 struct buffer_head *fe_bh,
80 struct ocfs2_truncate_context *tc);
81
82#endif /* OCFS2_ALLOC_H */
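
(For illustration only: a small user-space model of ocfs2_extend_meta_needed()
above. The standalone names and sample depths are invented for the example.)

	#include <stdio.h>

	/* same arithmetic as the inline above: one extent block per
	 * existing tree level, plus the new depth-0 leaf, plus the new
	 * top of the tree */
	static int extend_meta_needed(int l_tree_depth)
	{
		return l_tree_depth + 2;
	}

	int main(void)
	{
		int depth;

		for (depth = 0; depth <= 3; depth++)
			printf("tree depth %d -> at most %d new metadata blocks\n",
			       depth, extend_meta_needed(depth));
		return 0;
	}
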
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
new file mode 100644
index 000000000000..8f4467a930a5
--- /dev/null
+++ b/fs/ocfs2/aops.c
@@ -0,0 +1,643 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/fs.h>
23#include <linux/slab.h>
24#include <linux/highmem.h>
25#include <linux/pagemap.h>
26#include <asm/byteorder.h>
27
28#define MLOG_MASK_PREFIX ML_FILE_IO
29#include <cluster/masklog.h>
30
31#include "ocfs2.h"
32
33#include "alloc.h"
34#include "aops.h"
35#include "dlmglue.h"
36#include "extent_map.h"
37#include "file.h"
38#include "inode.h"
39#include "journal.h"
40#include "super.h"
41#include "symlink.h"
42
43#include "buffer_head_io.h"
44
45static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
46 struct buffer_head *bh_result, int create)
47{
48 int err = -EIO;
49 int status;
50 struct ocfs2_dinode *fe = NULL;
51 struct buffer_head *bh = NULL;
52 struct buffer_head *buffer_cache_bh = NULL;
53 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
54 void *kaddr;
55
56 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
57 (unsigned long long)iblock, bh_result, create);
58
59 BUG_ON(ocfs2_inode_is_fast_symlink(inode));
60
61 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
62 mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
63 (unsigned long long)iblock);
64 goto bail;
65 }
66
67 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
68 OCFS2_I(inode)->ip_blkno,
69 &bh, OCFS2_BH_CACHED, inode);
70 if (status < 0) {
71 mlog_errno(status);
72 goto bail;
73 }
74 fe = (struct ocfs2_dinode *) bh->b_data;
75
76 if (!OCFS2_IS_VALID_DINODE(fe)) {
77 mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
78		     le64_to_cpu(fe->i_blkno), 7, fe->i_signature);
79 goto bail;
80 }
81
82 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
83 le32_to_cpu(fe->i_clusters))) {
84 mlog(ML_ERROR, "block offset is outside the allocated size: "
85 "%llu\n", (unsigned long long)iblock);
86 goto bail;
87 }
88
89 /* We don't use the page cache to create symlink data, so if
90 * need be, copy it over from the buffer cache. */
91 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
92 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
93 iblock;
94 buffer_cache_bh = sb_getblk(osb->sb, blkno);
95 if (!buffer_cache_bh) {
96 mlog(ML_ERROR, "couldn't getblock for symlink!\n");
97 goto bail;
98 }
99
100 /* we haven't locked out transactions, so a commit
101 * could've happened. Since we've got a reference on
102 * the bh, even if it commits while we're doing the
103 * copy, the data is still good. */
104 if (buffer_jbd(buffer_cache_bh)
105 && ocfs2_inode_is_new(inode)) {
106 kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
107 if (!kaddr) {
108 mlog(ML_ERROR, "couldn't kmap!\n");
109 goto bail;
110 }
111 memcpy(kaddr + (bh_result->b_size * iblock),
112 buffer_cache_bh->b_data,
113 bh_result->b_size);
114 kunmap_atomic(kaddr, KM_USER0);
115 set_buffer_uptodate(bh_result);
116 }
117 brelse(buffer_cache_bh);
118 }
119
120 map_bh(bh_result, inode->i_sb,
121 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
122
123 err = 0;
124
125bail:
126 if (bh)
127 brelse(bh);
128
129 mlog_exit(err);
130 return err;
131}
132
133static int ocfs2_get_block(struct inode *inode, sector_t iblock,
134 struct buffer_head *bh_result, int create)
135{
136 int err = 0;
137 u64 p_blkno, past_eof;
138
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
140 (unsigned long long)iblock, bh_result, create);
141
142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
143 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
144 inode, inode->i_ino);
145
146 if (S_ISLNK(inode->i_mode)) {
147 /* this always does I/O for some reason. */
148 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
149 goto bail;
150 }
151
152 /* this can happen if another node truncs after our extend! */
153 spin_lock(&OCFS2_I(inode)->ip_lock);
154 if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
155 OCFS2_I(inode)->ip_clusters))
156 err = -EIO;
157 spin_unlock(&OCFS2_I(inode)->ip_lock);
158 if (err)
159 goto bail;
160
161 err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
162 NULL);
163 if (err) {
164 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
165 "%"MLFu64", NULL)\n", err, inode,
166 (unsigned long long)iblock, p_blkno);
167 goto bail;
168 }
169
170 map_bh(bh_result, inode->i_sb, p_blkno);
171
172 if (bh_result->b_blocknr == 0) {
173 err = -EIO;
174 mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
175 "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
176 p_blkno, OCFS2_I(inode)->ip_blkno);
177 }
178
179 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
180 mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);
181
182 if (create && (iblock >= past_eof))
183 set_buffer_new(bh_result);
184
185bail:
186 if (err < 0)
187 err = -EIO;
188
189 mlog_exit(err);
190 return err;
191}
192
193static int ocfs2_readpage(struct file *file, struct page *page)
194{
195 struct inode *inode = page->mapping->host;
196 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
197 int ret, unlock = 1;
198
199 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
200
201 ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
202 if (ret != 0) {
203 if (ret == AOP_TRUNCATED_PAGE)
204 unlock = 0;
205 mlog_errno(ret);
206 goto out;
207 }
208
209 down_read(&OCFS2_I(inode)->ip_alloc_sem);
210
211 /*
212	 * i_size might have just been updated as we grabbed the meta lock. We
213 * might now be discovering a truncate that hit on another node.
214 * block_read_full_page->get_block freaks out if it is asked to read
215 * beyond the end of a file, so we check here. Callers
216 * (generic_file_read, fault->nopage) are clever enough to check i_size
217 * and notice that the page they just read isn't needed.
218 *
219 * XXX sys_readahead() seems to get that wrong?
220 */
221 if (start >= i_size_read(inode)) {
222 char *addr = kmap(page);
223 memset(addr, 0, PAGE_SIZE);
224 flush_dcache_page(page);
225 kunmap(page);
226 SetPageUptodate(page);
227 ret = 0;
228 goto out_alloc;
229 }
230
231 ret = ocfs2_data_lock_with_page(inode, 0, page);
232 if (ret != 0) {
233 if (ret == AOP_TRUNCATED_PAGE)
234 unlock = 0;
235 mlog_errno(ret);
236 goto out_alloc;
237 }
238
239 ret = block_read_full_page(page, ocfs2_get_block);
240 unlock = 0;
241
242 ocfs2_data_unlock(inode, 0);
243out_alloc:
244 up_read(&OCFS2_I(inode)->ip_alloc_sem);
245 ocfs2_meta_unlock(inode, 0);
246out:
247 if (unlock)
248 unlock_page(page);
249 mlog_exit(ret);
250 return ret;
251}
252
253/* Note: Because we don't support holes, our allocation has
254 * already happened (allocation writes zeros to the file data)
255 * so we don't have to worry about ordered writes in
256 * ocfs2_writepage.
257 *
258 * ->writepage is called during the process of invalidating the page cache
259 * during blocked lock processing. It can't block on any cluster locks
260 * to during block mapping. It's relying on the fact that the block
261 * mapping can't have disappeared under the dirty pages that it is
262 * being asked to write back.
263 */
264static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
265{
266 int ret;
267
268 mlog_entry("(0x%p)\n", page);
269
270 ret = block_write_full_page(page, ocfs2_get_block, wbc);
271
272 mlog_exit(ret);
273
274 return ret;
275}
276
277/*
278 * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
279 * from loopback. It must be able to perform its own locking around
280 * ocfs2_get_block().
281 */
282int ocfs2_prepare_write(struct file *file, struct page *page,
283 unsigned from, unsigned to)
284{
285 struct inode *inode = page->mapping->host;
286 int ret;
287
288 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
289
290 ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
291 if (ret != 0) {
292 mlog_errno(ret);
293 goto out;
294 }
295
296 down_read(&OCFS2_I(inode)->ip_alloc_sem);
297
298 ret = block_prepare_write(page, from, to, ocfs2_get_block);
299
300 up_read(&OCFS2_I(inode)->ip_alloc_sem);
301
302 ocfs2_meta_unlock(inode, 0);
303out:
304 mlog_exit(ret);
305 return ret;
306}
307
308/* Taken from ext3. We don't necessarily need the full blown
309 * functionality yet, but IMHO it's better to cut and paste the whole
310 * thing so we can avoid introducing our own bugs (and easily pick up
311 * their fixes when they happen) --Mark */
312static int walk_page_buffers( handle_t *handle,
313 struct buffer_head *head,
314 unsigned from,
315 unsigned to,
316 int *partial,
317 int (*fn)( handle_t *handle,
318 struct buffer_head *bh))
319{
320 struct buffer_head *bh;
321 unsigned block_start, block_end;
322 unsigned blocksize = head->b_size;
323 int err, ret = 0;
324 struct buffer_head *next;
325
326 for ( bh = head, block_start = 0;
327 ret == 0 && (bh != head || !block_start);
328 block_start = block_end, bh = next)
329 {
330 next = bh->b_this_page;
331 block_end = block_start + blocksize;
332 if (block_end <= from || block_start >= to) {
333 if (partial && !buffer_uptodate(bh))
334 *partial = 1;
335 continue;
336 }
337 err = (*fn)(handle, bh);
338 if (!ret)
339 ret = err;
340 }
341 return ret;
342}
343
344struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
345 struct page *page,
346 unsigned from,
347 unsigned to)
348{
349 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
350 struct ocfs2_journal_handle *handle = NULL;
351 int ret = 0;
352
353 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
354 if (!handle) {
355 ret = -ENOMEM;
356 mlog_errno(ret);
357 goto out;
358 }
359
360 if (ocfs2_should_order_data(inode)) {
361 ret = walk_page_buffers(handle->k_handle,
362 page_buffers(page),
363 from, to, NULL,
364 ocfs2_journal_dirty_data);
365 if (ret < 0)
366 mlog_errno(ret);
367 }
368out:
369 if (ret) {
370 if (handle)
371 ocfs2_commit_trans(handle);
372 handle = ERR_PTR(ret);
373 }
374 return handle;
375}
376
377static int ocfs2_commit_write(struct file *file, struct page *page,
378 unsigned from, unsigned to)
379{
380 int ret, extending = 0, locklevel = 0;
381 loff_t new_i_size;
382 struct buffer_head *di_bh = NULL;
383 struct inode *inode = page->mapping->host;
384 struct ocfs2_journal_handle *handle = NULL;
385
386 mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
387
388 /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
389 * us to sample inode->i_size here without the metadata lock:
390 *
391 * 1) We're currently holding the inode alloc lock, so no
392 * nodes can change it underneath us.
393 *
394 * 2) We've had to take the metadata lock at least once
395	 * already to check for extending writes, hence ensuring
396 * that our current copy is also up to date.
397 */
398 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
399 if (new_i_size > i_size_read(inode)) {
400 extending = 1;
401 locklevel = 1;
402 }
403
404 ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
405 if (ret != 0) {
406 mlog_errno(ret);
407 goto out;
408 }
409
410 ret = ocfs2_data_lock_with_page(inode, 1, page);
411 if (ret != 0) {
412 mlog_errno(ret);
413 goto out_unlock_meta;
414 }
415
416 if (extending) {
417 handle = ocfs2_start_walk_page_trans(inode, page, from, to);
418 if (IS_ERR(handle)) {
419 ret = PTR_ERR(handle);
420 handle = NULL;
421 goto out_unlock_data;
422 }
423
424 /* Mark our buffer early. We'd rather catch this error up here
425 * as opposed to after a successful commit_write which would
426 * require us to set back inode->i_size. */
427 ret = ocfs2_journal_access(handle, inode, di_bh,
428 OCFS2_JOURNAL_ACCESS_WRITE);
429 if (ret < 0) {
430 mlog_errno(ret);
431 goto out_commit;
432 }
433 }
434
435 /* might update i_size */
436 ret = generic_commit_write(file, page, from, to);
437 if (ret < 0) {
438 mlog_errno(ret);
439 goto out_commit;
440 }
441
442 if (extending) {
443 loff_t size = (u64) i_size_read(inode);
444 struct ocfs2_dinode *di =
445 (struct ocfs2_dinode *)di_bh->b_data;
446
447 /* ocfs2_mark_inode_dirty is too heavy to use here. */
448 inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
449 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
450
451 di->i_size = cpu_to_le64(size);
452 di->i_ctime = di->i_mtime =
453 cpu_to_le64(inode->i_mtime.tv_sec);
454 di->i_ctime_nsec = di->i_mtime_nsec =
455 cpu_to_le32(inode->i_mtime.tv_nsec);
456
457 ret = ocfs2_journal_dirty(handle, di_bh);
458 if (ret < 0) {
459 mlog_errno(ret);
460 goto out_commit;
461 }
462 }
463
464 BUG_ON(extending && (i_size_read(inode) != new_i_size));
465
466out_commit:
467 if (handle)
468 ocfs2_commit_trans(handle);
469out_unlock_data:
470 ocfs2_data_unlock(inode, 1);
471out_unlock_meta:
472 ocfs2_meta_unlock(inode, locklevel);
473out:
474 if (di_bh)
475 brelse(di_bh);
476
477 mlog_exit(ret);
478 return ret;
479}
480
481static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
482{
483 sector_t status;
484 u64 p_blkno = 0;
485 int err = 0;
486 struct inode *inode = mapping->host;
487
488 mlog_entry("(block = %llu)\n", (unsigned long long)block);
489
490 /* We don't need to lock journal system files, since they aren't
491 * accessed concurrently from multiple nodes.
492 */
493 if (!INODE_JOURNAL(inode)) {
494 err = ocfs2_meta_lock(inode, NULL, NULL, 0);
495 if (err) {
496 if (err != -ENOENT)
497 mlog_errno(err);
498 goto bail;
499 }
500 down_read(&OCFS2_I(inode)->ip_alloc_sem);
501 }
502
503 err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
504 NULL);
505
506 if (!INODE_JOURNAL(inode)) {
507 up_read(&OCFS2_I(inode)->ip_alloc_sem);
508 ocfs2_meta_unlock(inode, 0);
509 }
510
511 if (err) {
512 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
513 (unsigned long long)block);
514 mlog_errno(err);
515 goto bail;
516 }
517
518
519bail:
520 status = err ? 0 : p_blkno;
521
522 mlog_exit((int)status);
523
524 return status;
525}
526
527/*
528 * TODO: Make this into a generic get_blocks function.
529 *
530 * From do_direct_io in direct-io.c:
531 * "So what we do is to permit the ->get_blocks function to populate
532 * bh.b_size with the size of IO which is permitted at this offset and
533 * this i_blkbits."
534 *
535 * This function is called directly from get_more_blocks in direct-io.c.
536 *
537 * called like this: dio->get_blocks(dio->inode, fs_startblk,
538 * fs_count, map_bh, dio->rw == WRITE);
539 */
540static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
541 unsigned long max_blocks,
542 struct buffer_head *bh_result, int create)
543{
544 int ret;
545 u64 vbo_max; /* file offset, max_blocks from iblock */
546 u64 p_blkno;
547 int contig_blocks;
548 unsigned char blocksize_bits;
549
550 if (!inode || !bh_result) {
551 mlog(ML_ERROR, "inode or bh_result is null\n");
552 return -EIO;
553 }
554
555 blocksize_bits = inode->i_sb->s_blocksize_bits;
556
557 /* This function won't even be called if the request isn't all
558 * nicely aligned and of the right size, so there's no need
559 * for us to check any of that. */
560
561 vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
562
563 spin_lock(&OCFS2_I(inode)->ip_lock);
564 if ((iblock + max_blocks) >
565 ocfs2_clusters_to_blocks(inode->i_sb,
566 OCFS2_I(inode)->ip_clusters)) {
567 spin_unlock(&OCFS2_I(inode)->ip_lock);
568 ret = -EIO;
569 goto bail;
570 }
571 spin_unlock(&OCFS2_I(inode)->ip_lock);
572
573 /* This figures out the size of the next contiguous block, and
574 * our logical offset */
575 ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
576 &contig_blocks);
577 if (ret) {
578 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
579 (unsigned long long)iblock);
580 ret = -EIO;
581 goto bail;
582 }
583
584 map_bh(bh_result, inode->i_sb, p_blkno);
585
586 /* make sure we don't map more than max_blocks blocks here as
587	 * that's all the kernel will handle at this point. */
588 if (max_blocks < contig_blocks)
589 contig_blocks = max_blocks;
590 bh_result->b_size = contig_blocks << blocksize_bits;
591bail:
592 return ret;
593}
594
595/*
596 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
597 * particularly interested in the aio/dio case. Like the core uses
598 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
599 * truncation on another.
600 */
601static void ocfs2_dio_end_io(struct kiocb *iocb,
602 loff_t offset,
603 ssize_t bytes,
604 void *private)
605{
606 struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
607
608 /* this io's submitter should not have unlocked this before we could */
609 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
610 ocfs2_iocb_clear_rw_locked(iocb);
611 up_read(&inode->i_alloc_sem);
612 ocfs2_rw_unlock(inode, 0);
613}
614
615static ssize_t ocfs2_direct_IO(int rw,
616 struct kiocb *iocb,
617 const struct iovec *iov,
618 loff_t offset,
619 unsigned long nr_segs)
620{
621 struct file *file = iocb->ki_filp;
622 struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
623 int ret;
624
625 mlog_entry_void();
626 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
627 inode->i_sb->s_bdev, iov, offset,
628 nr_segs,
629 ocfs2_direct_IO_get_blocks,
630 ocfs2_dio_end_io);
631 mlog_exit(ret);
632 return ret;
633}
634
635struct address_space_operations ocfs2_aops = {
636 .readpage = ocfs2_readpage,
637 .writepage = ocfs2_writepage,
638 .prepare_write = ocfs2_prepare_write,
639 .commit_write = ocfs2_commit_write,
640 .bmap = ocfs2_bmap,
641 .sync_page = block_sync_page,
642 .direct_IO = ocfs2_direct_IO
643};
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
new file mode 100644
index 000000000000..d40456d509a0
--- /dev/null
+++ b/fs/ocfs2/aops.h
@@ -0,0 +1,41 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef OCFS2_AOPS_H
23#define OCFS2_AOPS_H
24
25int ocfs2_prepare_write(struct file *file, struct page *page,
26 unsigned from, unsigned to);
27
28struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
29 struct page *page,
30 unsigned from,
31 unsigned to);
32
33/* all ocfs2_dio_end_io()'s fault */
34#define ocfs2_iocb_is_rw_locked(iocb) \
35 test_bit(0, (unsigned long *)&iocb->private)
36#define ocfs2_iocb_set_rw_locked(iocb) \
37 set_bit(0, (unsigned long *)&iocb->private)
38#define ocfs2_iocb_clear_rw_locked(iocb) \
39 clear_bit(0, (unsigned long *)&iocb->private)
40
41#endif /* OCFS2_AOPS_H */
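
(For illustration: a plain-C model of the ocfs2_iocb_*_rw_locked() macros
above, which treat the kiocb's opaque 'private' word as a one-bit flag so
ocfs2_dio_end_io() can tell that the submitter still holds the rw cluster
lock. The kernel macros use the atomic bitops; this sketch uses ordinary
bit arithmetic and invented helper names.)

	#include <stdio.h>

	#define RW_LOCKED_BIT 0

	static void set_rw_locked(unsigned long *priv)
	{
		*priv |= 1UL << RW_LOCKED_BIT;
	}

	static void clear_rw_locked(unsigned long *priv)
	{
		*priv &= ~(1UL << RW_LOCKED_BIT);
	}

	static int is_rw_locked(unsigned long *priv)
	{
		return !!(*priv & (1UL << RW_LOCKED_BIT));
	}

	int main(void)
	{
		unsigned long private = 0;	/* stands in for iocb->private */

		set_rw_locked(&private);
		printf("locked? %d\n", is_rw_locked(&private));	/* 1 */
		clear_rw_locked(&private);
		printf("locked? %d\n", is_rw_locked(&private));	/* 0 */
		return 0;
	}
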
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
new file mode 100644
index 000000000000..d424041b38e9
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.c
@@ -0,0 +1,232 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * io.c
5 *
6 * Buffer cache handling
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#include <cluster/masklog.h>
32
33#include "ocfs2.h"
34
35#include "alloc.h"
36#include "inode.h"
37#include "journal.h"
38#include "uptodate.h"
39
40#include "buffer_head_io.h"
41
42int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
43 struct inode *inode)
44{
45 int ret = 0;
46
47 mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
48 (unsigned long long)bh->b_blocknr, inode);
49
50 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
51 BUG_ON(buffer_jbd(bh));
52
53	/* No need to check for a soft readonly file system here.
54	 * Non-journalled writes are only ever done on system files which
55 * can get modified during recovery even if read-only. */
56 if (ocfs2_is_hard_readonly(osb)) {
57 ret = -EROFS;
58 goto out;
59 }
60
61 down(&OCFS2_I(inode)->ip_io_sem);
62
63 lock_buffer(bh);
64 set_buffer_uptodate(bh);
65
66 /* remove from dirty list before I/O. */
67 clear_buffer_dirty(bh);
68
69 get_bh(bh); /* for end_buffer_write_sync() */
70 bh->b_end_io = end_buffer_write_sync;
71 submit_bh(WRITE, bh);
72
73 wait_on_buffer(bh);
74
75 if (buffer_uptodate(bh)) {
76 ocfs2_set_buffer_uptodate(inode, bh);
77 } else {
78 /* We don't need to remove the clustered uptodate
79 * information for this bh as it's not marked locally
80 * uptodate. */
81 ret = -EIO;
82 brelse(bh);
83 }
84
85 up(&OCFS2_I(inode)->ip_io_sem);
86out:
87 mlog_exit(ret);
88 return ret;
89}
90
91int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
92 struct buffer_head *bhs[], int flags,
93 struct inode *inode)
94{
95 int status = 0;
96 struct super_block *sb;
97 int i, ignore_cache = 0;
98 struct buffer_head *bh;
99
100 mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
101 block, nr, flags, inode);
102
103 if (osb == NULL || osb->sb == NULL || bhs == NULL) {
104 status = -EINVAL;
105 mlog_errno(status);
106 goto bail;
107 }
108
109 if (nr < 0) {
110 mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
111 status = -EINVAL;
112 mlog_errno(status);
113 goto bail;
114 }
115
116 if (nr == 0) {
117 mlog(ML_BH_IO, "No buffers will be read!\n");
118 status = 0;
119 goto bail;
120 }
121
122 sb = osb->sb;
123
124 if (flags & OCFS2_BH_CACHED && !inode)
125 flags &= ~OCFS2_BH_CACHED;
126
127 if (inode)
128 down(&OCFS2_I(inode)->ip_io_sem);
129 for (i = 0 ; i < nr ; i++) {
130 if (bhs[i] == NULL) {
131 bhs[i] = sb_getblk(sb, block++);
132 if (bhs[i] == NULL) {
133 if (inode)
134 up(&OCFS2_I(inode)->ip_io_sem);
135 status = -EIO;
136 mlog_errno(status);
137 goto bail;
138 }
139 }
140 bh = bhs[i];
141 ignore_cache = 0;
142
143 if (flags & OCFS2_BH_CACHED &&
144 !ocfs2_buffer_uptodate(inode, bh)) {
145 mlog(ML_UPTODATE,
146 "bh (%llu), inode %"MLFu64" not uptodate\n",
147 (unsigned long long)bh->b_blocknr,
148 OCFS2_I(inode)->ip_blkno);
149 ignore_cache = 1;
150 }
151
152 /* XXX: Can we ever get this and *not* have the cached
153 * flag set? */
154 if (buffer_jbd(bh)) {
155 if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
156 mlog(ML_BH_IO, "trying to sync read a jbd "
157 "managed bh (blocknr = %llu)\n",
158 (unsigned long long)bh->b_blocknr);
159 continue;
160 }
161
162 if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
163 if (buffer_dirty(bh)) {
164 /* This should probably be a BUG, or
165 * at least return an error. */
166 mlog(ML_BH_IO, "asking me to sync read a dirty "
167 "buffer! (blocknr = %llu)\n",
168 (unsigned long long)bh->b_blocknr);
169 continue;
170 }
171
172 lock_buffer(bh);
173 if (buffer_jbd(bh)) {
174#ifdef CATCH_BH_JBD_RACES
175 mlog(ML_ERROR, "block %llu had the JBD bit set "
176 "while I was in lock_buffer!",
177 (unsigned long long)bh->b_blocknr);
178 BUG();
179#else
180 unlock_buffer(bh);
181 continue;
182#endif
183 }
184 clear_buffer_uptodate(bh);
185 get_bh(bh); /* for end_buffer_read_sync() */
186 bh->b_end_io = end_buffer_read_sync;
187 if (flags & OCFS2_BH_READAHEAD)
188 submit_bh(READA, bh);
189 else
190 submit_bh(READ, bh);
191 continue;
192 }
193 }
194
195 status = 0;
196
197 for (i = (nr - 1); i >= 0; i--) {
198 bh = bhs[i];
199
200 /* We know this can't have changed as we hold the
201 * inode sem. Avoid doing any work on the bh if the
202 * journal has it. */
203 if (!buffer_jbd(bh))
204 wait_on_buffer(bh);
205
206 if (!buffer_uptodate(bh)) {
207 /* Status won't be cleared from here on out,
208 * so we can safely record this and loop back
209 * to cleanup the other buffers. Don't need to
210 * remove the clustered uptodate information
211 * for this bh as it's not marked locally
212 * uptodate. */
213 status = -EIO;
214 brelse(bh);
215 bhs[i] = NULL;
216 continue;
217 }
218
219 if (inode)
220 ocfs2_set_buffer_uptodate(inode, bh);
221 }
222 if (inode)
223 up(&OCFS2_I(inode)->ip_io_sem);
224
225 mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
226 (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
227
228bail:
229
230 mlog_exit(status);
231 return status;
232}
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
new file mode 100644
index 000000000000..6ecb90937b68
--- /dev/null
+++ b/fs/ocfs2/buffer_head_io.h
@@ -0,0 +1,73 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_buffer_head.h
5 *
6 * Buffer cache handling functions defined
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_BUFFER_HEAD_IO_H
27#define OCFS2_BUFFER_HEAD_IO_H
28
29#include <linux/buffer_head.h>
30
31void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
32 int uptodate);
33
34static inline int ocfs2_read_block(struct ocfs2_super *osb,
35 u64 off,
36 struct buffer_head **bh,
37 int flags,
38 struct inode *inode);
39
40int ocfs2_write_block(struct ocfs2_super *osb,
41 struct buffer_head *bh,
42 struct inode *inode);
43int ocfs2_read_blocks(struct ocfs2_super *osb,
44 u64 block,
45 int nr,
46 struct buffer_head *bhs[],
47 int flags,
48 struct inode *inode);
49
50
51#define OCFS2_BH_CACHED 1
52#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */
53
54static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
55 struct buffer_head **bh, int flags,
56 struct inode *inode)
57{
58 int status = 0;
59
60 if (bh == NULL) {
61 printk("ocfs2: bh == NULL\n");
62 status = -EINVAL;
63 goto bail;
64 }
65
66 status = ocfs2_read_blocks(osb, off, 1, bh,
67 flags, inode);
68
69bail:
70 return status;
71}
72
73#endif /* OCFS2_BUFFER_HEAD_IO_H */
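
(A hypothetical caller, for illustration only, showing the pattern this
wrapper is used with throughout the patch: pass a NULL buffer_head so
ocfs2_read_blocks() allocates one via sb_getblk(), then brelse() it when
done. The function name example_read_one() is invented.)

	static int example_read_one(struct ocfs2_super *osb, u64 blkno,
				    struct inode *inode)
	{
		struct buffer_head *bh = NULL;
		int status;

		status = ocfs2_read_block(osb, blkno, &bh,
					  OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			return status;
		}

		/* ... use bh->b_data ... */

		brelse(bh);
		return 0;
	}
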
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
new file mode 100644
index 000000000000..cdd162f13650
--- /dev/null
+++ b/fs/ocfs2/cluster/Makefile
@@ -0,0 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o ver.o
diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h
new file mode 100644
index 000000000000..2df9082f4e35
--- /dev/null
+++ b/fs/ocfs2/cluster/endian.h
@@ -0,0 +1,30 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef OCFS2_CLUSTER_ENDIAN_H
23#define OCFS2_CLUSTER_ENDIAN_H
24
25static inline void be32_add_cpu(__be32 *var, u32 val)
26{
27 *var = cpu_to_be32(be32_to_cpu(*var) + val);
28}
29
30#endif /* OCFS2_CLUSTER_ENDIAN_H */
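
(A quick user-space check, for illustration, of be32_add_cpu() above, using
the libc byte-order helpers in place of the kernel's cpu_to_be32() and
be32_to_cpu().)

	#include <arpa/inet.h>	/* htonl()/ntohl() */
	#include <stdint.h>
	#include <stdio.h>

	static void be32_add_cpu(uint32_t *var, uint32_t val)
	{
		/* decode big-endian, add in cpu order, re-encode */
		*var = htonl(ntohl(*var) + val);
	}

	int main(void)
	{
		uint32_t on_disk = htonl(7);	/* big-endian 7 */

		be32_add_cpu(&on_disk, 3);
		printf("%u\n", ntohl(on_disk));	/* prints 10 */
		return 0;
	}
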
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
new file mode 100644
index 000000000000..7307ba528913
--- /dev/null
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -0,0 +1,1797 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/kernel.h>
23#include <linux/sched.h>
24#include <linux/jiffies.h>
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/bio.h>
28#include <linux/blkdev.h>
29#include <linux/delay.h>
30#include <linux/file.h>
31#include <linux/kthread.h>
32#include <linux/configfs.h>
33#include <linux/random.h>
34#include <linux/crc32.h>
35#include <linux/time.h>
36
37#include "heartbeat.h"
38#include "tcp.h"
39#include "nodemanager.h"
40#include "quorum.h"
41
42#include "masklog.h"
43
44
45/*
46 * The first heartbeat pass had one global thread that would serialize all hb
47 * callback calls. This global serializing sem should only be removed once
48 * we've made sure that all callees can deal with being called concurrently
49 * from multiple hb region threads.
50 */
51static DECLARE_RWSEM(o2hb_callback_sem);
52
53/*
54 * multiple hb threads are watching multiple regions. A node is live
55 * whenever any of the threads sees activity from the node in its region.
56 */
57static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
58static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
59static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
60static LIST_HEAD(o2hb_node_events);
61static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
62
63static LIST_HEAD(o2hb_all_regions);
64
65static struct o2hb_callback {
66 struct list_head list;
67} o2hb_callbacks[O2HB_NUM_CB];
68
69static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);
70
71#define O2HB_DEFAULT_BLOCK_BITS 9
72
73unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;
74
75/* Only sets a new threshold if there are no active regions.
76 *
77 * No locking or otherwise interesting code is required for reading
78 * o2hb_dead_threshold as it can't change once regions are active and
79 * it's not interesting to anyone until then anyway. */
80static void o2hb_dead_threshold_set(unsigned int threshold)
81{
82 if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
83 spin_lock(&o2hb_live_lock);
84 if (list_empty(&o2hb_all_regions))
85 o2hb_dead_threshold = threshold;
86 spin_unlock(&o2hb_live_lock);
87 }
88}
89
90struct o2hb_node_event {
91 struct list_head hn_item;
92 enum o2hb_callback_type hn_event_type;
93 struct o2nm_node *hn_node;
94 int hn_node_num;
95};
96
97struct o2hb_disk_slot {
98 struct o2hb_disk_heartbeat_block *ds_raw_block;
99 u8 ds_node_num;
100 u64 ds_last_time;
101 u64 ds_last_generation;
102 u16 ds_equal_samples;
103 u16 ds_changed_samples;
104 struct list_head ds_live_item;
105};
106
107/* each thread owns a region. When we're asked to tear down the region
108 * we ask the thread to stop, and it cleans up the region */
109struct o2hb_region {
110 struct config_item hr_item;
111
112 struct list_head hr_all_item;
113 unsigned hr_unclean_stop:1;
114
115 /* protected by the hr_callback_sem */
116 struct task_struct *hr_task;
117
118 unsigned int hr_blocks;
119 unsigned long long hr_start_block;
120
121 unsigned int hr_block_bits;
122 unsigned int hr_block_bytes;
123
124 unsigned int hr_slots_per_page;
125 unsigned int hr_num_pages;
126
127 struct page **hr_slot_data;
128 struct block_device *hr_bdev;
129 struct o2hb_disk_slot *hr_slots;
130
131	/* let the person setting up hb block until the region has
132	 * reached a 'steady' state. This will be fixed when we have
133 * a more complete api that doesn't lead to this sort of fragility. */
134 atomic_t hr_steady_iterations;
135
136 char hr_dev_name[BDEVNAME_SIZE];
137
138 unsigned int hr_timeout_ms;
139
140 /* randomized as the region goes up and down so that a node
141 * recognizes a node going up and down in one iteration */
142 u64 hr_generation;
143
144 struct work_struct hr_write_timeout_work;
145 unsigned long hr_last_timeout_start;
146
147 /* Used during o2hb_check_slot to hold a copy of the block
148 * being checked because we temporarily have to zero out the
149 * crc field. */
150 struct o2hb_disk_heartbeat_block *hr_tmp_block;
151};
152
153struct o2hb_bio_wait_ctxt {
154 atomic_t wc_num_reqs;
155 struct completion wc_io_complete;
156};
157
158static void o2hb_write_timeout(void *arg)
159{
160 struct o2hb_region *reg = arg;
161
162 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
163 "milliseconds\n", reg->hr_dev_name,
164 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
165 o2quo_disk_timeout();
166}
167
168static void o2hb_arm_write_timeout(struct o2hb_region *reg)
169{
170 mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
171
172 cancel_delayed_work(&reg->hr_write_timeout_work);
173 reg->hr_last_timeout_start = jiffies;
174 schedule_delayed_work(&reg->hr_write_timeout_work,
175 msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
176}
177
178static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
179{
180 cancel_delayed_work(&reg->hr_write_timeout_work);
181 flush_scheduled_work();
182}
183
184static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
185 unsigned int num_ios)
186{
187 atomic_set(&wc->wc_num_reqs, num_ios);
188 init_completion(&wc->wc_io_complete);
189}
190
191/* Used in error paths too */
192static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
193 unsigned int num)
194{
195 /* sadly atomic_sub_and_test() isn't available on all platforms. The
196 * good news is that the fast path only completes one at a time */
197 while(num--) {
198 if (atomic_dec_and_test(&wc->wc_num_reqs)) {
199 BUG_ON(num > 0);
200 complete(&wc->wc_io_complete);
201 }
202 }
203}
204
205static void o2hb_wait_on_io(struct o2hb_region *reg,
206 struct o2hb_bio_wait_ctxt *wc)
207{
208 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
209
210 blk_run_address_space(mapping);
211
212 wait_for_completion(&wc->wc_io_complete);
213}
214
215static int o2hb_bio_end_io(struct bio *bio,
216 unsigned int bytes_done,
217 int error)
218{
219 struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
220
221 if (error)
222 mlog(ML_ERROR, "IO Error %d\n", error);
223
224 if (bio->bi_size)
225 return 1;
226
227 o2hb_bio_wait_dec(wc, 1);
228 return 0;
229}
230
231/* Set up a bio to cover I/O against num_slots slots starting at
232 * start_slot. */
233static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
234 struct o2hb_bio_wait_ctxt *wc,
235 unsigned int start_slot,
236 unsigned int num_slots)
237{
238 int i, nr_vecs, len, first_page, last_page;
239 unsigned int vec_len, vec_start;
240 unsigned int bits = reg->hr_block_bits;
241 unsigned int spp = reg->hr_slots_per_page;
242 struct bio *bio;
243 struct page *page;
244
245 nr_vecs = (num_slots + spp - 1) / spp;
246
247 /* Testing has shown this allocation to take long enough under
248 * GFP_KERNEL that the local node can get fenced. It would be
249 * best if we could pre-allocate these bios and avoid this
250 * altogether. */
251 bio = bio_alloc(GFP_ATOMIC, nr_vecs);
252 if (!bio) {
253 mlog(ML_ERROR, "Could not alloc slots BIO!\n");
254 bio = ERR_PTR(-ENOMEM);
255 goto bail;
256 }
257
258 /* Must put everything in 512 byte sectors for the bio... */
259 bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9);
260 bio->bi_bdev = reg->hr_bdev;
261 bio->bi_private = wc;
262 bio->bi_end_io = o2hb_bio_end_io;
263
264 first_page = start_slot / spp;
265 last_page = first_page + nr_vecs;
266 vec_start = (start_slot << bits) % PAGE_CACHE_SIZE;
267 for(i = first_page; i < last_page; i++) {
268 page = reg->hr_slot_data[i];
269
270 vec_len = PAGE_CACHE_SIZE;
271 /* last page might be short */
272 if (((i + 1) * spp) > (start_slot + num_slots))
273 vec_len = ((num_slots + start_slot) % spp) << bits;
274 vec_len -= vec_start;
275
276 mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
277 i, vec_len, vec_start);
278
279 len = bio_add_page(bio, page, vec_len, vec_start);
280 if (len != vec_len) {
281 bio_put(bio);
282 bio = ERR_PTR(-EIO);
283
284 mlog(ML_ERROR, "Error adding page to bio i = %d, "
285 "vec_len = %u, len = %d\n, start = %u\n",
286 i, vec_len, len, vec_start);
287 goto bail;
288 }
289
290 vec_start = 0;
291 }
292
293bail:
294 return bio;
295}
296
297/*
298 * Compute the maximum number of sectors the bdev can handle in one bio,
299 * as a power of two.
300 *
301 * Stolen from oracleasm, thanks Joel!
302 */
303static int compute_max_sectors(struct block_device *bdev)
304{
305 int max_pages, max_sectors, pow_two_sectors;
306
307 struct request_queue *q;
308
309 q = bdev_get_queue(bdev);
310 max_pages = q->max_sectors >> (PAGE_SHIFT - 9);
311 if (max_pages > BIO_MAX_PAGES)
312 max_pages = BIO_MAX_PAGES;
313 if (max_pages > q->max_phys_segments)
314 max_pages = q->max_phys_segments;
315 if (max_pages > q->max_hw_segments)
316 max_pages = q->max_hw_segments;
317 max_pages--; /* Handle I/Os that straddle a page */
318
319 max_sectors = max_pages << (PAGE_SHIFT - 9);
320
321 /* Why is fls() 1-based???? */
322 pow_two_sectors = 1 << (fls(max_sectors) - 1);
323
324 return pow_two_sectors;
325}
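/* A worked example under assumed queue limits (the numbers are
 * illustrative, not from this patch): with 4k pages, PAGE_SHIFT - 9 = 3.
 * If q->max_sectors = 255 then max_pages = 255 >> 3 = 31; assuming the
 * BIO and segment limits don't clamp further, max_pages-- leaves 30,
 * max_sectors = 30 << 3 = 240, and since fls(240) = 8 (bit 7 is the
 * highest set bit), pow_two_sectors = 1 << 7 = 128. */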
326
327static inline void o2hb_compute_request_limits(struct o2hb_region *reg,
328 unsigned int num_slots,
329 unsigned int *num_bios,
330 unsigned int *slots_per_bio)
331{
332 unsigned int max_sectors, io_sectors;
333
334 max_sectors = compute_max_sectors(reg->hr_bdev);
335
336 io_sectors = num_slots << (reg->hr_block_bits - 9);
337
338 *num_bios = (io_sectors + max_sectors - 1) / max_sectors;
339 *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9);
340
341 mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This "
342 "device can handle %u sectors of I/O\n", io_sectors, num_slots,
343 max_sectors);
344 mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n",
345 *num_bios, *slots_per_bio);
346}
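/* Continuing the illustrative numbers above: for 255 slots of 512-byte
 * blocks, hr_block_bits - 9 = 0, so io_sectors = 255. With
 * max_sectors = 128 this gives *num_bios = (255 + 127) / 128 = 2 and
 * *slots_per_bio = 128, leaving the second bio to carry the remaining
 * 127 slots (o2hb_read_slots() below shortens the last bio). */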
347
348static int o2hb_read_slots(struct o2hb_region *reg,
349 unsigned int max_slots)
350{
351 unsigned int num_bios, slots_per_bio, start_slot, num_slots;
352 int i, status;
353 struct o2hb_bio_wait_ctxt wc;
354 struct bio **bios;
355 struct bio *bio;
356
357 o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio);
358
359 bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL);
360 if (!bios) {
361 status = -ENOMEM;
362 mlog_errno(status);
363 return status;
364 }
365
366 o2hb_bio_wait_init(&wc, num_bios);
367
368 num_slots = slots_per_bio;
369 for(i = 0; i < num_bios; i++) {
370 start_slot = i * slots_per_bio;
371
372 /* adjust num_slots at last bio */
373 if (max_slots < (start_slot + num_slots))
374 num_slots = max_slots - start_slot;
375
376 bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots);
377 if (IS_ERR(bio)) {
378 o2hb_bio_wait_dec(&wc, num_bios - i);
379
380 status = PTR_ERR(bio);
381 mlog_errno(status);
382 goto bail_and_wait;
383 }
384 bios[i] = bio;
385
386 submit_bio(READ, bio);
387 }
388
389 status = 0;
390
391bail_and_wait:
392 o2hb_wait_on_io(reg, &wc);
393
394 if (bios) {
395 for(i = 0; i < num_bios; i++)
396 if (bios[i])
397 bio_put(bios[i]);
398 kfree(bios);
399 }
400
401 return status;
402}
403
404static int o2hb_issue_node_write(struct o2hb_region *reg,
405 struct bio **write_bio,
406 struct o2hb_bio_wait_ctxt *write_wc)
407{
408 int status;
409 unsigned int slot;
410 struct bio *bio;
411
412 o2hb_bio_wait_init(write_wc, 1);
413
414 slot = o2nm_this_node();
415
416 bio = o2hb_setup_one_bio(reg, write_wc, slot, 1);
417 if (IS_ERR(bio)) {
418 status = PTR_ERR(bio);
419 mlog_errno(status);
420 goto bail;
421 }
422
423 submit_bio(WRITE, bio);
424
425 *write_bio = bio;
426 status = 0;
427bail:
428 return status;
429}
430
431static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
432 struct o2hb_disk_heartbeat_block *hb_block)
433{
434 __le32 old_cksum;
435 u32 ret;
436
437 /* We want to compute the block crc with a 0 value in the
438 * hb_cksum field. Save it off here and replace after the
439 * crc. */
440 old_cksum = hb_block->hb_cksum;
441 hb_block->hb_cksum = 0;
442
443 ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);
444
445 hb_block->hb_cksum = old_cksum;
446
447 return ret;
448}
449
450static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
451{
452 mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, "
453 "cksum = 0x%x, generation 0x%"MLFx64"\n",
454 le64_to_cpu(hb_block->hb_seq), hb_block->hb_node,
455 le32_to_cpu(hb_block->hb_cksum),
456 le64_to_cpu(hb_block->hb_generation));
457}
458
459static int o2hb_verify_crc(struct o2hb_region *reg,
460 struct o2hb_disk_heartbeat_block *hb_block)
461{
462 u32 read, computed;
463
464 read = le32_to_cpu(hb_block->hb_cksum);
465 computed = o2hb_compute_block_crc_le(reg, hb_block);
466
467 return read == computed;
468}
469
470/* We want to make sure that nobody is heartbeating on top of us --
471 * this will help detect an invalid configuration. */
472static int o2hb_check_last_timestamp(struct o2hb_region *reg)
473{
474 int node_num, ret;
475 struct o2hb_disk_slot *slot;
476 struct o2hb_disk_heartbeat_block *hb_block;
477
478 node_num = o2nm_this_node();
479
480 ret = 1;
481 slot = &reg->hr_slots[node_num];
482 /* Don't check on our 1st timestamp */
483 if (slot->ds_last_time) {
484 hb_block = slot->ds_raw_block;
485
486 if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
487 ret = 0;
488 }
489
490 return ret;
491}
492
493static inline void o2hb_prepare_block(struct o2hb_region *reg,
494 u64 generation)
495{
496 int node_num;
497 u64 cputime;
498 struct o2hb_disk_slot *slot;
499 struct o2hb_disk_heartbeat_block *hb_block;
500
501 node_num = o2nm_this_node();
502 slot = &reg->hr_slots[node_num];
503
504 hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
505 memset(hb_block, 0, reg->hr_block_bytes);
506 /* TODO: time stuff */
507 cputime = CURRENT_TIME.tv_sec;
508 if (!cputime)
509 cputime = 1;
510
511 hb_block->hb_seq = cpu_to_le64(cputime);
512 hb_block->hb_node = node_num;
513 hb_block->hb_generation = cpu_to_le64(generation);
514
515 /* This step must always happen last! */
516 hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
517 hb_block));
518
519 mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n",
520 cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum));
521}
522
523static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
524 struct o2nm_node *node,
525 int idx)
526{
527 struct list_head *iter;
528 struct o2hb_callback_func *f;
529
530 list_for_each(iter, &hbcall->list) {
531 f = list_entry(iter, struct o2hb_callback_func, hc_item);
532 mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
533 (f->hc_func)(node, idx, f->hc_data);
534 }
535}
536
537/* Will run the list in order until we process the passed event */
538static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
539{
540 int empty;
541 struct o2hb_callback *hbcall;
542 struct o2hb_node_event *event;
543
544 spin_lock(&o2hb_live_lock);
545 empty = list_empty(&queued_event->hn_item);
546 spin_unlock(&o2hb_live_lock);
547 if (empty)
548 return;
549
550 /* Holding the callback sem ensures we don't alter the callback
551 * lists when doing this, and serializes ourselves with other
552 * processes wanting callbacks. */
553 down_write(&o2hb_callback_sem);
554
555 spin_lock(&o2hb_live_lock);
556 while (!list_empty(&o2hb_node_events)
557 && !list_empty(&queued_event->hn_item)) {
558 event = list_entry(o2hb_node_events.next,
559 struct o2hb_node_event,
560 hn_item);
561 list_del_init(&event->hn_item);
562 spin_unlock(&o2hb_live_lock);
563
564 mlog(ML_HEARTBEAT, "Node %s event for %d\n",
565 event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
566 event->hn_node_num);
567
568 hbcall = hbcall_from_type(event->hn_event_type);
569
570 /* We should *never* have gotten on to the list with a
571 * bad type... This isn't something that we should try
572 * to recover from. */
573 BUG_ON(IS_ERR(hbcall));
574
575 o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);
576
577 spin_lock(&o2hb_live_lock);
578 }
579 spin_unlock(&o2hb_live_lock);
580
581 up_write(&o2hb_callback_sem);
582}
583
584static void o2hb_queue_node_event(struct o2hb_node_event *event,
585 enum o2hb_callback_type type,
586 struct o2nm_node *node,
587 int node_num)
588{
589 assert_spin_locked(&o2hb_live_lock);
590
591 event->hn_event_type = type;
592 event->hn_node = node;
593 event->hn_node_num = node_num;
594
595 mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
596 type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);
597
598 list_add_tail(&event->hn_item, &o2hb_node_events);
599}
600
601static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
602{
603 struct o2hb_node_event event =
604 { .hn_item = LIST_HEAD_INIT(event.hn_item), };
605 struct o2nm_node *node;
606
607 node = o2nm_get_node_by_num(slot->ds_node_num);
608 if (!node)
609 return;
610
611 spin_lock(&o2hb_live_lock);
612 if (!list_empty(&slot->ds_live_item)) {
613 mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
614 slot->ds_node_num);
615
616 list_del_init(&slot->ds_live_item);
617
618 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
619 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
620
621 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
622 slot->ds_node_num);
623 }
624 }
625 spin_unlock(&o2hb_live_lock);
626
627 o2hb_run_event_list(&event);
628
629 o2nm_node_put(node);
630}
631
632static int o2hb_check_slot(struct o2hb_region *reg,
633 struct o2hb_disk_slot *slot)
634{
635 int changed = 0, gen_changed = 0;
636 struct o2hb_node_event event =
637 { .hn_item = LIST_HEAD_INIT(event.hn_item), };
638 struct o2nm_node *node;
639 struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
640 u64 cputime;
641
642 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
643
644 /* Is this correct? Do we assume that the node doesn't exist
645 * if we're not configured for it? */
646 node = o2nm_get_node_by_num(slot->ds_node_num);
647 if (!node)
648 return 0;
649
650 if (!o2hb_verify_crc(reg, hb_block)) {
651 /* all paths from here will drop o2hb_live_lock for
652 * us. */
653 spin_lock(&o2hb_live_lock);
654
655 /* Don't print an error on the console in this case -
656 * a freshly formatted heartbeat area will not have a
657 * crc set on it. */
658 if (list_empty(&slot->ds_live_item))
659 goto out;
660
661 /* The node is live but pushed out a bad crc. We
662 * consider it a transient miss but don't populate any
663 * other values as they may be junk. */
664 mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
665 slot->ds_node_num, reg->hr_dev_name);
666 o2hb_dump_slot(hb_block);
667
668 slot->ds_equal_samples++;
669 goto fire_callbacks;
670 }
671
672 /* we don't care if these wrap.. the state transitions below
673 * reset the counters at the right places */
674 cputime = le64_to_cpu(hb_block->hb_seq);
675 if (slot->ds_last_time != cputime)
676 slot->ds_changed_samples++;
677 else
678 slot->ds_equal_samples++;
679 slot->ds_last_time = cputime;
680
681 /* The node changed heartbeat generations. We assume this to
682 * mean it dropped off but came back before we timed out. We
683 * want to consider it down for the time being but don't want
684 * to lose any changed_samples state we might build up to
685 * considering it live again. */
686 if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
687 gen_changed = 1;
688 slot->ds_equal_samples = 0;
689 mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" "
690 "to 0x%"MLFx64")\n", slot->ds_node_num,
691 slot->ds_last_generation,
692 le64_to_cpu(hb_block->hb_generation));
693 }
694
695 slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
696
697 mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x "
698 "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n",
699 slot->ds_node_num, slot->ds_last_generation,
700 le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq),
701 slot->ds_last_time, slot->ds_changed_samples,
702 slot->ds_equal_samples);
703
704 spin_lock(&o2hb_live_lock);
705
706fire_callbacks:
707 /* dead nodes only come back to life after enough changed
708 * samples, seen at any point during their dead time */
709 if (list_empty(&slot->ds_live_item) &&
710 slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
711 mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my "
712 "region\n", slot->ds_node_num, slot->ds_last_generation);
713
714 /* first on the list generates a callback */
715 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
716 set_bit(slot->ds_node_num, o2hb_live_node_bitmap);
717
718 o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
719 slot->ds_node_num);
720
721 changed = 1;
722 }
723
724 list_add_tail(&slot->ds_live_item,
725 &o2hb_live_slots[slot->ds_node_num]);
726
727 slot->ds_equal_samples = 0;
728 goto out;
729 }
730
731 /* if the slot is still dead, we're done.. */
732 if (list_empty(&slot->ds_live_item))
733 goto out;
734
735 /* live nodes only go dead after enough consecutive missed
736 * samples.. reset the missed counter whenever we see
737 * activity */
738 if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
739 mlog(ML_HEARTBEAT, "Node %d left my region\n",
740 slot->ds_node_num);
741
742 /* last off the live_slot generates a callback */
743 list_del_init(&slot->ds_live_item);
744 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
745 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
746
747 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
748 slot->ds_node_num);
749
750 changed = 1;
751 }
752
753 /* We don't clear this because the node is still
754 * actually writing new blocks. */
755 if (!gen_changed)
756 slot->ds_changed_samples = 0;
757 goto out;
758 }
759 if (slot->ds_changed_samples) {
760 slot->ds_changed_samples = 0;
761 slot->ds_equal_samples = 0;
762 }
763out:
764 spin_unlock(&o2hb_live_lock);
765
766 o2hb_run_event_list(&event);
767
768 o2nm_node_put(node);
769 return changed;
770}
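/* A sketch of the state machine implied above: a dead slot needs
 * O2HB_LIVE_THRESHOLD (2) changed samples to go live; a live slot needs
 * o2hb_dead_threshold equal samples (or a generation change) to go
 * dead; and while live, any observed change resets both counters. A bad
 * crc from a live node counts as one more equal sample. */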
771
772/* This could be faster if we just implemented a find_last_bit, but I
773 * don't think the circumstances warrant it. */
774static int o2hb_highest_node(unsigned long *nodes,
775 int numbits)
776{
777 int highest, node;
778
779 highest = numbits;
780 node = -1;
781 while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
782 if (node >= numbits)
783 break;
784
785 highest = node;
786 }
787
788 return highest;
789}
790
791static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
792{
793 int i, ret, highest_node, change = 0;
794 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
795 struct bio *write_bio;
796 struct o2hb_bio_wait_ctxt write_wc;
797
798 if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
799 return;
800
801 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
802 if (highest_node >= O2NM_MAX_NODES) {
803 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
804 return;
805 }
806
807 /* No sense in reading the slots of nodes that don't exist
808 * yet. Of course, if the node definitions have holes in them
809 * then we're reading an empty slot anyway... Consider this
810 * best-effort. */
811 ret = o2hb_read_slots(reg, highest_node + 1);
812 if (ret < 0) {
813 mlog_errno(ret);
814 return;
815 }
816
817 /* With an up to date view of the slots, we can check that no
818 * other node has been improperly configured to heartbeat in
819 * our slot. */
820 if (!o2hb_check_last_timestamp(reg))
821 mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
822 "in our slot!\n", reg->hr_dev_name);
823
824 /* fill in the proper info for our next heartbeat */
825 o2hb_prepare_block(reg, reg->hr_generation);
826
827 /* And fire off the write. Note that we don't wait on this I/O
828 * until later. */
829 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
830 if (ret < 0) {
831 mlog_errno(ret);
832 return;
833 }
834
835 i = -1;
836 while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
837
838 change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
839 }
840
841 /*
842 * We have to be sure we've advertised ourselves on disk
843 * before we can go to steady state. This ensures that
844 * people we find in our steady state have seen us.
845 */
846 o2hb_wait_on_io(reg, &write_wc);
847 bio_put(write_bio);
848 o2hb_arm_write_timeout(reg);
849
850 /* let the person who launched us know when things are steady */
851 if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
852 if (atomic_dec_and_test(&reg->hr_steady_iterations))
853 wake_up(&o2hb_steady_queue);
854 }
855}
856
857/* Subtract b from a, storing the result in a. If b is later than a,
858 * the result is zeroed. */
859static void o2hb_tv_subtract(struct timeval *a,
860 struct timeval *b)
861{
862 /* just return 0 when b is after a */
863 if (a->tv_sec < b->tv_sec ||
864 (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
865 a->tv_sec = 0;
866 a->tv_usec = 0;
867 return;
868 }
869
870 a->tv_sec -= b->tv_sec;
871 a->tv_usec -= b->tv_usec;
872 while ( a->tv_usec < 0 ) {
873 a->tv_sec--;
874 a->tv_usec += 1000000;
875 }
876}
877
878static unsigned int o2hb_elapsed_msecs(struct timeval *start,
879 struct timeval *end)
880{
881 struct timeval res = *end;
882
883 o2hb_tv_subtract(&res, start);
884
885 return res.tv_sec * 1000 + res.tv_usec / 1000;
886}
887
888/*
889 * we ride the region ref that the region dir holds. before the region
890 * dir is removed and drops its ref it will wait to tear down this
891 * thread.
892 */
893static int o2hb_thread(void *data)
894{
895 int i, ret;
896 struct o2hb_region *reg = data;
897 struct bio *write_bio;
898 struct o2hb_bio_wait_ctxt write_wc;
899 struct timeval before_hb, after_hb;
900 unsigned int elapsed_msec;
901
902 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
903
904 set_user_nice(current, -20);
905
906 while (!kthread_should_stop() && !reg->hr_unclean_stop) {
907 /* We track the time spent inside
908 * o2hb_do_disk_heartbeat so that we avoid more than
909 * hr_timeout_ms between disk writes. On busy systems
910 * this should result in a heartbeat which is less
911 * likely to time itself out. */
912 do_gettimeofday(&before_hb);
913
914 o2hb_do_disk_heartbeat(reg);
915
916 do_gettimeofday(&after_hb);
917 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
918
919 mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
920 before_hb.tv_sec, before_hb.tv_usec,
921 after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
922
923 if (elapsed_msec < reg->hr_timeout_ms) {
924 /* the kthread api has blocked signals for us so no
925 * need to record the return value. */
926 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
927 }
928 }
929
930 o2hb_disarm_write_timeout(reg);
931
932 /* unclean stop is only used in very bad situations */
933 for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
934 o2hb_shutdown_slot(&reg->hr_slots[i]);
935
936 /* Explicit down notification - avoid forcing the other nodes
937 * to time out on this region when we could just as easily
938 * write a clear generation - thus indicating to them that
939 * this node has left this region.
940 *
941 * XXX: Should we skip this on unclean_stop? */
942 o2hb_prepare_block(reg, 0);
943 ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
944 if (ret == 0) {
945 o2hb_wait_on_io(reg, &write_wc);
946 bio_put(write_bio);
947 } else {
948 mlog_errno(ret);
949 }
950
951 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
952
953 return 0;
954}
955
956void o2hb_init(void)
957{
958 int i;
959
960 for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
961 INIT_LIST_HEAD(&o2hb_callbacks[i].list);
962
963 for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
964 INIT_LIST_HEAD(&o2hb_live_slots[i]);
965
966 INIT_LIST_HEAD(&o2hb_node_events);
967
968 memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
969}
970
971/* if we're already in a callback then we're already serialized by the sem */
972static void o2hb_fill_node_map_from_callback(unsigned long *map,
973 unsigned bytes)
974{
975 BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
976
977 memcpy(map, &o2hb_live_node_bitmap, bytes);
978}
979
980/*
981 * get a map of all nodes that are heartbeating in any regions
982 */
983void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
984{
985 /* callers want to serialize this map and callbacks so that they
986 * can trust that they don't miss nodes coming to the party */
987 down_read(&o2hb_callback_sem);
988 spin_lock(&o2hb_live_lock);
989 o2hb_fill_node_map_from_callback(map, bytes);
990 spin_unlock(&o2hb_live_lock);
991 up_read(&o2hb_callback_sem);
992}
993EXPORT_SYMBOL_GPL(o2hb_fill_node_map);
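/* A minimal usage sketch (not part of this patch; node_num here is
 * illustrative) -- callers snapshot the map onto their own storage and
 * then test bits at leisure:
 *
 *	unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 *
 *	o2hb_fill_node_map(live_map, sizeof(live_map));
 *	if (test_bit(node_num, live_map))
 *		...the node is heartbeating in some region...
 *
 * o2hb_check_node_heartbeating() below wraps exactly this pattern. */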
994
995/*
996 * heartbeat configfs bits. The heartbeat set is a default set under
997 * the cluster set in nodemanager.c.
998 */
999
1000static struct o2hb_region *to_o2hb_region(struct config_item *item)
1001{
1002 return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
1003}
1004
1005/* drop_item only drops its ref after killing the thread, so nothing should
1006 * be using the region anymore. this has to clean up any state that
1007 * attributes might have built up. */
1008static void o2hb_region_release(struct config_item *item)
1009{
1010 int i;
1011 struct page *page;
1012 struct o2hb_region *reg = to_o2hb_region(item);
1013
1014 if (reg->hr_tmp_block)
1015 kfree(reg->hr_tmp_block);
1016
1017 if (reg->hr_slot_data) {
1018 for (i = 0; i < reg->hr_num_pages; i++) {
1019 page = reg->hr_slot_data[i];
1020 if (page)
1021 __free_page(page);
1022 }
1023 kfree(reg->hr_slot_data);
1024 }
1025
1026 if (reg->hr_bdev)
1027 blkdev_put(reg->hr_bdev);
1028
1029 if (reg->hr_slots)
1030 kfree(reg->hr_slots);
1031
1032 spin_lock(&o2hb_live_lock);
1033 list_del(&reg->hr_all_item);
1034 spin_unlock(&o2hb_live_lock);
1035
1036 kfree(reg);
1037}
1038
1039static int o2hb_read_block_input(struct o2hb_region *reg,
1040 const char *page,
1041 size_t count,
1042 unsigned long *ret_bytes,
1043 unsigned int *ret_bits)
1044{
1045 unsigned long bytes;
1046 char *p = (char *)page;
1047
1048 bytes = simple_strtoul(p, &p, 0);
1049 if (!p || (*p && (*p != '\n')))
1050 return -EINVAL;
1051
1052 /* Heartbeat and fs min / max block sizes are the same. */
1053 if (bytes > 4096 || bytes < 512)
1054 return -ERANGE;
1055 if (hweight16(bytes) != 1)
1056 return -EINVAL;
1057
1058 if (ret_bytes)
1059 *ret_bytes = bytes;
1060 if (ret_bits)
1061 *ret_bits = ffs(bytes) - 1;
1062
1063 return 0;
1064}
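/* For example, writing "512" here yields *ret_bytes = 512 and
 * *ret_bits = ffs(512) - 1 = 9; "4096" yields 4096 and 12. Values
 * outside [512, 4096] return -ERANGE, and values with more than one
 * bit set (hweight16() != 1, i.e. not a power of two) return -EINVAL. */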
1065
1066static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg,
1067 char *page)
1068{
1069 return sprintf(page, "%u\n", reg->hr_block_bytes);
1070}
1071
1072static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg,
1073 const char *page,
1074 size_t count)
1075{
1076 int status;
1077 unsigned long block_bytes;
1078 unsigned int block_bits;
1079
1080 if (reg->hr_bdev)
1081 return -EINVAL;
1082
1083 status = o2hb_read_block_input(reg, page, count,
1084 &block_bytes, &block_bits);
1085 if (status)
1086 return status;
1087
1088 reg->hr_block_bytes = (unsigned int)block_bytes;
1089 reg->hr_block_bits = block_bits;
1090
1091 return count;
1092}
1093
1094static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg,
1095 char *page)
1096{
1097 return sprintf(page, "%llu\n", reg->hr_start_block);
1098}
1099
1100static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg,
1101 const char *page,
1102 size_t count)
1103{
1104 unsigned long long tmp;
1105 char *p = (char *)page;
1106
1107 if (reg->hr_bdev)
1108 return -EINVAL;
1109
1110 tmp = simple_strtoull(p, &p, 0);
1111 if (!p || (*p && (*p != '\n')))
1112 return -EINVAL;
1113
1114 reg->hr_start_block = tmp;
1115
1116 return count;
1117}
1118
1119static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg,
1120 char *page)
1121{
1122 return sprintf(page, "%d\n", reg->hr_blocks);
1123}
1124
1125static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg,
1126 const char *page,
1127 size_t count)
1128{
1129 unsigned long tmp;
1130 char *p = (char *)page;
1131
1132 if (reg->hr_bdev)
1133 return -EINVAL;
1134
1135 tmp = simple_strtoul(p, &p, 0);
1136 if (!p || (*p && (*p != '\n')))
1137 return -EINVAL;
1138
1139 if (tmp > O2NM_MAX_NODES || tmp == 0)
1140 return -ERANGE;
1141
1142 reg->hr_blocks = (unsigned int)tmp;
1143
1144 return count;
1145}
1146
1147static ssize_t o2hb_region_dev_read(struct o2hb_region *reg,
1148 char *page)
1149{
1150 unsigned int ret = 0;
1151
1152 if (reg->hr_bdev)
1153 ret = sprintf(page, "%s\n", reg->hr_dev_name);
1154
1155 return ret;
1156}
1157
1158static void o2hb_init_region_params(struct o2hb_region *reg)
1159{
1160 reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
1161 reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
1162
1163 mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
1164 reg->hr_start_block, reg->hr_blocks);
1165 mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n",
1166 reg->hr_block_bytes, reg->hr_block_bits);
1167 mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms);
1168 mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold);
1169}
1170
1171static int o2hb_map_slot_data(struct o2hb_region *reg)
1172{
1173 int i, j;
1174 unsigned int last_slot;
1175 unsigned int spp = reg->hr_slots_per_page;
1176 struct page *page;
1177 char *raw;
1178 struct o2hb_disk_slot *slot;
1179
1180 reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
1181 if (reg->hr_tmp_block == NULL) {
1182 mlog_errno(-ENOMEM);
1183 return -ENOMEM;
1184 }
1185
1186 reg->hr_slots = kcalloc(reg->hr_blocks,
1187 sizeof(struct o2hb_disk_slot), GFP_KERNEL);
1188 if (reg->hr_slots == NULL) {
1189 mlog_errno(-ENOMEM);
1190 return -ENOMEM;
1191 }
1192
1193 for(i = 0; i < reg->hr_blocks; i++) {
1194 slot = &reg->hr_slots[i];
1195 slot->ds_node_num = i;
1196 INIT_LIST_HEAD(&slot->ds_live_item);
1197 slot->ds_raw_block = NULL;
1198 }
1199
1200 reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp;
1201 mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks "
1202 "at %u blocks per page\n",
1203 reg->hr_num_pages, reg->hr_blocks, spp);
1204
1205 reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
1206 GFP_KERNEL);
1207 if (!reg->hr_slot_data) {
1208 mlog_errno(-ENOMEM);
1209 return -ENOMEM;
1210 }
1211
1212 for(i = 0; i < reg->hr_num_pages; i++) {
1213 page = alloc_page(GFP_KERNEL);
1214 if (!page) {
1215 mlog_errno(-ENOMEM);
1216 return -ENOMEM;
1217 }
1218
1219 reg->hr_slot_data[i] = page;
1220
1221 last_slot = i * spp;
1222 raw = page_address(page);
1223 for (j = 0;
1224 (j < spp) && ((j + last_slot) < reg->hr_blocks);
1225 j++) {
1226 BUG_ON((j + last_slot) >= reg->hr_blocks);
1227
1228 slot = &reg->hr_slots[j + last_slot];
1229 slot->ds_raw_block =
1230 (struct o2hb_disk_heartbeat_block *) raw;
1231
1232 raw += reg->hr_block_bytes;
1233 }
1234 }
1235
1236 return 0;
1237}
1238
1239/* Read in all the slots available and populate the tracking
1240 * structures so that we can start with a baseline idea of what's
1241 * there. */
1242static int o2hb_populate_slot_data(struct o2hb_region *reg)
1243{
1244 int ret, i;
1245 struct o2hb_disk_slot *slot;
1246 struct o2hb_disk_heartbeat_block *hb_block;
1247
1248 mlog_entry_void();
1249
1250 ret = o2hb_read_slots(reg, reg->hr_blocks);
1251 if (ret) {
1252 mlog_errno(ret);
1253 goto out;
1254 }
1255
1256 /* We only want to get an idea of the values initially in each
1257 * slot, so we do no verification - o2hb_check_slot will
1258 * actually determine if each configured slot is valid and
1259 * whether any values have changed. */
1260 for(i = 0; i < reg->hr_blocks; i++) {
1261 slot = &reg->hr_slots[i];
1262 hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block;
1263
1264 /* Only fill the values that o2hb_check_slot uses to
1265 * determine changing slots */
1266 slot->ds_last_time = le64_to_cpu(hb_block->hb_seq);
1267 slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);
1268 }
1269
1270out:
1271 mlog_exit(ret);
1272 return ret;
1273}
1274
1275/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */
1276static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1277 const char *page,
1278 size_t count)
1279{
1280 long fd;
1281 int sectsize;
1282 char *p = (char *)page;
1283 struct file *filp = NULL;
1284 struct inode *inode = NULL;
1285 ssize_t ret = -EINVAL;
1286
1287 if (reg->hr_bdev)
1288 goto out;
1289
1290 /* We can't heartbeat until our node number has been
1291 * configured. */
1292 if (o2nm_this_node() == O2NM_MAX_NODES)
1293 goto out;
1294
1295 fd = simple_strtol(p, &p, 0);
1296 if (!p || (*p && (*p != '\n')))
1297 goto out;
1298
1299 if (fd < 0 || fd >= INT_MAX)
1300 goto out;
1301
1302 filp = fget(fd);
1303 if (filp == NULL)
1304 goto out;
1305
1306 if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
1307 reg->hr_block_bytes == 0)
1308 goto out;
1309
1310 inode = igrab(filp->f_mapping->host);
1311 if (inode == NULL)
1312 goto out;
1313
1314 if (!S_ISBLK(inode->i_mode))
1315 goto out;
1316
1317 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1318 ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0);
1319 if (ret) {
1320 reg->hr_bdev = NULL;
1321 goto out;
1322 }
1323 inode = NULL;
1324
1325 bdevname(reg->hr_bdev, reg->hr_dev_name);
1326
1327 sectsize = bdev_hardsect_size(reg->hr_bdev);
1328 if (sectsize != reg->hr_block_bytes) {
1329 mlog(ML_ERROR,
1330 "blocksize %u incorrect for device, expected %d",
1331 reg->hr_block_bytes, sectsize);
1332 ret = -EINVAL;
1333 goto out;
1334 }
1335
1336 o2hb_init_region_params(reg);
1337
1338 /* Generation of zero is invalid */
1339 do {
1340 get_random_bytes(&reg->hr_generation,
1341 sizeof(reg->hr_generation));
1342 } while (reg->hr_generation == 0);
1343
1344 ret = o2hb_map_slot_data(reg);
1345 if (ret) {
1346 mlog_errno(ret);
1347 goto out;
1348 }
1349
1350 ret = o2hb_populate_slot_data(reg);
1351 if (ret) {
1352 mlog_errno(ret);
1353 goto out;
1354 }
1355
1356 INIT_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout, reg);
1357
1358 /*
1359 * A node is considered live after it has beat LIVE_THRESHOLD
1360 * times. We're not steady until we've given them a chance
1361 * _after_ our first read.
1362 */
1363 atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1);
1364
1365 reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1366 reg->hr_item.ci_name);
1367 if (IS_ERR(reg->hr_task)) {
1368 ret = PTR_ERR(reg->hr_task);
1369 mlog_errno(ret);
1370 reg->hr_task = NULL;
1371 goto out;
1372 }
1373
1374 ret = wait_event_interruptible(o2hb_steady_queue,
1375 atomic_read(&reg->hr_steady_iterations) == 0);
1376 if (ret) {
1377 kthread_stop(reg->hr_task);
1378 reg->hr_task = NULL;
1379 goto out;
1380 }
1381
1382 ret = count;
1383out:
1384 if (filp)
1385 fput(filp);
1386 if (inode)
1387 iput(inode);
1388 if (ret < 0) {
1389 if (reg->hr_bdev) {
1390 blkdev_put(reg->hr_bdev);
1391 reg->hr_bdev = NULL;
1392 }
1393 }
1394 return ret;
1395}
1396
1397struct o2hb_region_attribute {
1398 struct configfs_attribute attr;
1399 ssize_t (*show)(struct o2hb_region *, char *);
1400 ssize_t (*store)(struct o2hb_region *, const char *, size_t);
1401};
1402
1403static struct o2hb_region_attribute o2hb_region_attr_block_bytes = {
1404 .attr = { .ca_owner = THIS_MODULE,
1405 .ca_name = "block_bytes",
1406 .ca_mode = S_IRUGO | S_IWUSR },
1407 .show = o2hb_region_block_bytes_read,
1408 .store = o2hb_region_block_bytes_write,
1409};
1410
1411static struct o2hb_region_attribute o2hb_region_attr_start_block = {
1412 .attr = { .ca_owner = THIS_MODULE,
1413 .ca_name = "start_block",
1414 .ca_mode = S_IRUGO | S_IWUSR },
1415 .show = o2hb_region_start_block_read,
1416 .store = o2hb_region_start_block_write,
1417};
1418
1419static struct o2hb_region_attribute o2hb_region_attr_blocks = {
1420 .attr = { .ca_owner = THIS_MODULE,
1421 .ca_name = "blocks",
1422 .ca_mode = S_IRUGO | S_IWUSR },
1423 .show = o2hb_region_blocks_read,
1424 .store = o2hb_region_blocks_write,
1425};
1426
1427static struct o2hb_region_attribute o2hb_region_attr_dev = {
1428 .attr = { .ca_owner = THIS_MODULE,
1429 .ca_name = "dev",
1430 .ca_mode = S_IRUGO | S_IWUSR },
1431 .show = o2hb_region_dev_read,
1432 .store = o2hb_region_dev_write,
1433};
1434
1435static struct configfs_attribute *o2hb_region_attrs[] = {
1436 &o2hb_region_attr_block_bytes.attr,
1437 &o2hb_region_attr_start_block.attr,
1438 &o2hb_region_attr_blocks.attr,
1439 &o2hb_region_attr_dev.attr,
1440 NULL,
1441};
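/* A hypothetical userspace sketch of driving these attributes (the real
 * tooling lives elsewhere; write_attr(), region_dir and the device path
 * are assumed names for illustration). mkdir of the region directory --
 * handled by o2hb_heartbeat_group_make_item() below -- creates the
 * region; "dev" must be written last, since o2hb_region_dev_write()
 * above rejects regions with unset geometry, and it expects the number
 * of a file descriptor open in the *writing* process, not a path:
 *
 *	int fd = open("/dev/sdX", O_RDWR);
 *	char buf[16];
 *
 *	write_attr(region_dir, "block_bytes", "512");
 *	write_attr(region_dir, "start_block", "1024");
 *	write_attr(region_dir, "blocks", "255");
 *	snprintf(buf, sizeof(buf), "%d", fd);
 *	write_attr(region_dir, "dev", buf);	-- starts the hb thread
 *
 * where write_attr() opens the named attribute file under the region's
 * configfs directory and writes the string. */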
1442
1443static ssize_t o2hb_region_show(struct config_item *item,
1444 struct configfs_attribute *attr,
1445 char *page)
1446{
1447 struct o2hb_region *reg = to_o2hb_region(item);
1448 struct o2hb_region_attribute *o2hb_region_attr =
1449 container_of(attr, struct o2hb_region_attribute, attr);
1450 ssize_t ret = 0;
1451
1452 if (o2hb_region_attr->show)
1453 ret = o2hb_region_attr->show(reg, page);
1454 return ret;
1455}
1456
1457static ssize_t o2hb_region_store(struct config_item *item,
1458 struct configfs_attribute *attr,
1459 const char *page, size_t count)
1460{
1461 struct o2hb_region *reg = to_o2hb_region(item);
1462 struct o2hb_region_attribute *o2hb_region_attr =
1463 container_of(attr, struct o2hb_region_attribute, attr);
1464 ssize_t ret = -EINVAL;
1465
1466 if (o2hb_region_attr->store)
1467 ret = o2hb_region_attr->store(reg, page, count);
1468 return ret;
1469}
1470
1471static struct configfs_item_operations o2hb_region_item_ops = {
1472 .release = o2hb_region_release,
1473 .show_attribute = o2hb_region_show,
1474 .store_attribute = o2hb_region_store,
1475};
1476
1477static struct config_item_type o2hb_region_type = {
1478 .ct_item_ops = &o2hb_region_item_ops,
1479 .ct_attrs = o2hb_region_attrs,
1480 .ct_owner = THIS_MODULE,
1481};
1482
1483/* heartbeat set */
1484
1485struct o2hb_heartbeat_group {
1486 struct config_group hs_group;
1487 /* some stuff? */
1488};
1489
1490static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group)
1491{
1492 return group ?
1493 container_of(group, struct o2hb_heartbeat_group, hs_group)
1494 : NULL;
1495}
1496
1497static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
1498 const char *name)
1499{
1500 struct o2hb_region *reg = NULL;
1501 struct config_item *ret = NULL;
1502
1503 reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL);
1504 if (reg == NULL)
1505 goto out; /* ENOMEM */
1506
1507 config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
1508
1509 ret = &reg->hr_item;
1510
1511 spin_lock(&o2hb_live_lock);
1512 list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
1513 spin_unlock(&o2hb_live_lock);
1514out:
1515 if (ret == NULL)
1516 kfree(reg);
1517
1518 return ret;
1519}
1520
1521static void o2hb_heartbeat_group_drop_item(struct config_group *group,
1522 struct config_item *item)
1523{
1524 struct o2hb_region *reg = to_o2hb_region(item);
1525
1526 /* stop the thread when the user removes the region dir */
1527 if (reg->hr_task) {
1528 kthread_stop(reg->hr_task);
1529 reg->hr_task = NULL;
1530 }
1531
1532 config_item_put(item);
1533}
1534
1535struct o2hb_heartbeat_group_attribute {
1536 struct configfs_attribute attr;
1537 ssize_t (*show)(struct o2hb_heartbeat_group *, char *);
1538 ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t);
1539};
1540
1541static ssize_t o2hb_heartbeat_group_show(struct config_item *item,
1542 struct configfs_attribute *attr,
1543 char *page)
1544{
1545 struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
1546 struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
1547 container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
1548 ssize_t ret = 0;
1549
1550 if (o2hb_heartbeat_group_attr->show)
1551 ret = o2hb_heartbeat_group_attr->show(reg, page);
1552 return ret;
1553}
1554
1555static ssize_t o2hb_heartbeat_group_store(struct config_item *item,
1556 struct configfs_attribute *attr,
1557 const char *page, size_t count)
1558{
1559 struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item));
1560 struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr =
1561 container_of(attr, struct o2hb_heartbeat_group_attribute, attr);
1562 ssize_t ret = -EINVAL;
1563
1564 if (o2hb_heartbeat_group_attr->store)
1565 ret = o2hb_heartbeat_group_attr->store(reg, page, count);
1566 return ret;
1567}
1568
1569static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group,
1570 char *page)
1571{
1572 return sprintf(page, "%u\n", o2hb_dead_threshold);
1573}
1574
1575static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group,
1576 const char *page,
1577 size_t count)
1578{
1579 unsigned long tmp;
1580 char *p = (char *)page;
1581
1582 tmp = simple_strtoul(p, &p, 10);
1583 if (!p || (*p && (*p != '\n')))
1584 return -EINVAL;
1585
1586 /* this will validate ranges for us. */
1587 o2hb_dead_threshold_set((unsigned int) tmp);
1588
1589 return count;
1590}
1591
1592static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = {
1593 .attr = { .ca_owner = THIS_MODULE,
1594 .ca_name = "dead_threshold",
1595 .ca_mode = S_IRUGO | S_IWUSR },
1596 .show = o2hb_heartbeat_group_threshold_show,
1597 .store = o2hb_heartbeat_group_threshold_store,
1598};
1599
1600static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
1601 &o2hb_heartbeat_group_attr_threshold.attr,
1602 NULL,
1603};
1604
1605static struct configfs_item_operations o2hb_heartbeat_group_item_ops = {
1606 .show_attribute = o2hb_heartbeat_group_show,
1607 .store_attribute = o2hb_heartbeat_group_store,
1608};
1609
1610static struct configfs_group_operations o2hb_heartbeat_group_group_ops = {
1611 .make_item = o2hb_heartbeat_group_make_item,
1612 .drop_item = o2hb_heartbeat_group_drop_item,
1613};
1614
1615static struct config_item_type o2hb_heartbeat_group_type = {
1616 .ct_group_ops = &o2hb_heartbeat_group_group_ops,
1617 .ct_item_ops = &o2hb_heartbeat_group_item_ops,
1618 .ct_attrs = o2hb_heartbeat_group_attrs,
1619 .ct_owner = THIS_MODULE,
1620};
1621
1622/* this is just here to avoid touching group in heartbeat.h which the
1623 * entire damn world #includes */
1624struct config_group *o2hb_alloc_hb_set(void)
1625{
1626 struct o2hb_heartbeat_group *hs = NULL;
1627 struct config_group *ret = NULL;
1628
1629 hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL);
1630 if (hs == NULL)
1631 goto out;
1632
1633 config_group_init_type_name(&hs->hs_group, "heartbeat",
1634 &o2hb_heartbeat_group_type);
1635
1636 ret = &hs->hs_group;
1637out:
1638 if (ret == NULL)
1639 kfree(hs);
1640 return ret;
1641}
1642
1643void o2hb_free_hb_set(struct config_group *group)
1644{
1645 struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group);
1646 kfree(hs);
1647}
1648
1649/* hb callback registration and issuing */
1650
1651static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
1652{
1653 if (type == O2HB_NUM_CB)
1654 return ERR_PTR(-EINVAL);
1655
1656 return &o2hb_callbacks[type];
1657}
1658
1659void o2hb_setup_callback(struct o2hb_callback_func *hc,
1660 enum o2hb_callback_type type,
1661 o2hb_cb_func *func,
1662 void *data,
1663 int priority)
1664{
1665 INIT_LIST_HEAD(&hc->hc_item);
1666 hc->hc_func = func;
1667 hc->hc_data = data;
1668 hc->hc_priority = priority;
1669 hc->hc_type = type;
1670 hc->hc_magic = O2HB_CB_MAGIC;
1671}
1672EXPORT_SYMBOL_GPL(o2hb_setup_callback);
1673
1674int o2hb_register_callback(struct o2hb_callback_func *hc)
1675{
1676 struct o2hb_callback_func *tmp;
1677 struct list_head *iter;
1678 struct o2hb_callback *hbcall;
1679 int ret;
1680
1681 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1682 BUG_ON(!list_empty(&hc->hc_item));
1683
1684 hbcall = hbcall_from_type(hc->hc_type);
1685 if (IS_ERR(hbcall)) {
1686 ret = PTR_ERR(hbcall);
1687 goto out;
1688 }
1689
1690 down_write(&o2hb_callback_sem);
1691
1692 list_for_each(iter, &hbcall->list) {
1693 tmp = list_entry(iter, struct o2hb_callback_func, hc_item);
1694 if (hc->hc_priority < tmp->hc_priority) {
1695 list_add_tail(&hc->hc_item, iter);
1696 break;
1697 }
1698 }
1699 if (list_empty(&hc->hc_item))
1700 list_add_tail(&hc->hc_item, &hbcall->list);
1701
1702 up_write(&o2hb_callback_sem);
1703 ret = 0;
1704out:
1705 mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n",
1706 ret, __builtin_return_address(0), hc);
1707 return ret;
1708}
1709EXPORT_SYMBOL_GPL(o2hb_register_callback);
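/* A minimal registration sketch under assumed names (my_node_down() and
 * my_data are illustrative, not part of this patch). Lower hc_priority
 * values sort earlier in the list above and so are called first:
 *
 *	static void my_node_down(struct o2nm_node *node, int node_num,
 *				 void *data)
 *	{
 *		...react to the node going away...
 *	}
 *
 *	static struct o2hb_callback_func my_hc;
 *
 *	o2hb_setup_callback(&my_hc, O2HB_NODE_DOWN_CB, my_node_down,
 *			    my_data, 0);
 *	if (o2hb_register_callback(&my_hc))
 *		...bail...
 *
 * o2hb_unregister_callback(&my_hc) undoes this and, once the callback
 * has been set up, is safe even if registration never happened, since
 * hc_item starts out empty. */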
1710
1711int o2hb_unregister_callback(struct o2hb_callback_func *hc)
1712{
1713 BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
1714
1715 mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
1716 __builtin_return_address(0), hc);
1717
1718 if (list_empty(&hc->hc_item))
1719 return 0;
1720
1721 down_write(&o2hb_callback_sem);
1722
1723 list_del_init(&hc->hc_item);
1724
1725 up_write(&o2hb_callback_sem);
1726
1727 return 0;
1728}
1729EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
1730
1731int o2hb_check_node_heartbeating(u8 node_num)
1732{
1733 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1734
1735 o2hb_fill_node_map(testing_map, sizeof(testing_map));
1736 if (!test_bit(node_num, testing_map)) {
1737 mlog(ML_HEARTBEAT,
1738 "node (%u) does not have heartbeating enabled.\n",
1739 node_num);
1740 return 0;
1741 }
1742
1743 return 1;
1744}
1745EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
1746
1747int o2hb_check_node_heartbeating_from_callback(u8 node_num)
1748{
1749 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1750
1751 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
1752 if (!test_bit(node_num, testing_map)) {
1753 mlog(ML_HEARTBEAT,
1754 "node (%u) does not have heartbeating enabled.\n",
1755 node_num);
1756 return 0;
1757 }
1758
1759 return 1;
1760}
1761EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
1762
1763/* Makes sure our local node is configured with a node number, and is
1764 * heartbeating. */
1765int o2hb_check_local_node_heartbeating(void)
1766{
1767 u8 node_num;
1768
1769 /* if this node was set then we have networking */
1770 node_num = o2nm_this_node();
1771 if (node_num == O2NM_MAX_NODES) {
1772 mlog(ML_HEARTBEAT, "this node has not been configured.\n");
1773 return 0;
1774 }
1775
1776 return o2hb_check_node_heartbeating(node_num);
1777}
1778EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
1779
1780/*
1781 * this is just a hack until we get the plumbing which flips file systems
1782 * read-only and drops the hb ref instead of killing the node dead.
1783 */
1784void o2hb_stop_all_regions(void)
1785{
1786 struct o2hb_region *reg;
1787
1788 mlog(ML_ERROR, "stopping heartbeat on all active regions.\n");
1789
1790 spin_lock(&o2hb_live_lock);
1791
1792 list_for_each_entry(reg, &o2hb_all_regions, hr_all_item)
1793 reg->hr_unclean_stop = 1;
1794
1795 spin_unlock(&o2hb_live_lock);
1796}
1797EXPORT_SYMBOL_GPL(o2hb_stop_all_regions);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
new file mode 100644
index 000000000000..cac6223206a9
--- /dev/null
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_HEARTBEAT_H
28#define O2CLUSTER_HEARTBEAT_H
29
30#include "ocfs2_heartbeat.h"
31
32#define O2HB_REGION_TIMEOUT_MS 2000
33
34/* number of changes to be seen as live */
35#define O2HB_LIVE_THRESHOLD 2
36/* number of equal samples to be seen as dead */
37extern unsigned int o2hb_dead_threshold;
38#define O2HB_DEFAULT_DEAD_THRESHOLD 7
39/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
40#define O2HB_MIN_DEAD_THRESHOLD 2
41#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))
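/* e.g. with the default dead threshold of 7 this works out to
 * 2000 * (7 - 1) = 12000ms of stalled writes before the local write
 * timeout fires -- one heartbeat interval before other nodes would have
 * seen o2hb_dead_threshold equal samples and counted this node dead. */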
42
43#define O2HB_CB_MAGIC 0x51d1e4ec
44
45/* callback stuff */
46enum o2hb_callback_type {
47 O2HB_NODE_DOWN_CB = 0,
48 O2HB_NODE_UP_CB,
49 O2HB_NUM_CB
50};
51
52struct o2nm_node;
53typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
54
55struct o2hb_callback_func {
56 u32 hc_magic;
57 struct list_head hc_item;
58 o2hb_cb_func *hc_func;
59 void *hc_data;
60 int hc_priority;
61 enum o2hb_callback_type hc_type;
62};
63
64struct config_group *o2hb_alloc_hb_set(void);
65void o2hb_free_hb_set(struct config_group *group);
66
67void o2hb_setup_callback(struct o2hb_callback_func *hc,
68 enum o2hb_callback_type type,
69 o2hb_cb_func *func,
70 void *data,
71 int priority);
72int o2hb_register_callback(struct o2hb_callback_func *hc);
73int o2hb_unregister_callback(struct o2hb_callback_func *hc);
74void o2hb_fill_node_map(unsigned long *map,
75 unsigned bytes);
76void o2hb_init(void);
77int o2hb_check_node_heartbeating(u8 node_num);
78int o2hb_check_node_heartbeating_from_callback(u8 node_num);
79int o2hb_check_local_node_heartbeating(void);
80void o2hb_stop_all_regions(void);
81
82#endif /* O2CLUSTER_HEARTBEAT_H */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
new file mode 100644
index 000000000000..fd741cea5705
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.c
@@ -0,0 +1,166 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#include <linux/module.h>
23#include <linux/kernel.h>
24#include <linux/proc_fs.h>
25#include <linux/seq_file.h>
26#include <linux/string.h>
27#include <asm/uaccess.h>
28
29#include "masklog.h"
30
31struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
32EXPORT_SYMBOL_GPL(mlog_and_bits);
33struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
34EXPORT_SYMBOL_GPL(mlog_not_bits);
35
36static ssize_t mlog_mask_show(u64 mask, char *buf)
37{
38 char *state;
39
40 if (__mlog_test_u64(mask, mlog_and_bits))
41 state = "allow";
42 else if (__mlog_test_u64(mask, mlog_not_bits))
43 state = "deny";
44 else
45 state = "off";
46
47 return snprintf(buf, PAGE_SIZE, "%s\n", state);
48}
49
50static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
51{
52 if (!strnicmp(buf, "allow", 5)) {
53 __mlog_set_u64(mask, mlog_and_bits);
54 __mlog_clear_u64(mask, mlog_not_bits);
55 } else if (!strnicmp(buf, "deny", 4)) {
56 __mlog_set_u64(mask, mlog_not_bits);
57 __mlog_clear_u64(mask, mlog_and_bits);
58 } else if (!strnicmp(buf, "off", 3)) {
59 __mlog_clear_u64(mask, mlog_not_bits);
60 __mlog_clear_u64(mask, mlog_and_bits);
61 } else
62 return -EINVAL;
63
64 return count;
65}
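/* An illustrative store, assuming the kset below lands the "logmask"
 * directory under the o2cb subsystem in sysfs (the exact path depends
 * on how o2cb_subsys is registered):
 *
 *	# echo allow > /sys/o2cb/logmask/HEARTBEAT
 *
 * sets the ML_HEARTBEAT bit in mlog_and_bits and clears it in
 * mlog_not_bits; "deny" and "off" adjust the bits as handled above. */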
66
67struct mlog_attribute {
68 struct attribute attr;
69 u64 mask;
70};
71
72#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
73
74#define define_mask(_name) { \
75 .attr = { \
76 .name = #_name, \
77 .mode = S_IRUGO | S_IWUSR, \
78 }, \
79 .mask = ML_##_name, \
80}
81
82static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
83 define_mask(ENTRY),
84 define_mask(EXIT),
85 define_mask(TCP),
86 define_mask(MSG),
87 define_mask(SOCKET),
88 define_mask(HEARTBEAT),
89 define_mask(HB_BIO),
90 define_mask(DLMFS),
91 define_mask(DLM),
92 define_mask(DLM_DOMAIN),
93 define_mask(DLM_THREAD),
94 define_mask(DLM_MASTER),
95 define_mask(DLM_RECOVERY),
96 define_mask(AIO),
97 define_mask(JOURNAL),
98 define_mask(DISK_ALLOC),
99 define_mask(SUPER),
100 define_mask(FILE_IO),
101 define_mask(EXTENT_MAP),
102 define_mask(DLM_GLUE),
103 define_mask(BH_IO),
104 define_mask(UPTODATE),
105 define_mask(NAMEI),
106 define_mask(INODE),
107 define_mask(VOTE),
108 define_mask(DCACHE),
109 define_mask(CONN),
110 define_mask(QUORUM),
111 define_mask(EXPORT),
112 define_mask(ERROR),
113 define_mask(NOTICE),
114 define_mask(KTHREAD),
115};
116
117static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
118
119static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
120 char *buf)
121{
122 struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
123
124 return mlog_mask_show(mlog_attr->mask, buf);
125}
126
127static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
128 const char *buf, size_t count)
129{
130 struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
131
132 return mlog_mask_store(mlog_attr->mask, buf, count);
133}
134
135static struct sysfs_ops mlog_attr_ops = {
136 .show = mlog_show,
137 .store = mlog_store,
138};
139
140static struct kobj_type mlog_ktype = {
141 .default_attrs = mlog_attr_ptrs,
142 .sysfs_ops = &mlog_attr_ops,
143};
144
145static struct kset mlog_kset = {
146 .kobj = {.name = "logmask", .ktype = &mlog_ktype},
147};
148
149int mlog_sys_init(struct subsystem *o2cb_subsys)
150{
151 int i = 0;
152
153 while (mlog_attrs[i].attr.mode) {
154 mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
155 i++;
156 }
157 mlog_attr_ptrs[i] = NULL;
158
159 mlog_kset.subsys = o2cb_subsys;
160 return kset_register(&mlog_kset);
161}
162
163void mlog_sys_shutdown(void)
164{
165 kset_unregister(&mlog_kset);
166}
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
new file mode 100644
index 000000000000..f5ef5ea61a05
--- /dev/null
+++ b/fs/ocfs2/cluster/masklog.h
@@ -0,0 +1,275 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef O2CLUSTER_MASKLOG_H
23#define O2CLUSTER_MASKLOG_H
24
25/*
26 * For now this is a trivial wrapper around printk() that gives the critical
27 * ability to enable sets of debugging output at run-time. In the future this
28 * will almost certainly be redirected to relayfs so that it can pay a
29 * substantially lower Heisenberg tax.
30 *
31 * Callers associate the message with a bitmask and a global bitmask is
32 * maintained with help from /proc. If any of the bits match the message is
33 * output.
34 *
35 * We must have efficient bit tests on i386 and it seems gcc still emits crazy
36 * code for the 64bit compare. It emits very good code for the dual unsigned
37 * long tests, though, completely avoiding tests that can never pass if the
38 * caller gives a constant bitmask that fills one of the longs with all 0s. So
39 * the desire is to have almost all of the calls decided on by comparing just
40 * one of the longs. This leads to having infrequently given bits that are
41 * frequently matched in the high bits.
42 *
43 * _ERROR and _NOTICE are used for messages that always go to the console and
44 * have appropriate KERN_ prefixes. We wrap these in our function instead of
45 * just calling printk() so that this can eventually make its way through
46 * relayfs along with the debugging messages. Everything else gets KERN_DEBUG.
47 * The inline tests and macro dance give GCC the opportunity to quite cleverly
48 * only emit the appropriate printk() when the caller passes in a constant
49 * mask, as is almost always the case.
50 *
51 * All this bitmask nonsense is hidden from the /proc interface so that Joel
52 * doesn't have an aneurysm. Reading the file gives a straightforward
53 * indication of which bits are on or off:
54 * ENTRY off
55 * EXIT off
56 * TCP off
57 * MSG off
58 * SOCKET off
59 * ERROR off
60 * NOTICE on
61 *
62 * Writing changes the state of a given bit and requires a strictly formatted
63 * single write() call:
64 *
65 * write(fd, "ENTRY on", 8);
66 *
67 * would turn the entry bit on. "1" is also accepted in the place of "on", and
68 * "off" and "0" behave as expected.
69 *
70 * Some trivial shell can flip all the bits on or off:
71 *
72 * log_mask="/proc/fs/ocfs2_nodemanager/log_mask"
73 * cat $log_mask | (
74 * while read bit status; do
75 * # $1 is "on" or "off", say
76 * echo "$bit $1" > $log_mask
77 * done
78 * )
79 */
80
81/* for task_struct */
82#include <linux/sched.h>
83
84/* bits that are frequently given and infrequently matched in the low word */
85/* NOTE: If you add a flag, you need to also update mlog.c! */
86#define ML_ENTRY 0x0000000000000001ULL /* func call entry */
87#define ML_EXIT 0x0000000000000002ULL /* func call exit */
88#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */
89#define ML_MSG 0x0000000000000008ULL /* net network messages */
90#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */
91#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */
92#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */
93#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */
94#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */
95#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */
96#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */
97#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */
98#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm recovery functions */
99#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */
100#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */
101#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */
102#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */
103#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */
104#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */
105#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */
106#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */
107#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */
108#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */
109#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */
110#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */
111#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */
112#define ML_CONN 0x0000000004000000ULL /* net connection management */
113#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
114#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
115/* bits that are infrequently given and frequently matched in the high word */
116#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
117#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
118#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */
119
120#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
121#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
122#ifndef MLOG_MASK_PREFIX
123#define MLOG_MASK_PREFIX 0
124#endif
125
126#define MLOG_MAX_BITS 64
127
128struct mlog_bits {
129 unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
130};
131
132extern struct mlog_bits mlog_and_bits, mlog_not_bits;
133
134#if BITS_PER_LONG == 32
135
136#define __mlog_test_u64(mask, bits) \
137 ( (u32)(mask & 0xffffffff) & bits.words[0] || \
138 ((u64)(mask) >> 32) & bits.words[1] )
139#define __mlog_set_u64(mask, bits) do { \
140 bits.words[0] |= (u32)(mask & 0xffffffff); \
141 bits.words[1] |= (u64)(mask) >> 32; \
142} while (0)
143#define __mlog_clear_u64(mask, bits) do { \
144 bits.words[0] &= ~((u32)(mask & 0xffffffff)); \
145 bits.words[1] &= ~((u64)(mask) >> 32); \
146} while (0)
147#define MLOG_BITS_RHS(mask) { \
148 { \
149 [0] = (u32)(mask & 0xffffffff), \
150 [1] = (u64)(mask) >> 32, \
151 } \
152}
153
154#else /* 32bit long above, 64bit long below */
155
156#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0])
157#define __mlog_set_u64(mask, bits) do { \
158 bits.words[0] |= (mask); \
159} while (0)
160#define __mlog_clear_u64(mask, bits) do { \
161 bits.words[0] &= ~(mask); \
162} while (0)
163#define MLOG_BITS_RHS(mask) { { (mask) } }
164
165#endif
166
167/*
168 * smp_processor_id() "helpfully" screams when called outside preemptible
169 * regions in current kernels. SLES doesn't have the variants that don't
170 * scream. Just do this instead of trying to guess which we're building
171 * against.. *sigh*.
172 */
173#define __mlog_cpu_guess ({ \
174 unsigned long _cpu = get_cpu(); \
175 put_cpu(); \
176 _cpu; \
177})
178
179/* In the following two macros, the whitespace after the ',' just
180 * before ##args is intentional. Otherwise, gcc 2.95 will eat the
181 * previous token if args expands to nothing.
182 */
183#define __mlog_printk(level, fmt, args...) \
184 printk(level "(%u,%lu):%s:%d " fmt, current->pid, \
185 __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \
186 ##args)
187
188#define mlog(mask, fmt, args...) do { \
189 u64 __m = MLOG_MASK_PREFIX | (mask); \
190 if (__mlog_test_u64(__m, mlog_and_bits) && \
191 !__mlog_test_u64(__m, mlog_not_bits)) { \
192 if (__m & ML_ERROR) \
193 __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \
194 else if (__m & ML_NOTICE) \
195 __mlog_printk(KERN_NOTICE, fmt , ##args); \
196 else __mlog_printk(KERN_INFO, fmt , ##args); \
197 } \
198} while (0)
199
200#define mlog_errno(st) do { \
201 int _st = (st); \
202 if (_st != -ERESTARTSYS && _st != -EINTR && \
203 _st != AOP_TRUNCATED_PAGE) \
204 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
205} while (0)
206
207#define mlog_entry(fmt, args...) do { \
208 mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \
209} while (0)
210
211#define mlog_entry_void() do { \
212 mlog(ML_ENTRY, "ENTRY:\n"); \
213} while (0)
214
215/* We disable this for old compilers since they don't have support for
216 * __builtin_types_compatible_p.
217 */
218#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \
219 !defined(__CHECKER__)
220#define mlog_exit(st) do { \
221 if (__builtin_types_compatible_p(typeof(st), unsigned long)) \
222 mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \
223 else if (__builtin_types_compatible_p(typeof(st), signed long)) \
224 mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \
225 else if (__builtin_types_compatible_p(typeof(st), unsigned int) \
226 || __builtin_types_compatible_p(typeof(st), unsigned short) \
227 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
228 mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \
229 else if (__builtin_types_compatible_p(typeof(st), signed int) \
230 || __builtin_types_compatible_p(typeof(st), signed short) \
231 || __builtin_types_compatible_p(typeof(st), signed char)) \
232 mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \
233 else if (__builtin_types_compatible_p(typeof(st), long long)) \
234 mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
235 else \
236 mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \
237} while (0)
238#else
239#define mlog_exit(st) do { \
240 mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
241} while (0)
242#endif
243
244#define mlog_exit_ptr(ptr) do { \
245 mlog(ML_EXIT, "EXIT: %p\n", ptr); \
246} while (0)
247
248#define mlog_exit_void() do { \
249 mlog(ML_EXIT, "EXIT\n"); \
250} while (0)
251
252#define mlog_bug_on_msg(cond, fmt, args...) do { \
253 if (cond) { \
254 mlog(ML_ERROR, "bug expression: " #cond "\n"); \
255 mlog(ML_ERROR, fmt, ##args); \
256 BUG(); \
257 } \
258} while (0)
259
260#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
261#define MLFi64 "lld"
262#define MLFu64 "llu"
263#define MLFx64 "llx"
264#else
265#define MLFi64 "ld"
266#define MLFu64 "lu"
267#define MLFx64 "lx"
268#endif
269
270#include <linux/kobject.h>
271#include <linux/sysfs.h>
272int mlog_sys_init(struct subsystem *o2cb_subsys);
273void mlog_sys_shutdown(void);
274
275#endif /* O2CLUSTER_MASKLOG_H */
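
A minimal usage sketch (the caller below is hypothetical, not part of the
patch): a file defines MLOG_MASK_PREFIX before including masklog.h, exactly
as quorum.c and tcp.c later in this patch do, so plain mlog(0, ...) calls
carry that subsystem's bit, while ML_ERROR output reaches the console by
default via MLOG_INITIAL_AND_MASK:

	#define MLOG_MASK_PREFIX ML_TCP
	#include "masklog.h"

	static int example_send(int err)
	{
		mlog_entry_void();		/* gated by the ML_ENTRY bit */
		mlog(0, "sending\n");		/* gated by the ML_TCP prefix bit */
		if (err)
			mlog_errno(err);	/* ML_ERROR, printed by default */
		mlog_exit(err);			/* gated by the ML_EXIT bit */
		return err;
	}
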
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
new file mode 100644
index 000000000000..5fd60c105913
--- /dev/null
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -0,0 +1,791 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2004, 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include <linux/kernel.h>
23#include <linux/module.h>
24#include <linux/sysctl.h>
25#include <linux/configfs.h>
26
27#include "endian.h"
28#include "tcp.h"
29#include "nodemanager.h"
30#include "heartbeat.h"
31#include "masklog.h"
32#include "sys.h"
33#include "ver.h"
34
35/* for now we operate under the assumption that there can be only one
36 * cluster active at a time. Changing this will require trickling
37 * cluster references throughout where nodes are looked up */
38static struct o2nm_cluster *o2nm_single_cluster = NULL;
39
40#define OCFS2_MAX_HB_CTL_PATH 256
41static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
42
43static ctl_table ocfs2_nm_table[] = {
44 {
45 .ctl_name = 1,
46 .procname = "hb_ctl_path",
47 .data = ocfs2_hb_ctl_path,
48 .maxlen = OCFS2_MAX_HB_CTL_PATH,
49 .mode = 0644,
50 .proc_handler = &proc_dostring,
51 .strategy = &sysctl_string,
52 },
53 { .ctl_name = 0 }
54};
55
56static ctl_table ocfs2_mod_table[] = {
57 {
58 .ctl_name = KERN_OCFS2_NM,
59 .procname = "nm",
60 .data = NULL,
61 .maxlen = 0,
62 .mode = 0555,
63 .child = ocfs2_nm_table
64 },
65 { .ctl_name = 0}
66};
67
68static ctl_table ocfs2_kern_table[] = {
69 {
70 .ctl_name = KERN_OCFS2,
71 .procname = "ocfs2",
72 .data = NULL,
73 .maxlen = 0,
74 .mode = 0555,
75 .child = ocfs2_mod_table
76 },
77 { .ctl_name = 0}
78};
79
80static ctl_table ocfs2_root_table[] = {
81 {
82 .ctl_name = CTL_FS,
83 .procname = "fs",
84 .data = NULL,
85 .maxlen = 0,
86 .mode = 0555,
87 .child = ocfs2_kern_table
88 },
89 { .ctl_name = 0 }
90};
91
92static struct ctl_table_header *ocfs2_table_header = NULL;
93
94const char *o2nm_get_hb_ctl_path(void)
95{
96 return ocfs2_hb_ctl_path;
97}
98EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
99
100struct o2nm_cluster {
101 struct config_group cl_group;
102 unsigned cl_has_local:1;
103 u8 cl_local_node;
104 rwlock_t cl_nodes_lock;
105 struct o2nm_node *cl_nodes[O2NM_MAX_NODES];
106 struct rb_root cl_node_ip_tree;
107 /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
108 unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
109};
110
111struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
112{
113 struct o2nm_node *node = NULL;
114
115 if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
116 goto out;
117
118 read_lock(&o2nm_single_cluster->cl_nodes_lock);
119 node = o2nm_single_cluster->cl_nodes[node_num];
120 if (node)
121 config_item_get(&node->nd_item);
122 read_unlock(&o2nm_single_cluster->cl_nodes_lock);
123out:
124 return node;
125}
126EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
127
128int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
129{
130 struct o2nm_cluster *cluster = o2nm_single_cluster;
131
132 BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
133
134 if (cluster == NULL)
135 return -EINVAL;
136
137 read_lock(&cluster->cl_nodes_lock);
138 memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
139 read_unlock(&cluster->cl_nodes_lock);
140
141 return 0;
142}
143EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
144
145static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
146 __be32 ip_needle,
147 struct rb_node ***ret_p,
148 struct rb_node **ret_parent)
149{
150 struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
151 struct rb_node *parent = NULL;
152 struct o2nm_node *node, *ret = NULL;
153
154 while (*p) {
155 parent = *p;
156 node = rb_entry(parent, struct o2nm_node, nd_ip_node);
157
158 if (memcmp(&ip_needle, &node->nd_ipv4_address,
159 sizeof(ip_needle)) < 0)
160 p = &(*p)->rb_left;
161 else if (memcmp(&ip_needle, &node->nd_ipv4_address,
162 sizeof(ip_needle)) > 0)
163 p = &(*p)->rb_right;
164 else {
165 ret = node;
166 break;
167 }
168 }
169
170 if (ret_p != NULL)
171 *ret_p = p;
172 if (ret_parent != NULL)
173 *ret_parent = parent;
174
175 return ret;
176}
177
178struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
179{
180 struct o2nm_node *node = NULL;
181 struct o2nm_cluster *cluster = o2nm_single_cluster;
182
183 if (cluster == NULL)
184 goto out;
185
186 read_lock(&cluster->cl_nodes_lock);
187 node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
188 if (node)
189 config_item_get(&node->nd_item);
190 read_unlock(&cluster->cl_nodes_lock);
191
192out:
193 return node;
194}
195EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
196
197void o2nm_node_put(struct o2nm_node *node)
198{
199 config_item_put(&node->nd_item);
200}
201EXPORT_SYMBOL_GPL(o2nm_node_put);
202
203void o2nm_node_get(struct o2nm_node *node)
204{
205 config_item_get(&node->nd_item);
206}
207EXPORT_SYMBOL_GPL(o2nm_node_get);
208
209u8 o2nm_this_node(void)
210{
211 u8 node_num = O2NM_MAX_NODES;
212
213 if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
214 node_num = o2nm_single_cluster->cl_local_node;
215
216 return node_num;
217}
218EXPORT_SYMBOL_GPL(o2nm_this_node);
219
220/* node configfs bits */
221
222static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
223{
224 return item ?
225 container_of(to_config_group(item), struct o2nm_cluster,
226 cl_group)
227 : NULL;
228}
229
230static struct o2nm_node *to_o2nm_node(struct config_item *item)
231{
232 return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
233}
234
235static void o2nm_node_release(struct config_item *item)
236{
237 struct o2nm_node *node = to_o2nm_node(item);
238 kfree(node);
239}
240
241static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
242{
243 return sprintf(page, "%d\n", node->nd_num);
244}
245
246static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
247{
248 /* through the first node_set .parent
249 * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
250 return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
251}
252
253enum {
254 O2NM_NODE_ATTR_NUM = 0,
255 O2NM_NODE_ATTR_PORT,
256 O2NM_NODE_ATTR_ADDRESS,
257 O2NM_NODE_ATTR_LOCAL,
258};
259
260static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
261 size_t count)
262{
263 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
264 unsigned long tmp;
265 char *p = (char *)page;
266
267 tmp = simple_strtoul(p, &p, 0);
268 if (!p || (*p && (*p != '\n')))
269 return -EINVAL;
270
271 if (tmp >= O2NM_MAX_NODES)
272 return -ERANGE;
273
274 /* once we're in the cl_nodes tree networking can look us up by
275 * node number and try to use our address and port attributes
276 * to connect to this node.. make sure that they've been set
277 * before writing the node attribute? */
278 if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
279 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
280 return -EINVAL; /* XXX */
281
282 write_lock(&cluster->cl_nodes_lock);
283 if (cluster->cl_nodes[tmp])
284 p = NULL;
285 else {
286 cluster->cl_nodes[tmp] = node;
287 node->nd_num = tmp;
288 set_bit(tmp, cluster->cl_nodes_bitmap);
289 }
290 write_unlock(&cluster->cl_nodes_lock);
291 if (p == NULL)
292 return -EEXIST;
293
294 return count;
295}
296static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
297{
298 return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
299}
300
301static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
302 const char *page, size_t count)
303{
304 unsigned long tmp;
305 char *p = (char *)page;
306
307 tmp = simple_strtoul(p, &p, 0);
308 if (!p || (*p && (*p != '\n')))
309 return -EINVAL;
310
311 if (tmp == 0)
312 return -EINVAL;
313 if (tmp >= (u16)-1)
314 return -ERANGE;
315
316 node->nd_ipv4_port = htons(tmp);
317
318 return count;
319}
320
321static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
322{
323 return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
324}
325
326static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
327 const char *page,
328 size_t count)
329{
330 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
331 int ret, i;
332 struct rb_node **p, *parent;
333 unsigned int octets[4];
334 __be32 ipv4_addr = 0;
335
336 ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
337 &octets[1], &octets[0]);
338 if (ret != 4)
339 return -EINVAL;
340
341 for (i = 0; i < ARRAY_SIZE(octets); i++) {
342 if (octets[i] > 255)
343 return -ERANGE;
344 be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
345 }
346
347 ret = 0;
348 write_lock(&cluster->cl_nodes_lock);
349 if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
350 ret = -EEXIST;
351 else {
352 rb_link_node(&node->nd_ip_node, parent, p);
353 rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
354 }
355 write_unlock(&cluster->cl_nodes_lock);
356 if (ret)
357 return ret;
358
359 memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
360
361 return count;
362}
363
364static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
365{
366 return sprintf(page, "%d\n", node->nd_local);
367}
368
369static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
370 size_t count)
371{
372 struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
373 unsigned long tmp;
374 char *p = (char *)page;
375 ssize_t ret;
376
377 tmp = simple_strtoul(p, &p, 0);
378 if (!p || (*p && (*p != '\n')))
379 return -EINVAL;
380
381 tmp = !!tmp; /* boolean of whether this node wants to be local */
382
383 /* setting local turns on networking rx for now so we require having
384 * set everything else first */
385 if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
386 !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
387 !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
388 return -EINVAL; /* XXX */
389
390 /* the only failure case is trying to set a new local node
391 * when a different one is already set */
392 if (tmp && tmp == cluster->cl_has_local &&
393 cluster->cl_local_node != node->nd_num)
394 return -EBUSY;
395
396 /* bring up the rx thread if we're setting the new local node. */
397 if (tmp && !cluster->cl_has_local) {
398 ret = o2net_start_listening(node);
399 if (ret)
400 return ret;
401 }
402
403 if (!tmp && cluster->cl_has_local &&
404 cluster->cl_local_node == node->nd_num) {
405 o2net_stop_listening(node);
406 cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
407 }
408
409 node->nd_local = tmp;
410 if (node->nd_local) {
411 cluster->cl_has_local = tmp;
412 cluster->cl_local_node = node->nd_num;
413 }
414
415 return count;
416}
417
418struct o2nm_node_attribute {
419 struct configfs_attribute attr;
420 ssize_t (*show)(struct o2nm_node *, char *);
421 ssize_t (*store)(struct o2nm_node *, const char *, size_t);
422};
423
424static struct o2nm_node_attribute o2nm_node_attr_num = {
425 .attr = { .ca_owner = THIS_MODULE,
426 .ca_name = "num",
427 .ca_mode = S_IRUGO | S_IWUSR },
428 .show = o2nm_node_num_read,
429 .store = o2nm_node_num_write,
430};
431
432static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
433 .attr = { .ca_owner = THIS_MODULE,
434 .ca_name = "ipv4_port",
435 .ca_mode = S_IRUGO | S_IWUSR },
436 .show = o2nm_node_ipv4_port_read,
437 .store = o2nm_node_ipv4_port_write,
438};
439
440static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
441 .attr = { .ca_owner = THIS_MODULE,
442 .ca_name = "ipv4_address",
443 .ca_mode = S_IRUGO | S_IWUSR },
444 .show = o2nm_node_ipv4_address_read,
445 .store = o2nm_node_ipv4_address_write,
446};
447
448static struct o2nm_node_attribute o2nm_node_attr_local = {
449 .attr = { .ca_owner = THIS_MODULE,
450 .ca_name = "local",
451 .ca_mode = S_IRUGO | S_IWUSR },
452 .show = o2nm_node_local_read,
453 .store = o2nm_node_local_write,
454};
455
456static struct configfs_attribute *o2nm_node_attrs[] = {
457 [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
458 [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
459 [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
460 [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
461 NULL,
462};
463
464static int o2nm_attr_index(struct configfs_attribute *attr)
465{
466 int i;
467 for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
468 if (attr == o2nm_node_attrs[i])
469 return i;
470 }
471 BUG();
472 return 0;
473}
474
475static ssize_t o2nm_node_show(struct config_item *item,
476 struct configfs_attribute *attr,
477 char *page)
478{
479 struct o2nm_node *node = to_o2nm_node(item);
480 struct o2nm_node_attribute *o2nm_node_attr =
481 container_of(attr, struct o2nm_node_attribute, attr);
482 ssize_t ret = 0;
483
484 if (o2nm_node_attr->show)
485 ret = o2nm_node_attr->show(node, page);
486 return ret;
487}
488
489static ssize_t o2nm_node_store(struct config_item *item,
490 struct configfs_attribute *attr,
491 const char *page, size_t count)
492{
493 struct o2nm_node *node = to_o2nm_node(item);
494 struct o2nm_node_attribute *o2nm_node_attr =
495 container_of(attr, struct o2nm_node_attribute, attr);
496 ssize_t ret;
497 int attr_index = o2nm_attr_index(attr);
498
499 if (o2nm_node_attr->store == NULL) {
500 ret = -EINVAL;
501 goto out;
502 }
503
504 if (test_bit(attr_index, &node->nd_set_attributes))
505 return -EBUSY;
506
507 ret = o2nm_node_attr->store(node, page, count);
508 if (ret < count)
509 goto out;
510
511 set_bit(attr_index, &node->nd_set_attributes);
512out:
513 return ret;
514}
515
516static struct configfs_item_operations o2nm_node_item_ops = {
517 .release = o2nm_node_release,
518 .show_attribute = o2nm_node_show,
519 .store_attribute = o2nm_node_store,
520};
521
522static struct config_item_type o2nm_node_type = {
523 .ct_item_ops = &o2nm_node_item_ops,
524 .ct_attrs = o2nm_node_attrs,
525 .ct_owner = THIS_MODULE,
526};
527
528/* node set */
529
530struct o2nm_node_group {
531 struct config_group ns_group;
532 /* some stuff? */
533};
534
535#if 0
536static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
537{
538 return group ?
539 container_of(group, struct o2nm_node_group, ns_group)
540 : NULL;
541}
542#endif
543
544static struct config_item *o2nm_node_group_make_item(struct config_group *group,
545 const char *name)
546{
547 struct o2nm_node *node = NULL;
548 struct config_item *ret = NULL;
549
550 if (strlen(name) > O2NM_MAX_NAME_LEN)
551 goto out; /* ENAMETOOLONG */
552
553 node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL);
554 if (node == NULL)
555 goto out; /* ENOMEM */
556
557 strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
558 config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
559 spin_lock_init(&node->nd_lock);
560
561 ret = &node->nd_item;
562
563out:
564 if (ret == NULL)
565 kfree(node);
566
567 return ret;
568}
569
570static void o2nm_node_group_drop_item(struct config_group *group,
571 struct config_item *item)
572{
573 struct o2nm_node *node = to_o2nm_node(item);
574 struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
575
576 o2net_disconnect_node(node);
577
578 if (cluster->cl_has_local &&
579 (cluster->cl_local_node == node->nd_num)) {
580 cluster->cl_has_local = 0;
581 cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
582 o2net_stop_listening(node);
583 }
584
585 /* XXX call into net to stop this node from trading messages */
586
587 write_lock(&cluster->cl_nodes_lock);
588
589 /* XXX sloppy */
590 if (node->nd_ipv4_address)
591 rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
592
593 /* nd_num might be 0 if the node number hasn't been set.. */
594 if (cluster->cl_nodes[node->nd_num] == node) {
595 cluster->cl_nodes[node->nd_num] = NULL;
596 clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
597 }
598 write_unlock(&cluster->cl_nodes_lock);
599
600 config_item_put(item);
601}
602
603static struct configfs_group_operations o2nm_node_group_group_ops = {
604 .make_item = o2nm_node_group_make_item,
605 .drop_item = o2nm_node_group_drop_item,
606};
607
608static struct config_item_type o2nm_node_group_type = {
609 .ct_group_ops = &o2nm_node_group_group_ops,
610 .ct_owner = THIS_MODULE,
611};
612
613/* cluster */
614
615static void o2nm_cluster_release(struct config_item *item)
616{
617 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
618
619 kfree(cluster->cl_group.default_groups);
620 kfree(cluster);
621}
622
623static struct configfs_item_operations o2nm_cluster_item_ops = {
624 .release = o2nm_cluster_release,
625};
626
627static struct config_item_type o2nm_cluster_type = {
628 .ct_item_ops = &o2nm_cluster_item_ops,
629 .ct_owner = THIS_MODULE,
630};
631
632/* cluster set */
633
634struct o2nm_cluster_group {
635 struct configfs_subsystem cs_subsys;
636 /* some stuff? */
637};
638
639#if 0
640static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
641{
642 return group ?
643 container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
644 : NULL;
645}
646#endif
647
648static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
649 const char *name)
650{
651 struct o2nm_cluster *cluster = NULL;
652 struct o2nm_node_group *ns = NULL;
653 struct config_group *o2hb_group = NULL, *ret = NULL;
654 void *defs = NULL;
655
656 /* this runs under the parent dir's i_sem; there can be only
657 * one caller in here at a time */
658 if (o2nm_single_cluster)
659 goto out; /* ENOSPC */
660
661 cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
662 ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
663 defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
664 o2hb_group = o2hb_alloc_hb_set();
665 if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
666 goto out;
667
668 config_group_init_type_name(&cluster->cl_group, name,
669 &o2nm_cluster_type);
670 config_group_init_type_name(&ns->ns_group, "node",
671 &o2nm_node_group_type);
672
673 cluster->cl_group.default_groups = defs;
674 cluster->cl_group.default_groups[0] = &ns->ns_group;
675 cluster->cl_group.default_groups[1] = o2hb_group;
676 cluster->cl_group.default_groups[2] = NULL;
677 rwlock_init(&cluster->cl_nodes_lock);
678 cluster->cl_node_ip_tree = RB_ROOT;
679
680 ret = &cluster->cl_group;
681 o2nm_single_cluster = cluster;
682
683out:
684 if (ret == NULL) {
685 kfree(cluster);
686 kfree(ns);
687 o2hb_free_hb_set(o2hb_group);
688 kfree(defs);
689 }
690
691 return ret;
692}
693
694static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
695{
696 struct o2nm_cluster *cluster = to_o2nm_cluster(item);
697 int i;
698 struct config_item *killme;
699
700 BUG_ON(o2nm_single_cluster != cluster);
701 o2nm_single_cluster = NULL;
702
703 for (i = 0; cluster->cl_group.default_groups[i]; i++) {
704 killme = &cluster->cl_group.default_groups[i]->cg_item;
705 cluster->cl_group.default_groups[i] = NULL;
706 config_item_put(killme);
707 }
708
709 config_item_put(item);
710}
711
712static struct configfs_group_operations o2nm_cluster_group_group_ops = {
713 .make_group = o2nm_cluster_group_make_group,
714 .drop_item = o2nm_cluster_group_drop_item,
715};
716
717static struct config_item_type o2nm_cluster_group_type = {
718 .ct_group_ops = &o2nm_cluster_group_group_ops,
719 .ct_owner = THIS_MODULE,
720};
721
722static struct o2nm_cluster_group o2nm_cluster_group = {
723 .cs_subsys = {
724 .su_group = {
725 .cg_item = {
726 .ci_namebuf = "cluster",
727 .ci_type = &o2nm_cluster_group_type,
728 },
729 },
730 },
731};
732
733static void __exit exit_o2nm(void)
734{
735 if (ocfs2_table_header)
736 unregister_sysctl_table(ocfs2_table_header);
737
738 /* XXX sync with hb callbacks and shut down hb? */
739 o2net_unregister_hb_callbacks();
740 configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
741 o2cb_sys_shutdown();
742
743 o2net_exit();
744}
745
746static int __init init_o2nm(void)
747{
748 int ret = -1;
749
750 cluster_print_version();
751
752 o2hb_init();
753 o2net_init();
754
755 ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
756 if (!ocfs2_table_header) {
757 printk(KERN_ERR "nodemanager: unable to register sysctl\n");
758 ret = -ENOMEM; /* or something. */
759 goto out;
760 }
761
762 ret = o2net_register_hb_callbacks();
763 if (ret)
764 goto out_sysctl;
765
766 config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
767 init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
768 ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
769 if (ret) {
770 printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
771 goto out_callbacks;
772 }
773
774 ret = o2cb_sys_init();
775 if (!ret)
776 goto out;
777
778 configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
779out_callbacks:
780 o2net_unregister_hb_callbacks();
781out_sysctl:
782 unregister_sysctl_table(ocfs2_table_header);
783out:
784 return ret;
785}
786
787MODULE_AUTHOR("Oracle");
788MODULE_LICENSE("GPL");
789
790module_init(init_o2nm)
791module_exit(exit_o2nm)
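
A sketch of the reference discipline implied by the exported lookups (the
consumer function is hypothetical): o2nm_get_node_by_num() and
o2nm_get_node_by_ip() return with a configfs item reference held, which the
caller must drop with o2nm_node_put():

	static void example_use_node(u8 num)
	{
		struct o2nm_node *node = o2nm_get_node_by_num(num);

		if (node == NULL)	/* no cluster, or node not configured */
			return;
		/* nd_ipv4_address and nd_ipv4_port are safe to read while
		 * the reference is held */
		o2nm_node_put(node);
	}
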
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
new file mode 100644
index 000000000000..fce8033c310f
--- /dev/null
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -0,0 +1,64 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * nodemanager.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_NODEMANAGER_H
28#define O2CLUSTER_NODEMANAGER_H
29
30#include "ocfs2_nodemanager.h"
31
32/* This totally doesn't belong here. */
33#include <linux/configfs.h>
34#include <linux/rbtree.h>
35
36#define KERN_OCFS2 988
37#define KERN_OCFS2_NM 1
38
39const char *o2nm_get_hb_ctl_path(void);
40
41struct o2nm_node {
42 spinlock_t nd_lock;
43 struct config_item nd_item;
44 char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
45 __u8 nd_num;
46 /* only one address per node, as attributes, for now. */
47 __be32 nd_ipv4_address;
48 __be16 nd_ipv4_port;
49 struct rb_node nd_ip_node;
50 /* there can be only one local node for now */
51 int nd_local;
52
53 unsigned long nd_set_attributes;
54};
55
56u8 o2nm_this_node(void);
57
58int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
59struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
60struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
61void o2nm_node_get(struct o2nm_node *node);
62void o2nm_node_put(struct o2nm_node *node);
63
64#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
new file mode 100644
index 000000000000..94096069cb43
--- /dev/null
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -0,0 +1,37 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_heartbeat.h
5 *
6 * On-disk structures for ocfs2_heartbeat
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef _OCFS2_HEARTBEAT_H
27#define _OCFS2_HEARTBEAT_H
28
29struct o2hb_disk_heartbeat_block {
30 __le64 hb_seq;
31 __u8 hb_node;
32 __u8 hb_pad1[3];
33 __le32 hb_cksum;
34 __le64 hb_generation;
35};
36
37#endif /* _OCFS2_HEARTBEAT_H */
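
Since this block is an on-disk format, its multi-byte fields are stored
little-endian and must be converted on access; a minimal sketch (the helper
name is hypothetical):

	static inline u64 o2hb_block_seq(struct o2hb_disk_heartbeat_block *hb)
	{
		return le64_to_cpu(hb->hb_seq);	/* __le64 on disk to CPU order */
	}
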
diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h
new file mode 100644
index 000000000000..5b9854bad571
--- /dev/null
+++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h
@@ -0,0 +1,39 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_nodemanager.h
5 *
6 * Header describing the interface between userspace and the kernel
7 * for the ocfs2_nodemanager module.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
25 *
26 */
27
28#ifndef _OCFS2_NODEMANAGER_H
29#define _OCFS2_NODEMANAGER_H
30
31#define O2NM_API_VERSION 5
32
33#define O2NM_MAX_NODES 255
34#define O2NM_INVALID_NODE_NUM 255
35
36/* host name, group name, cluster name all 64 bytes */
37#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
38
39#endif /* _OCFS2_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
new file mode 100644
index 000000000000..7bba98fbfc15
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.c
@@ -0,0 +1,315 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 *
3 * vim: noexpandtab sw=8 ts=8 sts=0:
4 *
5 * Copyright (C) 2005 Oracle. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public
18 * License along with this program; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
21 */
22
23/* This quorum hack is only here until we transition to some more rational
24 * approach that is driven from userspace. Honest. No foolin'.
25 *
26 * Imagine two nodes lose network connectivity to each other but they're still
27 * up and operating in every other way. Presumably a network timeout indicates
28 * that a node is broken and should be recovered. They can't both recover each
29 * other and both carry on without serialising their access to the file system.
30 * They need to decide who is authoritative. Now extend that problem to
31 * arbitrary groups of nodes losing connectivity between each other.
32 *
33 * So we declare that a node which has given up on connecting to a majority
34 * of nodes who are still heartbeating will fence itself.
35 *
36 * There are huge opportunities for races here. After we give up on a node's
37 * connection we need to wait long enough to give heartbeat an opportunity
38 * to declare the node as truly dead. We also need to be careful with the
39 * race between when we see a node start heartbeating and when we connect
40 * to it.
41 *
42 * So nodes that are in this transition put a hold on the quorum decision
43 * with a counter. As they fall out of this transition they drop the count
44 * and if they're the last, they fire off the decision.
45 */
46#include <linux/kernel.h>
47#include <linux/slab.h>
48#include <linux/workqueue.h>
49
50#include "heartbeat.h"
51#include "nodemanager.h"
52#define MLOG_MASK_PREFIX ML_QUORUM
53#include "masklog.h"
54#include "quorum.h"
55
56static struct o2quo_state {
57 spinlock_t qs_lock;
58 struct work_struct qs_work;
59 int qs_pending;
60 int qs_heartbeating;
61 unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
62 int qs_connected;
63 unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
64 int qs_holds;
65 unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
66} o2quo_state;
67
68/* this is horribly heavy-handed. It should instead flip the file
69 * system RO and call some userspace script. */
70static void o2quo_fence_self(void)
71{
72 /* panic spins with interrupts enabled. with preempt
73 * threads can still schedule, etc, etc */
74 o2hb_stop_all_regions();
75 panic("ocfs2 is very sorry to be fencing this system by panicking\n");
76}
77
78/* Indicate that a timeout occurred on a heartbeat region write. The
79 * other nodes in the cluster may consider us dead at that time so we
80 * want to "fence" ourselves so that we don't scribble on the disk
81 * after they think they've recovered us. This can't solve all
82 * problems related to writeout after recovery but this hack can at
83 * least close some of those gaps. When we have real fencing, this can
84 * go away as our node would be fenced externally before other nodes
85 * begin recovery. */
86void o2quo_disk_timeout(void)
87{
88 o2quo_fence_self();
89}
90
91static void o2quo_make_decision(void *arg)
92{
93 int quorum;
94 int lowest_hb, lowest_reachable = 0, fence = 0;
95 struct o2quo_state *qs = &o2quo_state;
96
97 spin_lock(&qs->qs_lock);
98
99 lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
100 if (lowest_hb != O2NM_MAX_NODES)
101 lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
102
103 mlog(0, "heartbeating: %d, connected: %d, "
104 "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
105 qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
106
107 if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
108 qs->qs_heartbeating == 1)
109 goto out;
110
111 if (qs->qs_heartbeating & 1) {
112 /* the odd numbered cluster case is straightforward --
113 * if we can't talk to the majority we're hosed */
114 quorum = (qs->qs_heartbeating + 1)/2;
115 if (qs->qs_connected < quorum) {
116 mlog(ML_ERROR, "fencing this node because it is "
117 "only connected to %u nodes and %u is needed "
118 "to make a quorum out of %u heartbeating nodes\n",
119 qs->qs_connected, quorum,
120 qs->qs_heartbeating);
121 fence = 1;
122 }
123 } else {
124 /* the even numbered cluster adds the possibility of each half
125 * of the cluster being able to talk amongst themselves.. in
126 * that case we're hosed if we can't talk to the group that has
127 * the lowest numbered node */
128 quorum = qs->qs_heartbeating / 2;
129 if (qs->qs_connected < quorum) {
130 mlog(ML_ERROR, "fencing this node because it is "
131 "only connected to %u nodes and %u is needed "
132 "to make a quorum out of %u heartbeating nodes\n",
133 qs->qs_connected, quorum,
134 qs->qs_heartbeating);
135 fence = 1;
136 }
137 else if ((qs->qs_connected == quorum) &&
138 !lowest_reachable) {
139 mlog(ML_ERROR, "fencing this node because it is "
140 "connected to a half-quorum of %u out of %u "
141 "nodes which doesn't include the lowest active "
142 "node %u\n", quorum, qs->qs_heartbeating,
143 lowest_hb);
144 fence = 1;
145 }
146 }
147
148out:
149 spin_unlock(&qs->qs_lock);
150 if (fence)
151 o2quo_fence_self();
152}
153
154static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
155{
156 assert_spin_locked(&qs->qs_lock);
157
158 if (!test_and_set_bit(node, qs->qs_hold_bm)) {
159 qs->qs_holds++;
160 mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
161 "node %u\n", node);
162 mlog(0, "node %u, %d total\n", node, qs->qs_holds);
163 }
164}
165
166static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
167{
168 assert_spin_locked(&qs->qs_lock);
169
170 if (test_and_clear_bit(node, qs->qs_hold_bm)) {
171 mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
172 if (--qs->qs_holds == 0) {
173 if (qs->qs_pending) {
174 qs->qs_pending = 0;
175 schedule_work(&qs->qs_work);
176 }
177 }
178 mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
179 node, qs->qs_holds);
180 }
181}
182
183/* as a node comes up we delay the quorum decision until we know the fate of
184 * the connection. the hold will be dropped in conn_up or hb_down. it might be
185 * perpetuated by conn_err until hb_down. if we already have a conn, we might
186 * be dropping a hold that conn_up got. */
187void o2quo_hb_up(u8 node)
188{
189 struct o2quo_state *qs = &o2quo_state;
190
191 spin_lock(&qs->qs_lock);
192
193 qs->qs_heartbeating++;
194 mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
195 "node %u\n", node);
196 mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
197 set_bit(node, qs->qs_hb_bm);
198
199 mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
200
201 if (!test_bit(node, qs->qs_conn_bm))
202 o2quo_set_hold(qs, node);
203 else
204 o2quo_clear_hold(qs, node);
205
206 spin_unlock(&qs->qs_lock);
207}
208
209/* hb going down releases any holds we might have had due to this node from
210 * conn_up, conn_err, or hb_up */
211void o2quo_hb_down(u8 node)
212{
213 struct o2quo_state *qs = &o2quo_state;
214
215 spin_lock(&qs->qs_lock);
216
217 qs->qs_heartbeating--;
218 mlog_bug_on_msg(qs->qs_heartbeating < 0,
219 "node %u, %d heartbeating\n",
220 node, qs->qs_heartbeating);
221 mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
222 clear_bit(node, qs->qs_hb_bm);
223
224 mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
225
226 o2quo_clear_hold(qs, node);
227
228 spin_unlock(&qs->qs_lock);
229}
230
231/* this tells us that we've decided that the node is still heartbeating
232 * even though we've lost its conn. it must only be called after conn_err
233 * and indicates that we must now make a quorum decision in the future,
234 * though we might be doing so after waiting for holds to drain. Here
235 * we'll be dropping the hold from conn_err. */
236void o2quo_hb_still_up(u8 node)
237{
238 struct o2quo_state *qs = &o2quo_state;
239
240 spin_lock(&qs->qs_lock);
241
242 mlog(0, "node %u\n", node);
243
244 qs->qs_pending = 1;
245 o2quo_clear_hold(qs, node);
246
247 spin_unlock(&qs->qs_lock);
248}
249
250/* This is analogous to hb_up. as a node's connection comes up we delay the
251 * quorum decision until we see it heartbeating. the hold will be dropped in
252 * hb_up or hb_down. it might be perpetuated by conn_err until hb_down. if
253 * it's already heartbeating we might be dropping a hold that conn_up got.
254 * */
255void o2quo_conn_up(u8 node)
256{
257 struct o2quo_state *qs = &o2quo_state;
258
259 spin_lock(&qs->qs_lock);
260
261 qs->qs_connected++;
262 mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
263 "node %u\n", node);
264 mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
265 set_bit(node, qs->qs_conn_bm);
266
267 mlog(0, "node %u, %d total\n", node, qs->qs_connected);
268
269 if (!test_bit(node, qs->qs_hb_bm))
270 o2quo_set_hold(qs, node);
271 else
272 o2quo_clear_hold(qs, node);
273
274 spin_unlock(&qs->qs_lock);
275}
276
277/* we've decided that we won't ever be connecting to the node again. if it's
278 * still heartbeating we grab a hold that will delay decisions until either the
279 * node stops heartbeating from hb_down or the caller decides that the node is
280 * still up and calls still_up */
281void o2quo_conn_err(u8 node)
282{
283 struct o2quo_state *qs = &o2quo_state;
284
285 spin_lock(&qs->qs_lock);
286
287 if (test_bit(node, qs->qs_conn_bm)) {
288 qs->qs_connected--;
289 mlog_bug_on_msg(qs->qs_connected < 0,
290 "node %u, connected %d\n",
291 node, qs->qs_connected);
292
293 clear_bit(node, qs->qs_conn_bm);
294 }
295
296 mlog(0, "node %u, %d total\n", node, qs->qs_connected);
297
298 if (test_bit(node, qs->qs_hb_bm))
299 o2quo_set_hold(qs, node);
300
301 spin_unlock(&qs->qs_lock);
302}
303
304void o2quo_init(void)
305{
306 struct o2quo_state *qs = &o2quo_state;
307
308 spin_lock_init(&qs->qs_lock);
309 INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
310}
311
312void o2quo_exit(void)
313{
314 flush_scheduled_work();
315}
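
The odd/even arithmetic in o2quo_make_decision() is easier to follow when
restated on its own; this is a simplified sketch of the same rule, not the
kernel code path:

	static int would_fence(int heartbeating, int connected,
			       int lowest_reachable)
	{
		int quorum;

		if (heartbeating & 1) {
			/* odd cluster: a strict majority must be connected */
			quorum = (heartbeating + 1) / 2;
			return connected < quorum;
		}
		/* even cluster: an exact half-quorum survives only if it
		 * can reach the lowest-numbered heartbeating node */
		quorum = heartbeating / 2;
		if (connected < quorum)
			return 1;
		return connected == quorum && !lowest_reachable;
	}

For example, with six heartbeating nodes the quorum is three: a node
connected to only two peers fences itself, and one connected to exactly
three survives only if the lowest-numbered active node is among them.
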
diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h
new file mode 100644
index 000000000000..6649cc6f67c9
--- /dev/null
+++ b/fs/ocfs2/cluster/quorum.h
@@ -0,0 +1,36 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 */
22
23#ifndef O2CLUSTER_QUORUM_H
24#define O2CLUSTER_QUORUM_H
25
26void o2quo_init(void);
27void o2quo_exit(void);
28
29void o2quo_hb_up(u8 node);
30void o2quo_hb_down(u8 node);
31void o2quo_hb_still_up(u8 node);
32void o2quo_conn_up(u8 node);
33void o2quo_conn_err(u8 node);
34void o2quo_disk_timeout(void);
35
36#endif /* O2CLUSTER_QUORUM_H */
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
new file mode 100644
index 000000000000..1d9f6acafa2e
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.c
@@ -0,0 +1,124 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sys.c
5 *
6 * OCFS2 cluster sysfs interface
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation,
13 * version 2 of the License.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#include <linux/kernel.h>
28#include <linux/module.h>
29#include <linux/kobject.h>
30#include <linux/sysfs.h>
31
32#include "ocfs2_nodemanager.h"
33#include "masklog.h"
34#include "sys.h"
35
36struct o2cb_attribute {
37 struct attribute attr;
38 ssize_t (*show)(char *buf);
39 ssize_t (*store)(const char *buf, size_t count);
40};
41
42#define O2CB_ATTR(_name, _mode, _show, _store) \
43struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)
44
45#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
46#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
47
48static ssize_t o2cb_interface_revision_show(char *buf)
49{
50 return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
51}
52
53static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
54
55static struct attribute *o2cb_attrs[] = {
56 &o2cb_attr_interface_revision.attr,
57 NULL,
58};
59
60static ssize_t
61o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
62static ssize_t
63o2cb_store(struct kobject * kobj, struct attribute * attr,
64 const char * buffer, size_t count);
65static struct sysfs_ops o2cb_sysfs_ops = {
66 .show = o2cb_show,
67 .store = o2cb_store,
68};
69
70static struct kobj_type o2cb_subsys_type = {
71 .default_attrs = o2cb_attrs,
72 .sysfs_ops = &o2cb_sysfs_ops,
73};
74
75/* gives us o2cb_subsys */
76static decl_subsys(o2cb, NULL, NULL);
77
78static ssize_t
79o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
80{
81 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
82 struct subsystem *sbs = to_o2cb_subsys(kobj);
83
84 BUG_ON(sbs != &o2cb_subsys);
85
86 if (o2cb_attr->show)
87 return o2cb_attr->show(buffer);
88 return -EIO;
89}
90
91static ssize_t
92o2cb_store(struct kobject * kobj, struct attribute * attr,
93 const char * buffer, size_t count)
94{
95 struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
96 struct subsystem *sbs = to_o2cb_subsys(kobj);
97
98 BUG_ON(sbs != &o2cb_subsys);
99
100 if (o2cb_attr->store)
101 return o2cb_attr->store(buffer, count);
102 return -EIO;
103}
104
105void o2cb_sys_shutdown(void)
106{
107 mlog_sys_shutdown();
108 subsystem_unregister(&o2cb_subsys);
109}
110
111int o2cb_sys_init(void)
112{
113 int ret;
114
115 o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
116 ret = subsystem_register(&o2cb_subsys);
117 if (ret)
118 return ret;
119
120 ret = mlog_sys_init(&o2cb_subsys);
121 if (ret)
122 subsystem_unregister(&o2cb_subsys);
123 return ret;
124}
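
Adding another attribute would follow the same O2CB_ATTR pattern; a
hypothetical read-only example:

	static ssize_t o2cb_example_show(char *buf)
	{
		return snprintf(buf, PAGE_SIZE, "%d\n", 42);
	}
	static O2CB_ATTR(example, S_IRUGO, o2cb_example_show, NULL);
	/* ...then list &o2cb_attr_example.attr in o2cb_attrs[] before the
	 * terminating NULL so o2cb_show() can dispatch to it */
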
diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h
new file mode 100644
index 000000000000..d66b8ab0045e
--- /dev/null
+++ b/fs/ocfs2/cluster/sys.h
@@ -0,0 +1,33 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sys.h
5 *
6 * Function prototypes for o2cb sysfs interface
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation,
13 * version 2 of the License.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_SYS_H
28#define O2CLUSTER_SYS_H
29
30void o2cb_sys_shutdown(void);
31int o2cb_sys_init(void);
32
33#endif /* O2CLUSTER_SYS_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
new file mode 100644
index 000000000000..35d92c01a972
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.c
@@ -0,0 +1,1829 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 *
3 * vim: noexpandtab sw=8 ts=8 sts=0:
4 *
5 * Copyright (C) 2004 Oracle. All rights reserved.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public
18 * License along with this program; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
21 *
22 * ----
23 *
24 * Callers for this were originally written against a very simple synchronous
25 * API. This implementation reflects those simple callers. Some day I'm sure
26 * we'll need to move to a more robust posting/callback mechanism.
27 *
28 * Transmit calls pass in kernel virtual addresses and block copying this into
29 * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting
30 * for a failed socket to time out. TX callers can also pass in a pointer to an
31 * 'int' which gets filled with an errno off the wire in response to the
32 * message they send.
33 *
34 * Handlers for unsolicited messages are registered. Each socket has a page
35 * that incoming data is copied into. First the header, then the data.
36 * Handlers are called from only one thread with a reference to this per-socket
37 * page. This page is destroyed after the handler call, so it can't be
38 * referenced beyond the call. Handlers may block but are discouraged from
39 * doing so.
40 *
41 * Any framing errors (bad magic, large payload lengths) close a connection.
42 *
43 * Our sock_container holds the state we associate with a socket. Its current
44 * framing state is held there as well as the refcounting we do around when it
45 * is safe to tear down the socket. The socket is only finally torn down from
46 * the container when the container loses all of its references -- so as long
47 * as you hold a ref on the container you can trust that the socket is valid
48 * for use with kernel socket APIs.
49 *
50 * Connections are initiated between a pair of nodes when the node with the
51 * higher node number gets a heartbeat callback which indicates that the lower
52 * numbered node has started heartbeating. The lower numbered node is passive
53 * and only accepts the connection if the higher numbered node is heartbeating.
54 */
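/*
 * (Illustrative note, an editorial addition to the comment above: if nodes
 * 2 and 5 are both heartbeating, node 5, the higher number, initiates the
 * connect when its heartbeat callback reports node 2 up; node 2 only ever
 * accepts, and only from peers it can see heartbeating.)
 */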
55
56#include <linux/kernel.h>
57#include <linux/jiffies.h>
58#include <linux/slab.h>
59#include <linux/idr.h>
60#include <linux/kref.h>
61#include <net/tcp.h>
62
63#include <asm/uaccess.h>
64
65#include "heartbeat.h"
66#include "tcp.h"
67#include "nodemanager.h"
68#define MLOG_MASK_PREFIX ML_TCP
69#include "masklog.h"
70#include "quorum.h"
71
72#include "tcp_internal.h"
73
74/*
75 * The linux network stack isn't sparse endian clean.. It has macros like
76 * ntohs() which perform the endian checks and structs like sockaddr_in
77 * which aren't annotated. So __force is found here to get the build
78 * clean. When they emerge from the dark ages and annotate the code
79 * we can remove these.
80 */
81
82#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u"
83#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \
84 NIPQUAD(sc->sc_node->nd_ipv4_address), \
85 ntohs(sc->sc_node->nd_ipv4_port)
86
87/*
88 * In the following two log macros, the whitespace after the ',' just
89 * before ##args is intentional. Otherwise, gcc 2.95 will eat the
90 * previous token if args expands to nothing.
91 */
92#define msglog(hdr, fmt, args...) do { \
93 typeof(hdr) __hdr = (hdr); \
94 mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \
95 "key %08x num %u] " fmt, \
96 be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \
97 be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \
98 be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \
99 be32_to_cpu(__hdr->msg_num) , ##args); \
100} while (0)
101
102#define sclog(sc, fmt, args...) do { \
103 typeof(sc) __sc = (sc); \
104 mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \
105 "pg_off %zu] " fmt, __sc, \
106 atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \
107 __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \
108 ##args); \
109} while (0)
110
111static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
112static struct rb_root o2net_handler_tree = RB_ROOT;
113
114static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
115
116/* XXX someday we'll need better accounting */
117static struct socket *o2net_listen_sock = NULL;
118
119/*
120 * listen work is only queued by the listening socket callbacks on the
121 * o2net_wq. teardown detaches the callbacks before destroying the workqueue.
122 * quorum work is queued as sock containers are shut down.. stop_listening
123 * tears down all the node's sock containers, preventing future shutdowns
124 * and queued quorum work, before canceling delayed quorum work and
125 * destroying the work queue.
126 */
127static struct workqueue_struct *o2net_wq;
128static struct work_struct o2net_listen_work;
129
130static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
131#define O2NET_HB_PRI 0x1
132
133static struct o2net_handshake *o2net_hand;
134static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
135
136static int o2net_sys_err_translations[O2NET_ERR_MAX] =
137 {[O2NET_ERR_NONE] = 0,
138 [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT,
139 [O2NET_ERR_OVERFLOW] = -EOVERFLOW,
140 [O2NET_ERR_DIED] = -EHOSTDOWN,};
141
142/* can't quite avoid *all* internal declarations :/ */
143static void o2net_sc_connect_completed(void *arg);
144static void o2net_rx_until_empty(void *arg);
145static void o2net_shutdown_sc(void *arg);
146static void o2net_listen_data_ready(struct sock *sk, int bytes);
147static void o2net_sc_send_keep_req(void *arg);
148static void o2net_idle_timer(unsigned long data);
149static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
150
151static inline int o2net_sys_err_to_errno(enum o2net_system_error err)
152{
153 int trans;
154 BUG_ON(err >= O2NET_ERR_MAX);
155 trans = o2net_sys_err_translations[err];
156
157 /* Just in case we mess up the translation table above */
158 BUG_ON(err != O2NET_ERR_NONE && trans == 0);
159 return trans;
160}
161
162static struct o2net_node * o2net_nn_from_num(u8 node_num)
163{
164 BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes));
165 return &o2net_nodes[node_num];
166}
167
168static u8 o2net_num_from_nn(struct o2net_node *nn)
169{
170 BUG_ON(nn == NULL);
171 return nn - o2net_nodes;
172}
173
174/* ------------------------------------------------------------ */
175
176static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw)
177{
178 int ret = 0;
179
180 do {
181 if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
182 ret = -EAGAIN;
183 break;
184 }
185 spin_lock(&nn->nn_lock);
186 ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
187 if (ret == 0)
188 list_add_tail(&nsw->ns_node_item,
189 &nn->nn_status_list);
190 spin_unlock(&nn->nn_lock);
191 } while (ret == -EAGAIN);
192
193 if (ret == 0) {
194 init_waitqueue_head(&nsw->ns_wq);
195 nsw->ns_sys_status = O2NET_ERR_NONE;
196 nsw->ns_status = 0;
197 }
198
199 return ret;
200}
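/*
 * Editor's note: the loop above is the classic two-step idr idiom of this
 * era; idr_pre_get() preallocates and idr_get_new() can still return
 * -EAGAIN if a racing allocator consumed the preallocation, hence the
 * retry. A caller sketch, assuming an initialized nn:
 *
 *	struct o2net_status_wait nsw = {
 *		.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
 *	};
 *	int ret = o2net_prep_nsw(nn, &nsw);
 *	if (ret)
 *		return ret;	// -EAGAIN: couldn't reserve an id
 */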
201
202static void o2net_complete_nsw_locked(struct o2net_node *nn,
203 struct o2net_status_wait *nsw,
204 enum o2net_system_error sys_status,
205 s32 status)
206{
207 assert_spin_locked(&nn->nn_lock);
208
209 if (!list_empty(&nsw->ns_node_item)) {
210 list_del_init(&nsw->ns_node_item);
211 nsw->ns_sys_status = sys_status;
212 nsw->ns_status = status;
213 idr_remove(&nn->nn_status_idr, nsw->ns_id);
214 wake_up(&nsw->ns_wq);
215 }
216}
217
218static void o2net_complete_nsw(struct o2net_node *nn,
219 struct o2net_status_wait *nsw,
220 u64 id, enum o2net_system_error sys_status,
221 s32 status)
222{
223 spin_lock(&nn->nn_lock);
224 if (nsw == NULL) {
225 if (id > INT_MAX)
226 goto out;
227
228 nsw = idr_find(&nn->nn_status_idr, id);
229 if (nsw == NULL)
230 goto out;
231 }
232
233 o2net_complete_nsw_locked(nn, nsw, sys_status, status);
234
235out:
236 spin_unlock(&nn->nn_lock);
237 return;
238}
239
240static void o2net_complete_nodes_nsw(struct o2net_node *nn)
241{
242 struct list_head *iter, *tmp;
243 unsigned int num_kills = 0;
244 struct o2net_status_wait *nsw;
245
246 assert_spin_locked(&nn->nn_lock);
247
248 list_for_each_safe(iter, tmp, &nn->nn_status_list) {
249 nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
250 o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
251 num_kills++;
252 }
253
254 mlog(0, "completed %d messages for node %u\n", num_kills,
255 o2net_num_from_nn(nn));
256}
257
258static int o2net_nsw_completed(struct o2net_node *nn,
259 struct o2net_status_wait *nsw)
260{
261 int completed;
262 spin_lock(&nn->nn_lock);
263 completed = list_empty(&nsw->ns_node_item);
264 spin_unlock(&nn->nn_lock);
265 return completed;
266}
267
268/* ------------------------------------------------------------ */
269
270static void sc_kref_release(struct kref *kref)
271{
272 struct o2net_sock_container *sc = container_of(kref,
273 struct o2net_sock_container, sc_kref);
274 sclog(sc, "releasing\n");
275
276 if (sc->sc_sock) {
277 sock_release(sc->sc_sock);
278 sc->sc_sock = NULL;
279 }
280
281 o2nm_node_put(sc->sc_node);
282 sc->sc_node = NULL;
283
284 kfree(sc);
285}
286
287static void sc_put(struct o2net_sock_container *sc)
288{
289 sclog(sc, "put\n");
290 kref_put(&sc->sc_kref, sc_kref_release);
291}
292static void sc_get(struct o2net_sock_container *sc)
293{
294 sclog(sc, "get\n");
295 kref_get(&sc->sc_kref);
296}
297static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
298{
299 struct o2net_sock_container *sc, *ret = NULL;
300 struct page *page = NULL;
301
302 page = alloc_page(GFP_NOFS);
303 sc = kcalloc(1, sizeof(*sc), GFP_NOFS);
304 if (sc == NULL || page == NULL)
305 goto out;
306
307 kref_init(&sc->sc_kref);
308 o2nm_node_get(node);
309 sc->sc_node = node;
310
311 INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc);
312 INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc);
313 INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc);
314 INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc);
315
316 init_timer(&sc->sc_idle_timeout);
317 sc->sc_idle_timeout.function = o2net_idle_timer;
318 sc->sc_idle_timeout.data = (unsigned long)sc;
319
320 sclog(sc, "alloced\n");
321
322 ret = sc;
323 sc->sc_page = page;
324 sc = NULL;
325 page = NULL;
326
327out:
328 if (page)
329 __free_page(page);
330 kfree(sc);
331
332 return ret;
333}
334
335/* ------------------------------------------------------------ */
336
337static void o2net_sc_queue_work(struct o2net_sock_container *sc,
338 struct work_struct *work)
339{
340 sc_get(sc);
341 if (!queue_work(o2net_wq, work))
342 sc_put(sc);
343}
344static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc,
345 struct work_struct *work,
346 int delay)
347{
348 sc_get(sc);
349 if (!queue_delayed_work(o2net_wq, work, delay))
350 sc_put(sc);
351}
352static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc,
353 struct work_struct *work)
354{
355 if (cancel_delayed_work(work))
356 sc_put(sc);
357}
358
359static void o2net_set_nn_state(struct o2net_node *nn,
360 struct o2net_sock_container *sc,
361 unsigned valid, int err)
362{
363 int was_valid = nn->nn_sc_valid;
364 int was_err = nn->nn_persistent_error;
365 struct o2net_sock_container *old_sc = nn->nn_sc;
366
367 assert_spin_locked(&nn->nn_lock);
368
369 /* the node num comparison and single connect/accept path should stop
370 * a non-null sc from being overwritten with another */
371 BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
372 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
373 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
374
375 /* we won't reconnect after our valid conn goes away for
376 * this hb iteration; the error is set here so it shows up in the logs */
377 if (was_valid && !valid && err == 0)
378 err = -ENOTCONN;
379
380 mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
381 o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
382 nn->nn_persistent_error, err);
383
384 nn->nn_sc = sc;
385 nn->nn_sc_valid = valid ? 1 : 0;
386 nn->nn_persistent_error = err;
387
388 /* mirrors o2net_tx_can_proceed() */
389 if (nn->nn_persistent_error || nn->nn_sc_valid)
390 wake_up(&nn->nn_sc_wq);
391
392 if (!was_err && nn->nn_persistent_error) {
393 o2quo_conn_err(o2net_num_from_nn(nn));
394 queue_delayed_work(o2net_wq, &nn->nn_still_up,
395 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
396 }
397
398 if (was_valid && !valid) {
399 mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
400 SC_NODEF_ARGS(old_sc));
401 o2net_complete_nodes_nsw(nn);
402 }
403
404 if (!was_valid && valid) {
405 o2quo_conn_up(o2net_num_from_nn(nn));
406 /* this is a bit of a hack. we only try reconnecting
407 * when heartbeating starts until we get a connection.
408 * if that connection then dies we don't try reconnecting.
409 * the only way to start connecting again is to down
410 * heartbeat and bring it back up. */
411 cancel_delayed_work(&nn->nn_connect_expired);
412 mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n",
413 o2nm_this_node() > sc->sc_node->nd_num ?
414 "connected to" : "accepted connection from",
415 SC_NODEF_ARGS(sc));
416 }
417
418 /* trigger the connecting worker func as long as we're not valid,
419 * it will back off if it shouldn't connect. This can be called
420 * from node config teardown and so needs to be careful about
421 * the work queue actually being up. */
422 if (!valid && o2net_wq) {
423 unsigned long delay;
424 /* delay if we're within a RECONNECT_DELAY of the
425 * last attempt */
426 delay = (nn->nn_last_connect_attempt +
427 msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
428 - jiffies;
429 if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS))
430 delay = 0;
431 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
432 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
433 }
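/*
 * Editor's note on the delay math above: the subtraction is done in
 * unsigned jiffies, so once the reconnect window has already passed it
 * wraps around to a huge value and the comparison clamps it to zero:
 *
 *	delay = (last_attempt + window) - jiffies; // small inside window
 *	if (delay > window)
 *		delay = 0;			   // wrapped: connect now
 */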
434
435 /* keep track of the nn's sc ref for the caller */
436 if ((old_sc == NULL) && sc)
437 sc_get(sc);
438 if (old_sc && (old_sc != sc)) {
439 o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
440 sc_put(old_sc);
441 }
442}
443
444/* see o2net_register_callbacks() */
445static void o2net_data_ready(struct sock *sk, int bytes)
446{
447 void (*ready)(struct sock *sk, int bytes);
448
449 read_lock(&sk->sk_callback_lock);
450 if (sk->sk_user_data) {
451 struct o2net_sock_container *sc = sk->sk_user_data;
452 sclog(sc, "data_ready hit\n");
453 do_gettimeofday(&sc->sc_tv_data_ready);
454 o2net_sc_queue_work(sc, &sc->sc_rx_work);
455 ready = sc->sc_data_ready;
456 } else {
457 ready = sk->sk_data_ready;
458 }
459 read_unlock(&sk->sk_callback_lock);
460
461 ready(sk, bytes);
462}
463
464/* see o2net_register_callbacks() */
465static void o2net_state_change(struct sock *sk)
466{
467 void (*state_change)(struct sock *sk);
468 struct o2net_sock_container *sc;
469
470 read_lock(&sk->sk_callback_lock);
471 sc = sk->sk_user_data;
472 if (sc == NULL) {
473 state_change = sk->sk_state_change;
474 goto out;
475 }
476
477 sclog(sc, "state_change to %d\n", sk->sk_state);
478
479 state_change = sc->sc_state_change;
480
481 switch(sk->sk_state) {
482 /* ignore connecting sockets as they make progress */
483 case TCP_SYN_SENT:
484 case TCP_SYN_RECV:
485 break;
486 case TCP_ESTABLISHED:
487 o2net_sc_queue_work(sc, &sc->sc_connect_work);
488 break;
489 default:
490 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
491 break;
492 }
493out:
494 read_unlock(&sk->sk_callback_lock);
495 state_change(sk);
496}
497
498/*
499 * we register callbacks so we can queue work on events before calling
500 * the original callbacks. our callbacks are careful to test user_data
501 * to discover when they've raced with o2net_unregister_callbacks().
502 */
503static void o2net_register_callbacks(struct sock *sk,
504 struct o2net_sock_container *sc)
505{
506 write_lock_bh(&sk->sk_callback_lock);
507
508 /* accepted sockets inherit the old listen socket data ready */
509 if (sk->sk_data_ready == o2net_listen_data_ready) {
510 sk->sk_data_ready = sk->sk_user_data;
511 sk->sk_user_data = NULL;
512 }
513
514 BUG_ON(sk->sk_user_data != NULL);
515 sk->sk_user_data = sc;
516 sc_get(sc);
517
518 sc->sc_data_ready = sk->sk_data_ready;
519 sc->sc_state_change = sk->sk_state_change;
520 sk->sk_data_ready = o2net_data_ready;
521 sk->sk_state_change = o2net_state_change;
522
523 write_unlock_bh(&sk->sk_callback_lock);
524}
525
526static int o2net_unregister_callbacks(struct sock *sk,
527 struct o2net_sock_container *sc)
528{
529 int ret = 0;
530
531 write_lock_bh(&sk->sk_callback_lock);
532 if (sk->sk_user_data == sc) {
533 ret = 1;
534 sk->sk_user_data = NULL;
535 sk->sk_data_ready = sc->sc_data_ready;
536 sk->sk_state_change = sc->sc_state_change;
537 }
538 write_unlock_bh(&sk->sk_callback_lock);
539
540 return ret;
541}
542
543/*
544 * this is a little helper that is called by callers who have seen a problem
545 * with an sc and want to detach it from the nn if someone hasn't already
546 * beaten them to it. if an error is given then the shutdown will be persistent
547 * and pending transmits will be canceled.
548 */
549static void o2net_ensure_shutdown(struct o2net_node *nn,
550 struct o2net_sock_container *sc,
551 int err)
552{
553 spin_lock(&nn->nn_lock);
554 if (nn->nn_sc == sc)
555 o2net_set_nn_state(nn, NULL, 0, err);
556 spin_unlock(&nn->nn_lock);
557}
558
559/*
560 * This work queue function performs the blocking parts of socket shutdown. A
561 * few paths lead here. set_nn_state will trigger this callback if it sees an
562 * sc detached from the nn. state_change will also trigger this callback
563 * directly when it sees errors. In that case we need to call set_nn_state
564 * ourselves as state_change couldn't get the nn_lock and call set_nn_state
565 * itself.
566 */
567static void o2net_shutdown_sc(void *arg)
568{
569 struct o2net_sock_container *sc = arg;
570 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
571
572 sclog(sc, "shutting down\n");
573
574 /* drop the callbacks ref and call shutdown only once */
575 if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
576 /* we shouldn't flush as we're in the thread, the
577 * races with pending sc work structs are harmless */
578 del_timer_sync(&sc->sc_idle_timeout);
579 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
580 sc_put(sc);
581 sc->sc_sock->ops->shutdown(sc->sc_sock,
582 RCV_SHUTDOWN|SEND_SHUTDOWN);
583 }
584
585 /* not fatal so failed connects before the other guy has our
586 * heartbeat can be retried */
587 o2net_ensure_shutdown(nn, sc, 0);
588 sc_put(sc);
589}
590
591/* ------------------------------------------------------------ */
592
593static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type,
594 u32 key)
595{
596 int ret = memcmp(&nmh->nh_key, &key, sizeof(key));
597
598 if (ret == 0)
599 ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type));
600
601 return ret;
602}
603
604static struct o2net_msg_handler *
605o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
606 struct rb_node **ret_parent)
607{
608 struct rb_node **p = &o2net_handler_tree.rb_node;
609 struct rb_node *parent = NULL;
610 struct o2net_msg_handler *nmh, *ret = NULL;
611 int cmp;
612
613 while (*p) {
614 parent = *p;
615 nmh = rb_entry(parent, struct o2net_msg_handler, nh_node);
616 cmp = o2net_handler_cmp(nmh, msg_type, key);
617
618 if (cmp < 0)
619 p = &(*p)->rb_left;
620 else if (cmp > 0)
621 p = &(*p)->rb_right;
622 else {
623 ret = nmh;
624 break;
625 }
626 }
627
628 if (ret_p != NULL)
629 *ret_p = p;
630 if (ret_parent != NULL)
631 *ret_parent = parent;
632
633 return ret;
634}
635
636static void o2net_handler_kref_release(struct kref *kref)
637{
638 struct o2net_msg_handler *nmh;
639 nmh = container_of(kref, struct o2net_msg_handler, nh_kref);
640
641 kfree(nmh);
642}
643
644static void o2net_handler_put(struct o2net_msg_handler *nmh)
645{
646 kref_put(&nmh->nh_kref, o2net_handler_kref_release);
647}
648
649/* max_len is protection for the handler func. incoming messages won't
650 * be given to the handler if their payload is longer than the max. */
651int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
652 o2net_msg_handler_func *func, void *data,
653 struct list_head *unreg_list)
654{
655 struct o2net_msg_handler *nmh = NULL;
656 struct rb_node **p, *parent;
657 int ret = 0;
658
659 if (max_len > O2NET_MAX_PAYLOAD_BYTES) {
660 mlog(0, "max_len for message handler out of range: %u\n",
661 max_len);
662 ret = -EINVAL;
663 goto out;
664 }
665
666 if (!msg_type) {
667 mlog(0, "no message type provided: %u, %p\n", msg_type, func);
668 ret = -EINVAL;
669 goto out;
670
671 }
672 if (!func) {
673 mlog(0, "no message handler provided: %u, %p\n",
674 msg_type, func);
675 ret = -EINVAL;
676 goto out;
677 }
678
679 nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS);
680 if (nmh == NULL) {
681 ret = -ENOMEM;
682 goto out;
683 }
684
685 nmh->nh_func = func;
686 nmh->nh_func_data = data;
687 nmh->nh_msg_type = msg_type;
688 nmh->nh_max_len = max_len;
689 nmh->nh_key = key;
690 /* the tree and list get this ref.. they're both removed in
691 * unregister when this ref is dropped */
692 kref_init(&nmh->nh_kref);
693 INIT_LIST_HEAD(&nmh->nh_unregister_item);
694
695 write_lock(&o2net_handler_lock);
696 if (o2net_handler_tree_lookup(msg_type, key, &p, &parent))
697 ret = -EEXIST;
698 else {
699 rb_link_node(&nmh->nh_node, parent, p);
700 rb_insert_color(&nmh->nh_node, &o2net_handler_tree);
701 list_add_tail(&nmh->nh_unregister_item, unreg_list);
702
703 mlog(ML_TCP, "registered handler func %p type %u key %08x\n",
704 func, msg_type, key);
705 /* we've had some trouble with handlers seemingly vanishing. */
706 mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p,
707 &parent) == NULL,
708 "couldn't find handler we *just* registerd "
709 "for type %u key %08x\n", msg_type, key);
710 }
711 write_unlock(&o2net_handler_lock);
714
715out:
716 if (ret)
717 kfree(nmh);
718
719 return ret;
720}
721EXPORT_SYMBOL_GPL(o2net_register_handler);
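/*
 * Usage sketch (editor's illustration; MY_MSG_PING, MY_KEY and the
 * handler are hypothetical, not part of o2net):
 *
 *	static int my_ping_handler(struct o2net_msg *msg, u32 len, void *data)
 *	{
 *		return 0;	// becomes the status echoed to the sender
 *	}
 *
 *	static LIST_HEAD(my_handlers);
 *
 *	static int my_attach(void)
 *	{
 *		return o2net_register_handler(MY_MSG_PING, MY_KEY, 0,
 *					      my_ping_handler, NULL,
 *					      &my_handlers);
 *	}
 *
 * and later, to tear down everything registered on the list:
 *
 *	o2net_unregister_handler_list(&my_handlers);
 */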
722
723void o2net_unregister_handler_list(struct list_head *list)
724{
725 struct list_head *pos, *n;
726 struct o2net_msg_handler *nmh;
727
728 write_lock(&o2net_handler_lock);
729 list_for_each_safe(pos, n, list) {
730 nmh = list_entry(pos, struct o2net_msg_handler,
731 nh_unregister_item);
732 mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
733 nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
734 rb_erase(&nmh->nh_node, &o2net_handler_tree);
735 list_del_init(&nmh->nh_unregister_item);
736 kref_put(&nmh->nh_kref, o2net_handler_kref_release);
737 }
738 write_unlock(&o2net_handler_lock);
739}
740EXPORT_SYMBOL_GPL(o2net_unregister_handler_list);
741
742static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
743{
744 struct o2net_msg_handler *nmh;
745
746 read_lock(&o2net_handler_lock);
747 nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL);
748 if (nmh)
749 kref_get(&nmh->nh_kref);
750 read_unlock(&o2net_handler_lock);
751
752 return nmh;
753}
754
755/* ------------------------------------------------------------ */
756
757static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
758{
759 int ret;
760 mm_segment_t oldfs;
761 struct kvec vec = {
762 .iov_len = len,
763 .iov_base = data,
764 };
765 struct msghdr msg = {
766 .msg_iovlen = 1,
767 .msg_iov = (struct iovec *)&vec,
768 .msg_flags = MSG_DONTWAIT,
769 };
770
771 oldfs = get_fs();
772 set_fs(get_ds());
773 ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
774 set_fs(oldfs);
775
776 return ret;
777}
778
779static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
780 size_t veclen, size_t total)
781{
782 int ret;
783 mm_segment_t oldfs;
784 struct msghdr msg = {
785 .msg_iov = (struct iovec *)vec,
786 .msg_iovlen = veclen,
787 };
788
789 if (sock == NULL) {
790 ret = -EINVAL;
791 goto out;
792 }
793
794 oldfs = get_fs();
795 set_fs(get_ds());
796 ret = sock_sendmsg(sock, &msg, total);
797 set_fs(oldfs);
798 if (ret != total) {
799 mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
800 total);
801 if (ret >= 0)
802 ret = -EPIPE; /* should be smarter, I bet */
803 goto out;
804 }
805
806 ret = 0;
807out:
808 if (ret < 0)
809 mlog(0, "returning error: %d\n", ret);
810 return ret;
811}
812
813static void o2net_sendpage(struct o2net_sock_container *sc,
814 void *kmalloced_virt,
815 size_t size)
816{
817 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
818 ssize_t ret;
819
821 ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
822 virt_to_page(kmalloced_virt),
823 (long)kmalloced_virt & ~PAGE_MASK,
824 size, MSG_DONTWAIT);
825 if (ret != size) {
826 mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
827 " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret);
828 o2net_ensure_shutdown(nn, sc, 0);
829 }
830}
831
832static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key)
833{
834 memset(msg, 0, sizeof(struct o2net_msg));
835 msg->magic = cpu_to_be16(O2NET_MSG_MAGIC);
836 msg->data_len = cpu_to_be16(data_len);
837 msg->msg_type = cpu_to_be16(msg_type);
838 msg->sys_status = cpu_to_be32(O2NET_ERR_NONE);
839 msg->status = 0;
840 msg->key = cpu_to_be32(key);
841}
842
843static int o2net_tx_can_proceed(struct o2net_node *nn,
844 struct o2net_sock_container **sc_ret,
845 int *error)
846{
847 int ret = 0;
848
849 spin_lock(&nn->nn_lock);
850 if (nn->nn_persistent_error) {
851 ret = 1;
852 *sc_ret = NULL;
853 *error = nn->nn_persistent_error;
854 } else if (nn->nn_sc_valid) {
855 kref_get(&nn->nn_sc->sc_kref);
856
857 ret = 1;
858 *sc_ret = nn->nn_sc;
859 *error = 0;
860 }
861 spin_unlock(&nn->nn_lock);
862
863 return ret;
864}
865
866int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
867 size_t caller_veclen, u8 target_node, int *status)
868{
869 int ret, error = 0;
870 struct o2net_msg *msg = NULL;
871 size_t veclen, caller_bytes = 0;
872 struct kvec *vec = NULL;
873 struct o2net_sock_container *sc = NULL;
874 struct o2net_node *nn = o2net_nn_from_num(target_node);
875 struct o2net_status_wait nsw = {
876 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
877 };
878
879 if (o2net_wq == NULL) {
880 mlog(0, "attempt to tx without o2netd running\n");
881 ret = -ESRCH;
882 goto out;
883 }
884
885 if (caller_veclen == 0) {
886 mlog(0, "bad kvec array length\n");
887 ret = -EINVAL;
888 goto out;
889 }
890
891 caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen);
892 if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) {
893 mlog(0, "total payload len %zu too large\n", caller_bytes);
894 ret = -EINVAL;
895 goto out;
896 }
897
898 if (target_node == o2nm_this_node()) {
899 ret = -ELOOP;
900 goto out;
901 }
902
903 ret = wait_event_interruptible(nn->nn_sc_wq,
904 o2net_tx_can_proceed(nn, &sc, &error));
905 if (!ret && error)
906 ret = error;
907 if (ret)
908 goto out;
909
910 veclen = caller_veclen + 1;
911 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
912 if (vec == NULL) {
913 mlog(0, "failed to %zu element kvec!\n", veclen);
914 ret = -ENOMEM;
915 goto out;
916 }
917
918 msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC);
919 if (!msg) {
920 mlog(0, "failed to allocate a o2net_msg!\n");
921 ret = -ENOMEM;
922 goto out;
923 }
924
925 o2net_init_msg(msg, caller_bytes, msg_type, key);
926
927 vec[0].iov_len = sizeof(struct o2net_msg);
928 vec[0].iov_base = msg;
929 memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec));
930
931 ret = o2net_prep_nsw(nn, &nsw);
932 if (ret)
933 goto out;
934
935 msg->msg_num = cpu_to_be32(nsw.ns_id);
936
937 /* finally, convert the message header to network byte-order
938 * and send */
939 ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen,
940 sizeof(struct o2net_msg) + caller_bytes);
941 msglog(msg, "sending returned %d\n", ret);
942 if (ret < 0) {
943 mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret);
944 goto out;
945 }
946
947 /* wait on other node's handler */
948 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
949
950 /* Note that we avoid overwriting the caller's status return
951 * variable if a system error was reported on the other
952 * side. Callers beware. */
953 ret = o2net_sys_err_to_errno(nsw.ns_sys_status);
954 if (status && !ret)
955 *status = nsw.ns_status;
956
957 mlog(0, "woken, returning system status %d, user status %d\n",
958 ret, nsw.ns_status);
959out:
960 if (sc)
961 sc_put(sc);
962 if (vec)
963 kfree(vec);
964 if (msg)
965 kfree(msg);
966 o2net_complete_nsw(nn, &nsw, 0, 0, 0);
967 return ret;
968}
969EXPORT_SYMBOL_GPL(o2net_send_message_vec);
970
971int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
972 u8 target_node, int *status)
973{
974 struct kvec vec = {
975 .iov_base = data,
976 .iov_len = len,
977 };
978 return o2net_send_message_vec(msg_type, key, &vec, 1,
979 target_node, status);
980}
981EXPORT_SYMBOL_GPL(o2net_send_message);
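/*
 * Usage sketch (editor's illustration; the message type, key and payload
 * struct are hypothetical):
 *
 *	struct my_ping { __be32 seq; } ping = { .seq = cpu_to_be32(1) };
 *	int status = 0;
 *	int ret = o2net_send_message(MY_MSG_PING, MY_KEY, &ping,
 *				     sizeof(ping), target_node, &status);
 *
 * ret carries local and o2net system errors; when ret == 0, status holds
 * the value the remote handler returned.
 */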
982
983static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr,
984 enum o2net_system_error syserr, int err)
985{
986 struct kvec vec = {
987 .iov_base = hdr,
988 .iov_len = sizeof(struct o2net_msg),
989 };
990
991 BUG_ON(syserr >= O2NET_ERR_MAX);
992
993 /* leave other fields intact from the incoming message, msg_num
994 * in particular */
995 hdr->sys_status = cpu_to_be32(syserr);
996 hdr->status = cpu_to_be32(err);
997 hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); /* twiddle the magic */
998 hdr->data_len = 0;
999
1000 msglog(hdr, "about to send status magic %d\n", err);
1001 /* hdr has been in host byteorder this whole time */
1002 return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg));
1003}
1004
1005/* this returns -errno if the header was unknown or too large, etc.
1006 * after this is called the buffer is reused for the next message */
1007static int o2net_process_message(struct o2net_sock_container *sc,
1008 struct o2net_msg *hdr)
1009{
1010 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1011 int ret = 0, handler_status;
1012 enum o2net_system_error syserr;
1013 struct o2net_msg_handler *nmh = NULL;
1014
1015 msglog(hdr, "processing message\n");
1016
1017 o2net_sc_postpone_idle(sc);
1018
1019 switch(be16_to_cpu(hdr->magic)) {
1020 case O2NET_MSG_STATUS_MAGIC:
1021 /* special type for returning message status */
1022 o2net_complete_nsw(nn, NULL,
1023 be32_to_cpu(hdr->msg_num),
1024 be32_to_cpu(hdr->sys_status),
1025 be32_to_cpu(hdr->status));
1026 goto out;
1027 case O2NET_MSG_KEEP_REQ_MAGIC:
1028 o2net_sendpage(sc, o2net_keep_resp,
1029 sizeof(*o2net_keep_resp));
1030 goto out;
1031 case O2NET_MSG_KEEP_RESP_MAGIC:
1032 goto out;
1033 case O2NET_MSG_MAGIC:
1034 break;
1035 default:
1036 msglog(hdr, "bad magic\n");
1037 ret = -EINVAL;
1038 goto out;
1040 }
1041
1042 /* find a handler for it */
1043 handler_status = 0;
1044 nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type),
1045 be32_to_cpu(hdr->key));
1046 if (!nmh) {
1047 mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
1048 be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
1049 syserr = O2NET_ERR_NO_HNDLR;
1050 goto out_respond;
1051 }
1052
1053 syserr = O2NET_ERR_NONE;
1054
1055 if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
1056 syserr = O2NET_ERR_OVERFLOW;
1057
1058 if (syserr != O2NET_ERR_NONE)
1059 goto out_respond;
1060
1061 do_gettimeofday(&sc->sc_tv_func_start);
1062 sc->sc_msg_key = be32_to_cpu(hdr->key);
1063 sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
1064 handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) +
1065 be16_to_cpu(hdr->data_len),
1066 nmh->nh_func_data);
1067 do_gettimeofday(&sc->sc_tv_func_stop);
1068
1069out_respond:
1070 /* this destroys the hdr, so don't use it after this */
1071 ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr,
1072 handler_status);
1073 hdr = NULL;
1074 mlog(0, "sending handler status %d, syserr %d returned %d\n",
1075 handler_status, syserr, ret);
1076
1077out:
1078 if (nmh)
1079 o2net_handler_put(nmh);
1080 return ret;
1081}
1082
1083static int o2net_check_handshake(struct o2net_sock_container *sc)
1084{
1085 struct o2net_handshake *hand = page_address(sc->sc_page);
1086 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1087
1088 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
1089 mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
1090 "version %llu but %llu is required, disconnecting\n",
1091 SC_NODEF_ARGS(sc),
1092 (unsigned long long)be64_to_cpu(hand->protocol_version),
1093 O2NET_PROTOCOL_VERSION);
1094
1095 /* don't bother reconnecting if it's the wrong version. */
1096 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1097 return -1;
1098 }
1099
1100 sc->sc_handshake_ok = 1;
1101
1102 spin_lock(&nn->nn_lock);
1103 /* set valid and queue the idle timers only if it hasn't been
1104 * shut down already */
1105 if (nn->nn_sc == sc) {
1106 o2net_sc_postpone_idle(sc);
1107 o2net_set_nn_state(nn, sc, 1, 0);
1108 }
1109 spin_unlock(&nn->nn_lock);
1110
1111 /* shift everything up as though it wasn't there */
1112 sc->sc_page_off -= sizeof(struct o2net_handshake);
1113 if (sc->sc_page_off)
1114 memmove(hand, hand + 1, sc->sc_page_off);
1115
1116 return 0;
1117}
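/*
 * Editor's sketch of the memmove above: the handshake arrives at the
 * front of the same page the rx path parses messages from, so once it
 * has been validated the bytes that followed it slide down to offset 0:
 *
 *	before: [ o2net_handshake | start of first msg ... ]
 *	after:  [ start of first msg ... ]
 */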
1118
1119/* this demuxes the queued rx bytes into header or payload bits and calls
1120 * handlers as each full message is read off the socket. it returns -error,
1121 * == 0 on eof, or > 0 for progress made. */
1122static int o2net_advance_rx(struct o2net_sock_container *sc)
1123{
1124 struct o2net_msg *hdr;
1125 int ret = 0;
1126 void *data;
1127 size_t datalen;
1128
1129 sclog(sc, "receiving\n");
1130 do_gettimeofday(&sc->sc_tv_advance_start);
1131
1132 /* do we need more header? */
1133 if (sc->sc_page_off < sizeof(struct o2net_msg)) {
1134 data = page_address(sc->sc_page) + sc->sc_page_off;
1135 datalen = sizeof(struct o2net_msg) - sc->sc_page_off;
1136 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
1137 if (ret > 0) {
1138 sc->sc_page_off += ret;
1139
1140 /* this working relies on the handshake being
1141 * smaller than the normal message header */
1142 if (sc->sc_page_off >= sizeof(struct o2net_handshake) &&
1143 !sc->sc_handshake_ok && o2net_check_handshake(sc)) {
1144 ret = -EPROTO;
1145 goto out;
1146 }
1147
1148 /* only swab incoming here.. we can
1149 * only get here once as we cross from
1150 * being under to over */
1151 if (sc->sc_page_off == sizeof(struct o2net_msg)) {
1152 hdr = page_address(sc->sc_page);
1153 if (be16_to_cpu(hdr->data_len) >
1154 O2NET_MAX_PAYLOAD_BYTES)
1155 ret = -EOVERFLOW;
1156 }
1157 }
1158 if (ret <= 0)
1159 goto out;
1160 }
1161
1162 if (sc->sc_page_off < sizeof(struct o2net_msg)) {
1163 /* oof, still don't have a header */
1164 goto out;
1165 }
1166
1167 /* this was swabbed above when we first read it */
1168 hdr = page_address(sc->sc_page);
1169
1170 msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
1171
1172 /* do we need more payload? */
1173 if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) {
1174 /* need more payload */
1175 data = page_address(sc->sc_page) + sc->sc_page_off;
1176 datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) -
1177 sc->sc_page_off;
1178 ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen);
1179 if (ret > 0)
1180 sc->sc_page_off += ret;
1181 if (ret <= 0)
1182 goto out;
1183 }
1184
1185 if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) {
1186 /* we can only get here once, the first time we read
1187 * the payload.. so set ret to progress if the handler
1188 * works out. after calling this the message is toast */
1189 ret = o2net_process_message(sc, hdr);
1190 if (ret == 0)
1191 ret = 1;
1192 sc->sc_page_off = 0;
1193 }
1194
1195out:
1196 sclog(sc, "ret = %d\n", ret);
1197 do_gettimeofday(&sc->sc_tv_advance_stop);
1198 return ret;
1199}
1200
1201/* this work func is triggered by data ready. it reads until it can read no
1202 * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing
1203 * our work the work struct will be marked and we'll be called again. */
1204static void o2net_rx_until_empty(void *arg)
1205{
1206 struct o2net_sock_container *sc = arg;
1207 int ret;
1208
1209 do {
1210 ret = o2net_advance_rx(sc);
1211 } while (ret > 0);
1212
1213 if (ret <= 0 && ret != -EAGAIN) {
1214 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1215 sclog(sc, "saw error %d, closing\n", ret);
1216 /* not permanent so read failed handshake can retry */
1217 o2net_ensure_shutdown(nn, sc, 0);
1218 }
1219
1220 sc_put(sc);
1221}
1222
1223static int o2net_set_nodelay(struct socket *sock)
1224{
1225 int ret, val = 1;
1226 mm_segment_t oldfs;
1227
1228 oldfs = get_fs();
1229 set_fs(KERNEL_DS);
1230
1231 /*
1232 * Dear unsuspecting programmer,
1233 *
1234 * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level
1235 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
1236 * silently turn into SO_DEBUG.
1237 *
1238 * Yours,
1239 * Keeper of hilariously fragile interfaces.
1240 */
1241 ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
1242 (char __user *)&val, sizeof(val));
1243
1244 set_fs(oldfs);
1245 return ret;
1246}
1247
1248/* ------------------------------------------------------------ */
1249
1250/* called when a connect completes and after a sock is accepted. the
1251 * rx path will see the response and mark the sc valid */
1252static void o2net_sc_connect_completed(void *arg)
1253{
1254 struct o2net_sock_container *sc = arg;
1255
1256 mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
1257 (unsigned long long)O2NET_PROTOCOL_VERSION,
1258 (unsigned long long)be64_to_cpu(o2net_hand->connector_id));
1259
1260 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
1261 sc_put(sc);
1262}
1263
1264/* this is called as a work_struct func. */
1265static void o2net_sc_send_keep_req(void *arg)
1266{
1267 struct o2net_sock_container *sc = arg;
1268
1269 o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
1270 sc_put(sc);
1271}
1272
1273/* socket shutdown does a del_timer_sync against this as it tears down.
1274 * we can't start this timer until we've got to the point in sc buildup
1275 * where shutdown is going to be involved */
1276static void o2net_idle_timer(unsigned long data)
1277{
1278 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1279 struct timeval now;
1280
1281 do_gettimeofday(&now);
1282
1283 mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 "
1284 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
1285 mlog(ML_NOTICE, "here are some times that might help debug the "
1286 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1287 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
1288 sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec,
1289 now.tv_sec, now.tv_usec,
1290 sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec,
1291 sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec,
1292 sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec,
1293 sc->sc_msg_key, sc->sc_msg_type,
1294 sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec,
1295 sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec);
1296
1297 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1298}
1299
1300static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
1301{
1302 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1303 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1304 O2NET_KEEPALIVE_DELAY_SECS * HZ);
1305 do_gettimeofday(&sc->sc_tv_timer);
1306 mod_timer(&sc->sc_idle_timeout,
1307 jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ));
1308}
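/*
 * Editor's illustration of the two timers above, using the constants from
 * tcp_internal.h: every received message rearms both, so a quiet but live
 * peer is pinged well before it can be declared idle:
 *
 *	t=0	message processed, both timers reset
 *	t=5s	keepalive request sent (O2NET_KEEPALIVE_DELAY_SECS)
 *	t=10s	idle timer fires, sc shut down (O2NET_IDLE_TIMEOUT_SECS)
 */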
1309
1310/* this work func is kicked whenever a path sets the nn state which doesn't
1311 * have valid set. This includes seeing hb come up, losing a connection,
1312 * having a connect attempt fail, etc. This centralizes the logic which decides
1313 * if a connect attempt should be made or if we should give up and all future
1314 * transmit attempts should fail */
1315static void o2net_start_connect(void *arg)
1316{
1317 struct o2net_node *nn = arg;
1318 struct o2net_sock_container *sc = NULL;
1319 struct o2nm_node *node = NULL;
1320 struct socket *sock = NULL;
1321 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1322 int ret = 0;
1323
1324 /* if we're greater we initiate tx, otherwise we accept */
1325 if (o2nm_this_node() <= o2net_num_from_nn(nn))
1326 goto out;
1327
1328 /* watch for racing with tearing a node down */
1329 node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
1330 if (node == NULL) {
1331 ret = 0;
1332 goto out;
1333 }
1334
1335 spin_lock(&nn->nn_lock);
1336 /* see if we already have one pending or have given up */
1337 if (nn->nn_sc || nn->nn_persistent_error)
1338 arg = NULL;
1339 spin_unlock(&nn->nn_lock);
1340 if (arg == NULL) /* *shrug*, needed some indicator */
1341 goto out;
1342
1343 nn->nn_last_connect_attempt = jiffies;
1344
1345 sc = sc_alloc(node);
1346 if (sc == NULL) {
1347 mlog(0, "couldn't allocate sc\n");
1348 ret = -ENOMEM;
1349 goto out;
1350 }
1351
1352 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1353 if (ret < 0) {
1354 mlog(0, "can't create socket: %d\n", ret);
1355 goto out;
1356 }
1357 sc->sc_sock = sock; /* freed by sc_kref_release */
1358
1359 sock->sk->sk_allocation = GFP_ATOMIC;
1360
1361 myaddr.sin_family = AF_INET;
1362 myaddr.sin_port = (__force u16)htons(0); /* any port */
1363
1364 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
1365 sizeof(myaddr));
1366 if (ret) {
1367 mlog(0, "bind failed: %d\n", ret);
1368 goto out;
1369 }
1370
1371 ret = o2net_set_nodelay(sc->sc_sock);
1372 if (ret) {
1373 mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
1374 goto out;
1375 }
1376
1377 o2net_register_callbacks(sc->sc_sock->sk, sc);
1378
1379 spin_lock(&nn->nn_lock);
1380 /* handshake completion will set nn->nn_sc_valid */
1381 o2net_set_nn_state(nn, sc, 0, 0);
1382 spin_unlock(&nn->nn_lock);
1383
1384 remoteaddr.sin_family = AF_INET;
1385 remoteaddr.sin_addr.s_addr = (__force u32)node->nd_ipv4_address;
1386 remoteaddr.sin_port = (__force u16)node->nd_ipv4_port;
1387
1388 ret = sc->sc_sock->ops->connect(sc->sc_sock,
1389 (struct sockaddr *)&remoteaddr,
1390 sizeof(remoteaddr),
1391 O_NONBLOCK);
1392 if (ret == -EINPROGRESS)
1393 ret = 0;
1394
1395out:
1396 if (ret) {
1397 mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
1398 "with errno %d\n", SC_NODEF_ARGS(sc), ret);
1399 /* 0 err so that another will be queued and attempted
1400 * from set_nn_state */
1401 if (sc)
1402 o2net_ensure_shutdown(nn, sc, 0);
1403 }
1404 if (sc)
1405 sc_put(sc);
1406 if (node)
1407 o2nm_node_put(node);
1408
1409 return;
1410}
1411
1412static void o2net_connect_expired(void *arg)
1413{
1414 struct o2net_node *nn = arg;
1415
1416 spin_lock(&nn->nn_lock);
1417 if (!nn->nn_sc_valid) {
1418 mlog(ML_ERROR, "no connection established with node %u after "
1419 "%u seconds, giving up and returning errors.\n",
1420 o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS);
1421
1422 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1423 }
1424 spin_unlock(&nn->nn_lock);
1425}
1426
1427static void o2net_still_up(void *arg)
1428{
1429 struct o2net_node *nn = arg;
1430
1431 o2quo_hb_still_up(o2net_num_from_nn(nn));
1432}
1433
1434/* ------------------------------------------------------------ */
1435
1436void o2net_disconnect_node(struct o2nm_node *node)
1437{
1438 struct o2net_node *nn = o2net_nn_from_num(node->nd_num);
1439
1440 /* don't reconnect until it's heartbeating again */
1441 spin_lock(&nn->nn_lock);
1442 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1443 spin_unlock(&nn->nn_lock);
1444
1445 if (o2net_wq) {
1446 cancel_delayed_work(&nn->nn_connect_expired);
1447 cancel_delayed_work(&nn->nn_connect_work);
1448 cancel_delayed_work(&nn->nn_still_up);
1449 flush_workqueue(o2net_wq);
1450 }
1451}
1452
1453static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
1454 void *data)
1455{
1456 o2quo_hb_down(node_num);
1457
1458 if (node_num != o2nm_this_node())
1459 o2net_disconnect_node(node);
1460}
1461
1462static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1463 void *data)
1464{
1465 struct o2net_node *nn = o2net_nn_from_num(node_num);
1466
1467 o2quo_hb_up(node_num);
1468
1469 /* ensure an immediate connect attempt */
1470 nn->nn_last_connect_attempt = jiffies -
1471 (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
1472
1473 if (node_num != o2nm_this_node()) {
1474 /* heartbeat doesn't work unless a local node number is
1475 * configured and doing so brings up the o2net_wq, so we can
1476 * use it. */
1477 queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
1478 O2NET_IDLE_TIMEOUT_SECS * HZ);
1479
1480 /* believe it or not, accept and node heartbeat testing
1481 * can succeed for this node before we got here, so
1482 * only use set_nn_state to clear the persistent error
1483 * if that hasn't already happened */
1484 spin_lock(&nn->nn_lock);
1485 if (nn->nn_persistent_error)
1486 o2net_set_nn_state(nn, NULL, 0, 0);
1487 spin_unlock(&nn->nn_lock);
1488 }
1489}
1490
1491void o2net_unregister_hb_callbacks(void)
1492{
1493 int ret;
1494
1495 ret = o2hb_unregister_callback(&o2net_hb_up);
1496 if (ret < 0)
1497 mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
1498 "callback!\n", ret);
1499
1500 ret = o2hb_unregister_callback(&o2net_hb_down);
1501 if (ret < 0)
1502 mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
1503 "callback!\n", ret);
1504}
1505
1506int o2net_register_hb_callbacks(void)
1507{
1508 int ret;
1509
1510 o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB,
1511 o2net_hb_node_down_cb, NULL, O2NET_HB_PRI);
1512 o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
1513 o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
1514
1515 ret = o2hb_register_callback(&o2net_hb_up);
1516 if (ret == 0)
1517 ret = o2hb_register_callback(&o2net_hb_down);
1518
1519 if (ret)
1520 o2net_unregister_hb_callbacks();
1521
1522 return ret;
1523}
1524
1525/* ------------------------------------------------------------ */
1526
1527static int o2net_accept_one(struct socket *sock)
1528{
1529 int ret, slen;
1530 struct sockaddr_in sin;
1531 struct socket *new_sock = NULL;
1532 struct o2nm_node *node = NULL;
1533 struct o2net_sock_container *sc = NULL;
1534 struct o2net_node *nn;
1535
1536 BUG_ON(sock == NULL);
1537 ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
1538 sock->sk->sk_protocol, &new_sock);
1539 if (ret)
1540 goto out;
1541
1542 new_sock->type = sock->type;
1543 new_sock->ops = sock->ops;
1544 ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
1545 if (ret < 0)
1546 goto out;
1547
1548 new_sock->sk->sk_allocation = GFP_ATOMIC;
1549
1550 ret = o2net_set_nodelay(new_sock);
1551 if (ret) {
1552 mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
1553 goto out;
1554 }
1555
1556 slen = sizeof(sin);
1557 ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
1558 &slen, 1);
1559 if (ret < 0)
1560 goto out;
1561
1562 node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr);
1563 if (node == NULL) {
1564 mlog(ML_NOTICE, "attempt to connect from unknown node at "
1565 "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr),
1566 ntohs((__force __be16)sin.sin_port));
1567 ret = -EINVAL;
1568 goto out;
1569 }
1570
1571 if (o2nm_this_node() > node->nd_num) {
1572 mlog(ML_NOTICE, "unexpected connect attempted from a lower "
1573 "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n",
1574 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1575 ntohs((__force __be16)sin.sin_port), node->nd_num);
1576 ret = -EINVAL;
1577 goto out;
1578 }
1579
1580 /* this happens all the time when the other node sees our heartbeat
1581 * and tries to connect before we see their heartbeat */
1582 if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
1583 mlog(ML_CONN, "attempt to connect from node '%s' at "
1584 "%u.%u.%u.%u:%d but it isn't heartbeating\n",
1585 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1586 ntohs((__force __be16)sin.sin_port));
1587 ret = -EINVAL;
1588 goto out;
1589 }
1590
1591 nn = o2net_nn_from_num(node->nd_num);
1592
1593 spin_lock(&nn->nn_lock);
1594 if (nn->nn_sc)
1595 ret = -EBUSY;
1596 else
1597 ret = 0;
1598 spin_unlock(&nn->nn_lock);
1599 if (ret) {
1600 mlog(ML_NOTICE, "attempt to connect from node '%s' at "
1601 "%u.%u.%u.%u:%d but it already has an open connection\n",
1602 node->nd_name, NIPQUAD(sin.sin_addr.s_addr),
1603 ntohs((__force __be16)sin.sin_port));
1604 goto out;
1605 }
1606
1607 sc = sc_alloc(node);
1608 if (sc == NULL) {
1609 ret = -ENOMEM;
1610 goto out;
1611 }
1612
1613 sc->sc_sock = new_sock;
1614 new_sock = NULL;
1615
1616 spin_lock(&nn->nn_lock);
1617 o2net_set_nn_state(nn, sc, 0, 0);
1618 spin_unlock(&nn->nn_lock);
1619
1620 o2net_register_callbacks(sc->sc_sock->sk, sc);
1621 o2net_sc_queue_work(sc, &sc->sc_rx_work);
1622
1623 o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand));
1624
1625out:
1626 if (new_sock)
1627 sock_release(new_sock);
1628 if (node)
1629 o2nm_node_put(node);
1630 if (sc)
1631 sc_put(sc);
1632 return ret;
1633}
1634
1635static void o2net_accept_many(void *arg)
1636{
1637 struct socket *sock = arg;
1638 while (o2net_accept_one(sock) == 0)
1639 cond_resched();
1640}
1641
1642static void o2net_listen_data_ready(struct sock *sk, int bytes)
1643{
1644 void (*ready)(struct sock *sk, int bytes);
1645
1646 read_lock(&sk->sk_callback_lock);
1647 ready = sk->sk_user_data;
1648 if (ready == NULL) { /* check for teardown race */
1649 ready = sk->sk_data_ready;
1650 goto out;
1651 }
1652
1653 /* ->sk_data_ready is also called for a newly established child socket
1654 * before it has been accepted and the acceptor has set up their
1655 * data_ready. we only want to queue listen work for our listening
1656 * socket */
1657 if (sk->sk_state == TCP_LISTEN) {
1658 mlog(ML_TCP, "bytes: %d\n", bytes);
1659 queue_work(o2net_wq, &o2net_listen_work);
1660 }
1661
1662out:
1663 read_unlock(&sk->sk_callback_lock);
1664 ready(sk, bytes);
1665}
1666
1667static int o2net_open_listening_sock(__be16 port)
1668{
1669 struct socket *sock = NULL;
1670 int ret;
1671 struct sockaddr_in sin = {
1672 .sin_family = PF_INET,
1673 .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) },
1674 .sin_port = (__force u16)port,
1675 };
1676
1677 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1678 if (ret < 0) {
1679 mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
1680 goto out;
1681 }
1682
1683 sock->sk->sk_allocation = GFP_ATOMIC;
1684
1685 write_lock_bh(&sock->sk->sk_callback_lock);
1686 sock->sk->sk_user_data = sock->sk->sk_data_ready;
1687 sock->sk->sk_data_ready = o2net_listen_data_ready;
1688 write_unlock_bh(&sock->sk->sk_callback_lock);
1689
1690 o2net_listen_sock = sock;
1691 INIT_WORK(&o2net_listen_work, o2net_accept_many, sock);
1692
1693 sock->sk->sk_reuse = 1;
1694 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
1695 if (ret < 0) {
1696 mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n",
1697 ntohs(port), ret);
1698 goto out;
1699 }
1700
1701 ret = sock->ops->listen(sock, 64);
1702 if (ret < 0) {
1703 mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n",
1704 ntohs(port), ret);
1705 }
1706
1707out:
1708 if (ret) {
1709 o2net_listen_sock = NULL;
1710 if (sock)
1711 sock_release(sock);
1712 }
1713 return ret;
1714}
1715
1716/*
1717 * called from node manager when we should bring up our network listening
1718 * socket. node manager handles all the serialization to only call this
1719 * once and to match it with o2net_stop_listening(). note,
1720 * o2nm_this_node() doesn't work yet as we're being called while it
1721 * is being set up.
1722 */
1723int o2net_start_listening(struct o2nm_node *node)
1724{
1725 int ret = 0;
1726
1727 BUG_ON(o2net_wq != NULL);
1728 BUG_ON(o2net_listen_sock != NULL);
1729
1730 mlog(ML_KTHREAD, "starting o2net thread...\n");
1731 o2net_wq = create_singlethread_workqueue("o2net");
1732 if (o2net_wq == NULL) {
1733 mlog(ML_ERROR, "unable to launch o2net thread\n");
1734 return -ENOMEM; /* ? */
1735 }
1736
1737 ret = o2net_open_listening_sock(node->nd_ipv4_port);
1738 if (ret) {
1739 destroy_workqueue(o2net_wq);
1740 o2net_wq = NULL;
1741 } else
1742 o2quo_conn_up(node->nd_num);
1743
1744 return ret;
1745}
1746
1747/* again, o2nm_this_node() doesn't work here as we're involved in
1748 * tearing it down */
1749void o2net_stop_listening(struct o2nm_node *node)
1750{
1751 struct socket *sock = o2net_listen_sock;
1752 size_t i;
1753
1754 BUG_ON(o2net_wq == NULL);
1755 BUG_ON(o2net_listen_sock == NULL);
1756
1757 /* stop the listening socket from generating work */
1758 write_lock_bh(&sock->sk->sk_callback_lock);
1759 sock->sk->sk_data_ready = sock->sk->sk_user_data;
1760 sock->sk->sk_user_data = NULL;
1761 write_unlock_bh(&sock->sk->sk_callback_lock);
1762
1763 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1764 struct o2nm_node *tmp = o2nm_get_node_by_num(i);
1765 if (tmp) {
1766 o2net_disconnect_node(tmp);
1767 o2nm_node_put(tmp);
1768 }
1769 }
1770
1771 /* finish all work and tear down the work queue */
1772 mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n");
1773 destroy_workqueue(o2net_wq);
1774 o2net_wq = NULL;
1775
1776 sock_release(o2net_listen_sock);
1777 o2net_listen_sock = NULL;
1778
1779 o2quo_conn_err(node->nd_num);
1780}
1781
1782/* ------------------------------------------------------------ */
1783
1784int o2net_init(void)
1785{
1786 unsigned long i;
1787
1788 o2quo_init();
1789
1790 o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
1791 o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
1792 o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
1793 if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
1794 kfree(o2net_hand);
1795 kfree(o2net_keep_req);
1796 kfree(o2net_keep_resp);
1797 return -ENOMEM;
1798 }
1799
1800 o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
1801 o2net_hand->connector_id = cpu_to_be64(1);
1802
1803 o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC);
1804 o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC);
1805
1806 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1807 struct o2net_node *nn = o2net_nn_from_num(i);
1808
1809 spin_lock_init(&nn->nn_lock);
1810 INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
1811 INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
1812 INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
1813 /* until we see hb from a node we'll return -ENOTCONN */
1814 nn->nn_persistent_error = -ENOTCONN;
1815 init_waitqueue_head(&nn->nn_sc_wq);
1816 idr_init(&nn->nn_status_idr);
1817 INIT_LIST_HEAD(&nn->nn_status_list);
1818 }
1819
1820 return 0;
1821}
1822
1823void o2net_exit(void)
1824{
1825 o2quo_exit();
1826 kfree(o2net_hand);
1827 kfree(o2net_keep_req);
1828 kfree(o2net_keep_resp);
1829}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
new file mode 100644
index 000000000000..a6f4585501c8
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp.h
@@ -0,0 +1,113 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * tcp.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#ifndef O2CLUSTER_TCP_H
28#define O2CLUSTER_TCP_H
29
30#include <linux/socket.h>
31#ifdef __KERNEL__
32#include <net/sock.h>
33#include <linux/tcp.h>
34#else
35#include <sys/socket.h>
36#endif
37#include <linux/inet.h>
38#include <linux/in.h>
39
40struct o2net_msg
41{
42 __be16 magic;
43 __be16 data_len;
44 __be16 msg_type;
45 __be16 pad1;
46 __be32 sys_status;
47 __be32 status;
48 __be32 key;
49 __be32 msg_num;
50 __u8 buf[0];
51};
52
53typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);
54
55#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg))
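/*
 * Editor's sketch of the wire format: every transmission is the fixed
 * 24-byte o2net_msg header above followed by data_len payload bytes, all
 * multi-byte fields big-endian, capped at one 4k page in total:
 *
 *	[ o2net_msg header (24 bytes) | payload (data_len bytes, <= 4072) ]
 */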
56
57/* TODO: figure this out.... */
58static inline int o2net_link_down(int err, struct socket *sock)
59{
60 if (sock) {
61 if (sock->sk->sk_state != TCP_ESTABLISHED &&
62 sock->sk->sk_state != TCP_CLOSE_WAIT)
63 return 1;
64 }
65
66 if (err >= 0)
67 return 0;
68 switch (err) {
69 /* ????????????????????????? */
70 case -ERESTARTSYS:
71 case -EBADF:
72 /* When the server has died, an ICMP port unreachable
73 * message prompts ECONNREFUSED. */
74 case -ECONNREFUSED:
75 case -ENOTCONN:
76 case -ECONNRESET:
77 case -EPIPE:
78 return 1;
79 }
80 return 0;
81}
82
83enum {
84 O2NET_DRIVER_UNINITED,
85 O2NET_DRIVER_READY,
86};
87
88int o2net_init_tcp_sock(struct inode *inode);
89int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
90 u8 target_node, int *status);
91int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
92 size_t veclen, u8 target_node, int *status);
93int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
94 struct inode *group);
95
96int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
97 o2net_msg_handler_func *func, void *data,
98 struct list_head *unreg_list);
99void o2net_unregister_handler_list(struct list_head *list);
100
101struct o2nm_node;
102int o2net_register_hb_callbacks(void);
103void o2net_unregister_hb_callbacks(void);
104int o2net_start_listening(struct o2nm_node *node);
105void o2net_stop_listening(struct o2nm_node *node);
106void o2net_disconnect_node(struct o2nm_node *node);
107
108int o2net_init(void);
109void o2net_exit(void);
110int o2net_proc_init(struct proc_dir_entry *parent);
111void o2net_proc_exit(struct proc_dir_entry *parent);
112
113#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
new file mode 100644
index 000000000000..ff9e2e2104c2
--- /dev/null
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -0,0 +1,174 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef O2CLUSTER_TCP_INTERNAL_H
23#define O2CLUSTER_TCP_INTERNAL_H
24
25#define O2NET_MSG_MAGIC ((u16)0xfa55)
26#define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56)
27#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57)
28#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
29
30/* same as hb delay, we're waiting for another node to recognize our hb */
31#define O2NET_RECONNECT_DELAY_MS O2HB_REGION_TIMEOUT_MS
32
33/* we're delaying our quorum decision so that heartbeat will have timed
34 * out truly dead nodes by the time we come around to making decisions
35 * on their number */
36#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
37
38#define O2NET_KEEPALIVE_DELAY_SECS 5
39#define O2NET_IDLE_TIMEOUT_SECS 10
40
41/*
42 * This version number represents quite a lot, unfortunately. It not
43 * only represents the raw network message protocol on the wire but also
44 * locking semantics of the file system using the protocol. It should
45 * be somewhere else, I'm sure, but right now it isn't.
46 *
47 * New in version 2:
48 * - full 64 bit i_size in the metadata lock lvbs
49 * - introduction of "rw" lock and pushing meta/data locking down
50 */
51#define O2NET_PROTOCOL_VERSION 2ULL
52struct o2net_handshake {
53 __be64 protocol_version;
54 __be64 connector_id;
55};
56
57struct o2net_node {
58 /* this is never called from int/bh */
59 spinlock_t nn_lock;
60
61 /* set the moment an sc is allocated and a connect is started */
62 struct o2net_sock_container *nn_sc;
63 /* _valid is only set after the handshake passes and tx can happen */
64 unsigned nn_sc_valid:1;
65 /* if this is set tx just returns it */
66 int nn_persistent_error;
67
68 /* threads waiting for an sc to arrive wait on the wq for generation
69 * to increase. it is increased when a connecting socket succeeds
70 * or fails or when an accepted socket is attached. */
71 wait_queue_head_t nn_sc_wq;
72
73 struct idr nn_status_idr;
74 struct list_head nn_status_list;
75
76 /* connects are attempted from when heartbeat comes up until either hb
77 * goes down, the node is unconfigured, no connect attempts succeed
78 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
79 * is queued from set_nn_state both from hb up and from itself if a
80 * connect attempt fails and so can be self-arming. shutdown is
81 * careful to first mark the nn such that no connects will be attempted
82 * before canceling delayed connect work and flushing the queue. */
83 struct work_struct nn_connect_work;
84 unsigned long nn_last_connect_attempt;
85
86 /* this is queued as nodes come up and is canceled when a connection is
87 * established. this expiring gives up on the node and errors out
88 * transmits */
89 struct work_struct nn_connect_expired;
90
91 /* after we give up on a socket we wait a while before deciding
92 * that it is still heartbeating and that we should do some
93 * quorum work */
94 struct work_struct nn_still_up;
95};
96
97struct o2net_sock_container {
98 struct kref sc_kref;
99 /* the next two are valid for the lifetime of the sc */
100 struct socket *sc_sock;
101 struct o2nm_node *sc_node;
102
103 /* all of these sc work structs hold refs on the sc while they are
104 * queued. they should not be able to ref a freed sc. the teardown
105 * race is with o2net_wq destruction in o2net_stop_listening() */
106
107 /* rx and connect work are generated from socket callbacks. sc
108 * shutdown removes the callbacks and then flushes the work queue */
109 struct work_struct sc_rx_work;
110 struct work_struct sc_connect_work;
111 /* shutdown work is triggered in two ways. the simple way is
112 * for a code path to call ensure_shutdown, which gets a lock, removes
113 * the sc from the nn, and queues the work. in this case the
114 * work is single-shot. the work is also queued from a sock
115 * callback, though, and in this case the work will find the sc
116 * still on the nn and will call ensure_shutdown itself. this
117 * ends up triggering the shutdown work again, though nothing
118 * will be done in that second iteration. so work queue teardown
119 * has to be careful to remove the sc from the nn before waiting
120 * on the work queue so that the shutdown work doesn't remove the
121 * sc and rearm itself.
122 */
123 struct work_struct sc_shutdown_work;
124
125 struct timer_list sc_idle_timeout;
126 struct work_struct sc_keepalive_work;
127
128 unsigned sc_handshake_ok:1;
129
130 struct page *sc_page;
131 size_t sc_page_off;
132
133 /* original handlers for the sockets */
134 void (*sc_state_change)(struct sock *sk);
135 void (*sc_data_ready)(struct sock *sk, int bytes);
136
137 struct timeval sc_tv_timer;
138 struct timeval sc_tv_data_ready;
139 struct timeval sc_tv_advance_start;
140 struct timeval sc_tv_advance_stop;
141 struct timeval sc_tv_func_start;
142 struct timeval sc_tv_func_stop;
143 u32 sc_msg_key;
144 u16 sc_msg_type;
145};
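
The sc_shutdown_work comment reduces to an ordering rule for teardown. A rough sketch of that rule (the function below is illustrative, not the patch's actual shutdown path):

/* Illustrative only: detach first, then flush, so the self-arming
 * shutdown work cannot find the sc on the nn and re-queue itself. */
static void example_teardown(struct o2net_node *nn,
			     struct workqueue_struct *o2net_wq)
{
	spin_lock(&nn->nn_lock);
	nn->nn_sc = NULL;	/* no new shutdown work can target this sc */
	spin_unlock(&nn->nn_lock);
	flush_workqueue(o2net_wq);	/* pending work runs once, cannot rearm */
}
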
146
147struct o2net_msg_handler {
148 struct rb_node nh_node;
149 u32 nh_max_len;
150 u32 nh_msg_type;
151 u32 nh_key;
152 o2net_msg_handler_func *nh_func;
153 o2net_msg_handler_func *nh_func_data;
154 struct kref nh_kref;
155 struct list_head nh_unregister_item;
156};
157
158enum o2net_system_error {
159 O2NET_ERR_NONE = 0,
160 O2NET_ERR_NO_HNDLR,
161 O2NET_ERR_OVERFLOW,
162 O2NET_ERR_DIED,
163 O2NET_ERR_MAX
164};
165
166struct o2net_status_wait {
167 enum o2net_system_error ns_sys_status;
168 s32 ns_status;
169 int ns_id;
170 wait_queue_head_t ns_wq;
171 struct list_head ns_node_item;
172};
173
174#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
new file mode 100644
index 000000000000..7286c48bb30d
--- /dev/null
+++ b/fs/ocfs2/cluster/ver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "ver.h"
30
31#define CLUSTER_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
34
35void cluster_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
new file mode 100644
index 000000000000..32554c3382c2
--- /dev/null
+++ b/fs/ocfs2/cluster/ver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef O2CLUSTER_VER_H
27#define O2CLUSTER_VER_H
28
29void cluster_print_version(void);
30
31#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
new file mode 100644
index 000000000000..bd85182e97bc
--- /dev/null
+++ b/fs/ocfs2/dcache.c
@@ -0,0 +1,91 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dcache.c
5 *
6 * dentry cache handling code
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/namei.h>
30
31#define MLOG_MASK_PREFIX ML_DCACHE
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "alloc.h"
37#include "dcache.h"
38#include "file.h"
39#include "inode.h"
40
41static int ocfs2_dentry_revalidate(struct dentry *dentry,
42 struct nameidata *nd)
43{
44 struct inode *inode = dentry->d_inode;
45 int ret = 0; /* if all else fails, just return false */
46 struct ocfs2_super *osb;
47
48 mlog_entry("(0x%p, '%.*s')\n", dentry,
49 dentry->d_name.len, dentry->d_name.name);
50
51 /* Never trust a negative dentry - force a new lookup. */
52 if (inode == NULL) {
53 mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
54 dentry->d_name.name);
55 goto bail;
56 }
57
58 osb = OCFS2_SB(inode->i_sb);
59
60 BUG_ON(!osb);
61
62 if (inode != osb->root_inode) {
63 spin_lock(&OCFS2_I(inode)->ip_lock);
64 /* did we or someone else delete this inode? */
65 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
66 spin_unlock(&OCFS2_I(inode)->ip_lock);
67 mlog(0, "inode (%"MLFu64") deleted, returning false\n",
68 OCFS2_I(inode)->ip_blkno);
69 goto bail;
70 }
71 spin_unlock(&OCFS2_I(inode)->ip_lock);
72
73 if (!inode->i_nlink) {
74 mlog(0, "Inode %"MLFu64" orphaned, returning false "
75 "dir = %d\n", OCFS2_I(inode)->ip_blkno,
76 S_ISDIR(inode->i_mode));
77 goto bail;
78 }
79 }
80
81 ret = 1;
82
83bail:
84 mlog_exit(ret);
85
86 return ret;
87}
88
89struct dentry_operations ocfs2_dentry_ops = {
90 .d_revalidate = ocfs2_dentry_revalidate,
91};
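
The ops table is presumably attached to dentries at lookup time, per the usual VFS pattern of this era; a hedged sketch (example_lookup is hypothetical; ocfs2's real lookup lives in namei.c):

/* Illustrative wiring only. */
static struct dentry *example_lookup(struct inode *dir,
				     struct dentry *dentry,
				     struct nameidata *nd)
{
	dentry->d_op = &ocfs2_dentry_ops;	/* enables d_revalidate */
	return NULL;	/* stub; a real lookup resolves the name */
}
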
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
new file mode 100644
index 000000000000..90072771114b
--- /dev/null
+++ b/fs/ocfs2/dcache.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dcache.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_DCACHE_H
27#define OCFS2_DCACHE_H
28
29extern struct dentry_operations ocfs2_dentry_ops;
30
31#endif /* OCFS2_DCACHE_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
new file mode 100644
index 000000000000..856e20ae8263
--- /dev/null
+++ b/fs/ocfs2/dir.c
@@ -0,0 +1,618 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.c
5 *
6 * Creates, reads, walks and deletes directory-nodes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * Portions of this code from linux/fs/ext3/dir.c
11 *
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise Pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 *
17 * from
18 *
19 * linux/fs/minix/dir.c
20 *
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 02111-1307, USA.
37 */
38
39#include <linux/fs.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43
44#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48
49#include "alloc.h"
50#include "dir.h"
51#include "dlmglue.h"
52#include "extent_map.h"
53#include "file.h"
54#include "inode.h"
55#include "journal.h"
56#include "namei.h"
57#include "suballoc.h"
58#include "uptodate.h"
59
60#include "buffer_head_io.h"
61
62static unsigned char ocfs2_filetype_table[] = {
63 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
64};
65
66static int ocfs2_extend_dir(struct ocfs2_super *osb,
67 struct inode *dir,
68 struct buffer_head *parent_fe_bh,
69 struct buffer_head **new_de_bh);
70/*
71 * ocfs2_readdir()
72 *
73 */
74int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
75{
76 int error = 0;
77 unsigned long offset, blk;
78 int i, num, stored;
79 struct buffer_head * bh, * tmp;
80 struct ocfs2_dir_entry * de;
81 int err;
82 struct inode *inode = filp->f_dentry->d_inode;
83 struct super_block * sb = inode->i_sb;
84 int have_disk_lock = 0;
85
86 mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
87
88 stored = 0;
89 bh = NULL;
90
91 error = ocfs2_meta_lock(inode, NULL, NULL, 0);
92 if (error < 0) {
93 if (error != -ENOENT)
94 mlog_errno(error);
95 /* we haven't got any yet, so propagate the error. */
96 stored = error;
97 goto bail;
98 }
99 have_disk_lock = 1;
100
101 offset = filp->f_pos & (sb->s_blocksize - 1);
102
103 while (!error && !stored && filp->f_pos < i_size_read(inode)) {
104 blk = (filp->f_pos) >> sb->s_blocksize_bits;
105 bh = ocfs2_bread(inode, blk, &err, 0);
106 if (!bh) {
107 mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
108 "at offset %lld\n",
109 OCFS2_I(inode)->ip_blkno,
110 filp->f_pos);
111 filp->f_pos += sb->s_blocksize - offset;
112 continue;
113 }
114
115 /*
116 * Do the readahead (8k)
117 */
118 if (!offset) {
119 for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
120 i > 0; i--) {
121 tmp = ocfs2_bread(inode, ++blk, &err, 1);
122 if (tmp)
123 brelse(tmp);
124 }
125 }
126
127revalidate:
128 /* If the dir block has changed since the last call to
129 * readdir(2), then we might be pointing to an invalid
130 * dirent right now. Scan from the start of the block
131 * to make sure. */
132 if (filp->f_version != inode->i_version) {
133 for (i = 0; i < sb->s_blocksize && i < offset; ) {
134 de = (struct ocfs2_dir_entry *) (bh->b_data + i);
135 /* It's too expensive to do a full
136 * dirent test each time round this
137 * loop, but we do have to test at
138 * least that it is non-zero. A
139 * failure will be detected in the
140 * dirent test below. */
141 if (le16_to_cpu(de->rec_len) <
142 OCFS2_DIR_REC_LEN(1))
143 break;
144 i += le16_to_cpu(de->rec_len);
145 }
146 offset = i;
147 filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
148 | offset;
149 filp->f_version = inode->i_version;
150 }
151
152 while (!error && filp->f_pos < i_size_read(inode)
153 && offset < sb->s_blocksize) {
154 de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
155 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
156 /* On error, skip the f_pos to the
157 next block. */
158 filp->f_pos = (filp->f_pos |
159 (sb->s_blocksize - 1)) + 1;
160 brelse(bh);
161 goto bail;
162 }
163 offset += le16_to_cpu(de->rec_len);
164 if (le64_to_cpu(de->inode)) {
165 /* We might block in the next section
166 * if the data destination is
167 * currently swapped out. So, use a
168 * version stamp to detect whether or
169 * not the directory has been modified
170 * during the copy operation.
171 */
172 unsigned long version = filp->f_version;
173 unsigned char d_type = DT_UNKNOWN;
174
175 if (de->file_type < OCFS2_FT_MAX)
176 d_type = ocfs2_filetype_table[de->file_type];
177 error = filldir(dirent, de->name,
178 de->name_len,
179 filp->f_pos,
180 ino_from_blkno(sb, le64_to_cpu(de->inode)),
181 d_type);
182 if (error)
183 break;
184 if (version != filp->f_version)
185 goto revalidate;
186				stored++;
187 }
188 filp->f_pos += le16_to_cpu(de->rec_len);
189 }
190 offset = 0;
191 brelse(bh);
192 }
193
194 stored = 0;
195bail:
196 if (have_disk_lock)
197 ocfs2_meta_unlock(inode, 0);
198
199 mlog_exit(stored);
200
201 return stored;
202}
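
About the "8k" readahead near the top of ocfs2_readdir(): the loop count 16 >> (s_blocksize_bits - 9) is sixteen 512-byte sectors expressed in filesystem blocks, so the readahead is 8 KB regardless of block size. Checking with an assumed 4 KB block size:

/* 4 KB blocks => s_blocksize_bits == 12		*/
/* 16 >> (12 - 9) == 16 >> 3 == 2 blocks read ahead	*/
/* 2 blocks * 4096 bytes == 8192 bytes == 8 KB		*/
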
203
204/*
205 * NOTE: this should always be called with parent dir i_sem taken.
206 */
207int ocfs2_find_files_on_disk(const char *name,
208 int namelen,
209 u64 *blkno,
210 struct inode *inode,
211 struct buffer_head **dirent_bh,
212 struct ocfs2_dir_entry **dirent)
213{
214 int status = -ENOENT;
215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
216
217 mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
218 "inode=%p)\n",
219 osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
220
221 *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
222 if (!*dirent_bh || !*dirent) {
223 status = -ENOENT;
224 goto leave;
225 }
226
227 *blkno = le64_to_cpu((*dirent)->inode);
228
229 status = 0;
230leave:
231 if (status < 0) {
232 *dirent = NULL;
233 if (*dirent_bh) {
234 brelse(*dirent_bh);
235 *dirent_bh = NULL;
236 }
237 }
238
239 mlog_exit(status);
240 return status;
241}
242
243/* Check for a name within a directory.
244 *
245 * Return 0 if the name does not exist
246 * Return -EEXIST if the directory contains the name
247 *
248 * Callers should have i_sem + a cluster lock on dir
249 */
250int ocfs2_check_dir_for_entry(struct inode *dir,
251 const char *name,
252 int namelen)
253{
254 int ret;
255 struct buffer_head *dirent_bh = NULL;
256 struct ocfs2_dir_entry *dirent = NULL;
257
258 mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
259 namelen, name);
260
261 ret = -EEXIST;
262 dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
263 if (dirent_bh)
264 goto bail;
265
266 ret = 0;
267bail:
268 if (dirent_bh)
269 brelse(dirent_bh);
270
271 mlog_exit(ret);
272 return ret;
273}
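
Given the 0 / -EEXIST contract above, a caller would look roughly like this (a hypothetical sketch; the required i_sem and cluster lock are assumed to be held already):

/* Hypothetical caller, locking already satisfied per the comment. */
static int example_create_check(struct inode *dir, const char *name,
				int namelen)
{
	int status = ocfs2_check_dir_for_entry(dir, name, namelen);

	if (status == -EEXIST)
		mlog(0, "name already exists, refusing create\n");
	return status;	/* 0 means the name is free */
}
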
274
275/*
276 * routine to check that the specified directory is empty (for rmdir)
277 */
278int ocfs2_empty_dir(struct inode *inode)
279{
280 unsigned long offset;
281 struct buffer_head * bh;
282 struct ocfs2_dir_entry * de, * de1;
283 struct super_block * sb;
284 int err;
285
286 sb = inode->i_sb;
287 if ((i_size_read(inode) <
288 (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
289 !(bh = ocfs2_bread(inode, 0, &err, 0))) {
290 mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
291 "no data block\n",
292 OCFS2_I(inode)->ip_blkno);
293 return 1;
294 }
295
296 de = (struct ocfs2_dir_entry *) bh->b_data;
297 de1 = (struct ocfs2_dir_entry *)
298 ((char *)de + le16_to_cpu(de->rec_len));
299 if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
300 !le64_to_cpu(de1->inode) ||
301 strcmp(".", de->name) ||
302 strcmp("..", de1->name)) {
303 mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
304 "no `.' or `..'\n",
305 OCFS2_I(inode)->ip_blkno);
306 brelse(bh);
307 return 1;
308 }
309 offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
310 de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
311	while (offset < i_size_read(inode)) {
312 if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
313 brelse(bh);
314 bh = ocfs2_bread(inode,
315 offset >> sb->s_blocksize_bits, &err, 0);
316 if (!bh) {
317 mlog(ML_ERROR, "directory #%"MLFu64" contains "
318 "a hole at offset %lu\n",
319 OCFS2_I(inode)->ip_blkno, offset);
320 offset += sb->s_blocksize;
321 continue;
322 }
323 de = (struct ocfs2_dir_entry *) bh->b_data;
324 }
325 if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
326 brelse(bh);
327 return 1;
328 }
329 if (le64_to_cpu(de->inode)) {
330 brelse(bh);
331 return 0;
332 }
333 offset += le16_to_cpu(de->rec_len);
334 de = (struct ocfs2_dir_entry *)
335 ((char *)de + le16_to_cpu(de->rec_len));
336 }
337 brelse(bh);
338 return 1;
339}
340
341/* returns a bh of the 1st new block in the allocation. */
342int ocfs2_do_extend_dir(struct super_block *sb,
343 struct ocfs2_journal_handle *handle,
344 struct inode *dir,
345 struct buffer_head *parent_fe_bh,
346 struct ocfs2_alloc_context *data_ac,
347 struct ocfs2_alloc_context *meta_ac,
348 struct buffer_head **new_bh)
349{
350 int status;
351 int extend;
352 u64 p_blkno;
353
354 spin_lock(&OCFS2_I(dir)->ip_lock);
355 extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
356 spin_unlock(&OCFS2_I(dir)->ip_lock);
357
358 if (extend) {
359 status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
360 parent_fe_bh, handle,
361 data_ac, meta_ac, NULL);
362 BUG_ON(status == -EAGAIN);
363 if (status < 0) {
364 mlog_errno(status);
365 goto bail;
366 }
367 }
368
369 status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
370 (sb->s_blocksize_bits - 9)),
371 1, &p_blkno, NULL);
372 if (status < 0) {
373 mlog_errno(status);
374 goto bail;
375 }
376
377 *new_bh = sb_getblk(sb, p_blkno);
378 if (!*new_bh) {
379 status = -EIO;
380 mlog_errno(status);
381 goto bail;
382 }
383 status = 0;
384bail:
385 mlog_exit(status);
386 return status;
387}
388
389/* assumes you already have a cluster lock on the directory. */
390static int ocfs2_extend_dir(struct ocfs2_super *osb,
391 struct inode *dir,
392 struct buffer_head *parent_fe_bh,
393 struct buffer_head **new_de_bh)
394{
395 int status = 0;
396 int credits, num_free_extents;
397 loff_t dir_i_size;
398 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
399 struct ocfs2_alloc_context *data_ac = NULL;
400 struct ocfs2_alloc_context *meta_ac = NULL;
401 struct ocfs2_journal_handle *handle = NULL;
402 struct buffer_head *new_bh = NULL;
403 struct ocfs2_dir_entry * de;
404 struct super_block *sb = osb->sb;
405
406 mlog_entry_void();
407
408 dir_i_size = i_size_read(dir);
409 mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
410 OCFS2_I(dir)->ip_blkno, dir_i_size);
411
412 handle = ocfs2_alloc_handle(osb);
413 if (handle == NULL) {
414 status = -ENOMEM;
415 mlog_errno(status);
416 goto bail;
417 }
418
419 /* dir->i_size is always block aligned. */
420 spin_lock(&OCFS2_I(dir)->ip_lock);
421 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
422 spin_unlock(&OCFS2_I(dir)->ip_lock);
423 num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
424 if (num_free_extents < 0) {
425 status = num_free_extents;
426 mlog_errno(status);
427 goto bail;
428 }
429
430 if (!num_free_extents) {
431 status = ocfs2_reserve_new_metadata(osb, handle,
432 fe, &meta_ac);
433 if (status < 0) {
434 if (status != -ENOSPC)
435 mlog_errno(status);
436 goto bail;
437 }
438 }
439
440 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
441 if (status < 0) {
442 if (status != -ENOSPC)
443 mlog_errno(status);
444 goto bail;
445 }
446
447 credits = ocfs2_calc_extend_credits(sb, fe, 1);
448 } else {
449 spin_unlock(&OCFS2_I(dir)->ip_lock);
450 credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
451 }
452
453 handle = ocfs2_start_trans(osb, handle, credits);
454 if (IS_ERR(handle)) {
455 status = PTR_ERR(handle);
456 handle = NULL;
457 mlog_errno(status);
458 goto bail;
459 }
460
461 status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
462 data_ac, meta_ac, &new_bh);
463 if (status < 0) {
464 mlog_errno(status);
465 goto bail;
466 }
467
468 ocfs2_set_new_buffer_uptodate(dir, new_bh);
469
470 status = ocfs2_journal_access(handle, dir, new_bh,
471 OCFS2_JOURNAL_ACCESS_CREATE);
472 if (status < 0) {
473 mlog_errno(status);
474 goto bail;
475 }
476 memset(new_bh->b_data, 0, sb->s_blocksize);
477 de = (struct ocfs2_dir_entry *) new_bh->b_data;
478 de->inode = 0;
479 de->rec_len = cpu_to_le16(sb->s_blocksize);
480 status = ocfs2_journal_dirty(handle, new_bh);
481 if (status < 0) {
482 mlog_errno(status);
483 goto bail;
484 }
485
486 dir_i_size += dir->i_sb->s_blocksize;
487 i_size_write(dir, dir_i_size);
488 dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
489 status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
490 if (status < 0) {
491 mlog_errno(status);
492 goto bail;
493 }
494
495 *new_de_bh = new_bh;
496 get_bh(*new_de_bh);
497bail:
498 if (handle)
499 ocfs2_commit_trans(handle);
500
501 if (data_ac)
502 ocfs2_free_alloc_context(data_ac);
503 if (meta_ac)
504 ocfs2_free_alloc_context(meta_ac);
505
506 if (new_bh)
507 brelse(new_bh);
508
509 mlog_exit(status);
510 return status;
511}
512
513/*
514 * Search the dir for a good spot, extending it if necessary. The
515 * block containing an appropriate record is returned in ret_de_bh.
516 */
517int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
518 struct inode *dir,
519 struct buffer_head *parent_fe_bh,
520 const char *name,
521 int namelen,
522 struct buffer_head **ret_de_bh)
523{
524 unsigned long offset;
525 struct buffer_head * bh = NULL;
526 unsigned short rec_len;
527 struct ocfs2_dinode *fe;
528 struct ocfs2_dir_entry *de;
529 struct super_block *sb;
530 int status;
531
532 mlog_entry_void();
533
534 mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
535 namelen, OCFS2_I(dir)->ip_blkno);
536
537 BUG_ON(!S_ISDIR(dir->i_mode));
538 fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
539 BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
540
541 sb = dir->i_sb;
542
543 if (!namelen) {
544 status = -EINVAL;
545 mlog_errno(status);
546 goto bail;
547 }
548
549 bh = ocfs2_bread(dir, 0, &status, 0);
550 if (!bh) {
551 mlog_errno(status);
552 goto bail;
553 }
554
555 rec_len = OCFS2_DIR_REC_LEN(namelen);
556 offset = 0;
557 de = (struct ocfs2_dir_entry *) bh->b_data;
558 while (1) {
559 if ((char *)de >= sb->s_blocksize + bh->b_data) {
560 brelse(bh);
561 bh = NULL;
562
563 if (i_size_read(dir) <= offset) {
564 status = ocfs2_extend_dir(osb,
565 dir,
566 parent_fe_bh,
567 &bh);
568 if (status < 0) {
569 mlog_errno(status);
570 goto bail;
571 }
572 BUG_ON(!bh);
573 *ret_de_bh = bh;
574 get_bh(*ret_de_bh);
575 goto bail;
576 }
577 bh = ocfs2_bread(dir,
578 offset >> sb->s_blocksize_bits,
579 &status,
580 0);
581 if (!bh) {
582 mlog_errno(status);
583 goto bail;
584 }
585 /* move to next block */
586 de = (struct ocfs2_dir_entry *) bh->b_data;
587 }
588 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
589 status = -ENOENT;
590 goto bail;
591 }
592 if (ocfs2_match(namelen, name, de)) {
593 status = -EEXIST;
594 goto bail;
595 }
596 if (((le64_to_cpu(de->inode) == 0) &&
597 (le16_to_cpu(de->rec_len) >= rec_len)) ||
598 (le16_to_cpu(de->rec_len) >=
599 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
600 /* Ok, we found a spot. Return this bh and let
601 * the caller actually fill it in. */
602 *ret_de_bh = bh;
603 get_bh(*ret_de_bh);
604 status = 0;
605 goto bail;
606 }
607 offset += le16_to_cpu(de->rec_len);
608 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
609 }
610
611 status = 0;
612bail:
613 if (bh)
614 brelse(bh);
615
616 mlog_exit(status);
617 return status;
618}
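
The fit test near the bottom of the loop follows the classic ext2-style rule: an unused record (inode == 0) just needs rec_len >= OCFS2_DIR_REC_LEN(namelen), while a live record can be split when it has that much slack beyond its own OCFS2_DIR_REC_LEN(de->name_len). A numeric check, assuming the usual 12-byte header and 4-byte alignment for the macro (the macro itself is not defined in this hunk):

/* Assumed: OCFS2_DIR_REC_LEN(len) == ((12 + len + 3) & ~3)	*/
/* A live entry "foo" needs OCFS2_DIR_REC_LEN(3) == 16 bytes;	*/
/* if its record spans 40 bytes, a new 5-character name		*/
/* (OCFS2_DIR_REC_LEN(5) == 20) fits, since 40 >= 16 + 20.	*/
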
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
new file mode 100644
index 000000000000..5f614ec9649c
--- /dev/null
+++ b/fs/ocfs2/dir.h
@@ -0,0 +1,54 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dir.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_DIR_H
27#define OCFS2_DIR_H
28
29int ocfs2_check_dir_for_entry(struct inode *dir,
30 const char *name,
31 int namelen);
32int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */
33int ocfs2_find_files_on_disk(const char *name,
34 int namelen,
35 u64 *blkno,
36 struct inode *inode,
37 struct buffer_head **dirent_bh,
38 struct ocfs2_dir_entry **dirent);
39int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
40int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
41 struct inode *dir,
42 struct buffer_head *parent_fe_bh,
43 const char *name,
44 int namelen,
45 struct buffer_head **ret_de_bh);
46struct ocfs2_alloc_context;
47int ocfs2_do_extend_dir(struct super_block *sb,
48 struct ocfs2_journal_handle *handle,
49 struct inode *dir,
50 struct buffer_head *parent_fe_bh,
51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac,
53 struct buffer_head **new_bh);
54#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
new file mode 100644
index 000000000000..ce3f7c29d270
--- /dev/null
+++ b/fs/ocfs2/dlm/Makefile
@@ -0,0 +1,8 @@
1EXTRA_CFLAGS += -Ifs/ocfs2
2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
7
8ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h
new file mode 100644
index 000000000000..53652f51c0e1
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmapi.h
@@ -0,0 +1,214 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmapi.h
5 *
6 * externally exported dlm interfaces
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifndef DLMAPI_H
28#define DLMAPI_H
29
30struct dlm_lock;
31struct dlm_ctxt;
32
33/* NOTE: changes made to this enum should be reflected in dlmdebug.c */
34enum dlm_status {
35 DLM_NORMAL = 0, /* 0: request in progress */
36 DLM_GRANTED, /* 1: request granted */
37 DLM_DENIED, /* 2: request denied */
38 DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */
39 DLM_WORKING, /* 4: async request in progress */
40 DLM_BLOCKED, /* 5: lock request blocked */
41 DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by an orphan lock */
42 DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */
43 DLM_SYSERR, /* 8: system error */
44 DLM_NOSUPPORT, /* 9: unsupported */
45 DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */
46 DLM_IVLOCKID, /* 11: bad lockid */
47 DLM_SYNC, /* 12: synchronous request granted */
48 DLM_BADTYPE, /* 13: bad resource type */
49 DLM_BADRESOURCE, /* 14: bad resource handle */
50 DLM_MAXHANDLES, /* 15: no more resource handles */
51 DLM_NOCLINFO, /* 16: can't contact cluster manager */
52 DLM_NOLOCKMGR, /* 17: can't contact lock manager */
53 DLM_NOPURGED, /* 18: can't contact purge daemon */
54 DLM_BADARGS, /* 19: bad api args */
55 DLM_VOID, /* 20: no status */
56 DLM_NOTQUEUED, /* 21: NOQUEUE was specified and request failed */
57 DLM_IVBUFLEN, /* 22: invalid resource name length */
58 DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */
59 DLM_BADPARAM, /* 24: invalid lock mode specified */
60 DLM_VALNOTVALID, /* 25: value block has been invalidated */
61 DLM_REJECTED, /* 26: request rejected, unrecognized client */
62 DLM_ABORT, /* 27: blocked lock request cancelled */
63 DLM_CANCEL, /* 28: conversion request cancelled */
64 DLM_IVRESHANDLE, /* 29: invalid resource handle */
65 DLM_DEADLOCK, /* 30: deadlock recovery refused this request */
66 DLM_DENIED_NOASTS, /* 31: failed to allocate AST */
67 DLM_FORWARD, /* 32: request must wait for primary's response */
68 DLM_TIMEOUT, /* 33: timeout value for lock has expired */
69 DLM_IVGROUPID, /* 34: invalid group specification */
70 DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */
71 DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */
72 DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient perms for device */
73 DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */
74
75 DLM_RECOVERING, /* 39: extension, allows caller to fail a lock
76 request if it is being recovered */
77 DLM_MIGRATING, /* 40: extension, allows caller to fail a lock
78 request if it is being migrated */
79 DLM_MAXSTATS, /* 41: upper limit for return code validation */
80};
81
82/* for pretty-printing dlm_status error messages */
83const char *dlm_errmsg(enum dlm_status err);
84/* for pretty-printing dlm_status error names */
85const char *dlm_errname(enum dlm_status err);
86
87/* Eventually the DLM will use standard errno values, but in the
88 * meantime this lets us track dlm errors as they bubble up. When we
89 * bring its error reporting into line with the rest of the stack,
90 * these can just be replaced with calls to mlog_errno. */
91#define dlm_error(st) do { \
92 if ((st) != DLM_RECOVERING && \
93 (st) != DLM_MIGRATING && \
94 (st) != DLM_FORWARD) \
95 mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \
96} while (0)
97
98#define DLM_LKSB_UNUSED1 0x01
99#define DLM_LKSB_PUT_LVB 0x02
100#define DLM_LKSB_GET_LVB 0x04
101#define DLM_LKSB_UNUSED2 0x08
102#define DLM_LKSB_UNUSED3 0x10
103#define DLM_LKSB_UNUSED4 0x20
104#define DLM_LKSB_UNUSED5 0x40
105#define DLM_LKSB_UNUSED6 0x80
106
107#define DLM_LVB_LEN 64
108
109/* Callers are only allowed access to the lvb and status members of
110 * this struct. */
111struct dlm_lockstatus {
112 enum dlm_status status;
113 u32 flags;
114 struct dlm_lock *lockid;
115 char lvb[DLM_LVB_LEN];
116};
117
118/* Valid lock modes. */
119#define LKM_IVMODE (-1) /* invalid mode */
120#define LKM_NLMODE 0 /* null lock */
121#define LKM_CRMODE 1 /* concurrent read unsupported */
122#define LKM_CWMODE 2 /* concurrent write unsupported */
123#define LKM_PRMODE 3 /* protected read */
124#define LKM_PWMODE 4 /* protected write unsupported */
125#define LKM_EXMODE 5 /* exclusive */
126#define LKM_MAXMODE 5
127#define LKM_MODEMASK 0xff
128
129/* Flags passed to dlmlock and dlmunlock:
130 * reserved: flags used by the "real" dlm
131 * only a few are supported by this dlm
132 * (U) = unsupported by ocfs2 dlm */
133#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */
134#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */
135#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */
136#define LKM_LOCAL 0x00000080 /* local lock request */
137#define LKM_VALBLK 0x00000100 /* lock value block request */
138#define LKM_NOQUEUE 0x00000200 /* non blocking request */
139#define LKM_CONVERT 0x00000400 /* conversion request */
140#define LKM_NODLCKWT 0x00000800 /* this lock won't deadlock (U) */
141#define LKM_UNLOCK 0x00001000 /* deallocate this lock */
142#define LKM_CANCEL 0x00002000 /* cancel conversion request */
143#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */
144#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */
145#define LKM_SYNCSTS 0x00010000 /* return synchronous status if poss (U) */
146#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */
147#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */
148#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */
149#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */
150#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */
151#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */
152#define LKM_FORCE 0x00800000 /* force unlock flag */
153#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate
154 lock value block (U) */
155/* unused */
156#define LKM_UNUSED1 0x00000001 /* unused */
157#define LKM_UNUSED2 0x00000002 /* unused */
158#define LKM_UNUSED3 0x00000004 /* unused */
159#define LKM_UNUSED4 0x00000008 /* unused */
160#define LKM_UNUSED5 0x02000000 /* unused */
161#define LKM_UNUSED6 0x04000000 /* unused */
162#define LKM_UNUSED7 0x08000000 /* unused */
163
164/* ocfs2 extensions: internal only
165 * should never be used by caller */
166#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated
167 to another node */
168#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed and
169 should be applied to lockres */
170#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied
171 from lockres when lock is granted */
172#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock
173 used to avoid recovery rwsem */
174
175
176typedef void (dlm_astlockfunc_t)(void *);
177typedef void (dlm_bastlockfunc_t)(void *, int);
178typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
179
180enum dlm_status dlmlock(struct dlm_ctxt *dlm,
181 int mode,
182 struct dlm_lockstatus *lksb,
183 int flags,
184 const char *name,
185 dlm_astlockfunc_t *ast,
186 void *data,
187 dlm_bastlockfunc_t *bast);
188
189enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
190 struct dlm_lockstatus *lksb,
191 int flags,
192 dlm_astunlockfunc_t *unlockast,
193 void *data);
194
195struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
196
197void dlm_unregister_domain(struct dlm_ctxt *dlm);
198
199void dlm_print_one_lock(struct dlm_lock *lockid);
200
201typedef void (dlm_eviction_func)(int, void *);
202struct dlm_eviction_cb {
203 struct list_head ec_item;
204 dlm_eviction_func *ec_func;
205 void *ec_data;
206};
207void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
208 dlm_eviction_func *f,
209 void *data);
210void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
211 struct dlm_eviction_cb *cb);
212void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
213
214#endif /* DLMAPI_H */
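
A hedged sketch of how a client of this header might take and drop a lock (the domain name, key, lock name, and callbacks are invented for illustration; error paths are trimmed):

/* Hypothetical dlmapi usage. */
static struct dlm_lockstatus example_lksb;

static void example_ast(void *astdata)
{
	/* lock granted; example_lksb.status now holds the result */
}

static void example_bast(void *astdata, int blocked_type)
{
	/* another node wants a conflicting mode; release soon */
}

static void example(void)
{
	struct dlm_ctxt *dlm;
	enum dlm_status st;

	dlm = dlm_register_domain("exampledomain", 0x12345678);
	if (!dlm)
		return;	/* sketch: assume NULL means failure */

	st = dlmlock(dlm, LKM_EXMODE, &example_lksb, LKM_NOQUEUE,
		     "examplelock", example_ast, NULL, example_bast);
	if (st != DLM_NORMAL)
		dlm_error(st);

	dlm_unregister_domain(dlm);
}
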
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
new file mode 100644
index 000000000000..8d17d28ef91c
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -0,0 +1,466 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmast.c
5 *
6 * AST and BAST functionality for local and remote nodes
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46#include "cluster/endian.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50
51#define MLOG_MASK_PREFIX ML_DLM
52#include "cluster/masklog.h"
53
54static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
55 struct dlm_lock *lock);
56static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
57
58/* Should be called as an ast gets queued to see if the new
59 * lock level will obsolete a pending bast.
60 * For example, if dlm_thread queued a bast for an EX lock that
61 * was blocking another EX, but before sending the bast the
62 * lock owner downconverted to NL, the bast is now obsolete.
63 * Only the ast should be sent.
64 * This is needed because the lock and convert paths can queue
65 * asts out-of-band (not waiting for dlm_thread) in order to
66 * allow for LKM_NOQUEUE to get immediate responses. */
67static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
68{
69 assert_spin_locked(&dlm->ast_lock);
70 assert_spin_locked(&lock->spinlock);
71
72 if (lock->ml.highest_blocked == LKM_IVMODE)
73 return 0;
74 BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
75
76 if (lock->bast_pending &&
77 list_empty(&lock->bast_list))
78 /* old bast already sent, ok */
79 return 0;
80
81 if (lock->ml.type == LKM_EXMODE)
82 /* EX blocks anything left, any bast still valid */
83 return 0;
84 else if (lock->ml.type == LKM_NLMODE)
85 /* NL blocks nothing, no reason to send any bast, cancel it */
86 return 1;
87 else if (lock->ml.highest_blocked != LKM_EXMODE)
88 /* PR only blocks EX */
89 return 1;
90
91 return 0;
92}
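
The three branches encode the compatibility matrix for the modes this dlm grants: NL conflicts with nothing, PR conflicts only with EX, and EX conflicts with everything except NL. The same matrix as a standalone helper (modes_compatible is not a symbol in this patch):

/* Illustrative only: nonzero if modes a and b can be held together. */
static int modes_compatible(int a, int b)
{
	if (a == LKM_NLMODE || b == LKM_NLMODE)
		return 1;	/* NL blocks nothing */
	if (a == LKM_PRMODE && b == LKM_PRMODE)
		return 1;	/* concurrent readers coexist */
	return 0;	/* any pairing involving EX conflicts */
}
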
93
94static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
95{
96 mlog_entry_void();
97
98 BUG_ON(!dlm);
99 BUG_ON(!lock);
100
101 assert_spin_locked(&dlm->ast_lock);
102 if (!list_empty(&lock->ast_list)) {
103 mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
104 lock->ast_pending, lock->ml.type);
105 BUG();
106 }
107 BUG_ON(!list_empty(&lock->ast_list));
108 if (lock->ast_pending)
109 mlog(0, "lock has an ast getting flushed right now\n");
110
111 /* putting lock on list, add a ref */
112 dlm_lock_get(lock);
113 spin_lock(&lock->spinlock);
114
115 /* check to see if this ast obsoletes the bast */
116 if (dlm_should_cancel_bast(dlm, lock)) {
117 struct dlm_lock_resource *res = lock->lockres;
118 mlog(0, "%s: cancelling bast for %.*s\n",
119 dlm->name, res->lockname.len, res->lockname.name);
120 lock->bast_pending = 0;
121 list_del_init(&lock->bast_list);
122 lock->ml.highest_blocked = LKM_IVMODE;
123 /* removing lock from list, remove a ref. guaranteed
124 * this won't be the last ref because of the get above,
125 * so res->spinlock will not be taken here */
126 dlm_lock_put(lock);
127 /* free up the reserved bast that we are cancelling.
128 * guaranteed that this will not be the last reserved
129 * ast because *both* an ast and a bast were reserved
130 * to get to this point. the res->spinlock will not be
131 * taken here */
132 dlm_lockres_release_ast(dlm, res);
133 }
134 list_add_tail(&lock->ast_list, &dlm->pending_asts);
135 lock->ast_pending = 1;
136 spin_unlock(&lock->spinlock);
137}
138
139void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
140{
141 mlog_entry_void();
142
143 BUG_ON(!dlm);
144 BUG_ON(!lock);
145
146 spin_lock(&dlm->ast_lock);
147 __dlm_queue_ast(dlm, lock);
148 spin_unlock(&dlm->ast_lock);
149}
150
151
152static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
153{
154 mlog_entry_void();
155
156 BUG_ON(!dlm);
157 BUG_ON(!lock);
158 assert_spin_locked(&dlm->ast_lock);
159
160 BUG_ON(!list_empty(&lock->bast_list));
161 if (lock->bast_pending)
162 mlog(0, "lock has a bast getting flushed right now\n");
163
164 /* putting lock on list, add a ref */
165 dlm_lock_get(lock);
166 spin_lock(&lock->spinlock);
167 list_add_tail(&lock->bast_list, &dlm->pending_basts);
168 lock->bast_pending = 1;
169 spin_unlock(&lock->spinlock);
170}
171
172void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
173{
174 mlog_entry_void();
175
176 BUG_ON(!dlm);
177 BUG_ON(!lock);
178
179 spin_lock(&dlm->ast_lock);
180 __dlm_queue_bast(dlm, lock);
181 spin_unlock(&dlm->ast_lock);
182}
183
184static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
185 struct dlm_lock *lock)
186{
187 struct dlm_lockstatus *lksb = lock->lksb;
188 BUG_ON(!lksb);
189
190 /* only updates if this node masters the lockres */
191 if (res->owner == dlm->node_num) {
192
193 spin_lock(&res->spinlock);
194 /* check the lksb flags for the direction */
195 if (lksb->flags & DLM_LKSB_GET_LVB) {
196 mlog(0, "getting lvb from lockres for %s node\n",
197 lock->ml.node == dlm->node_num ? "master" :
198 "remote");
199 memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
200 } else if (lksb->flags & DLM_LKSB_PUT_LVB) {
201 mlog(0, "setting lvb from lockres for %s node\n",
202 lock->ml.node == dlm->node_num ? "master" :
203 "remote");
204 memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
205 }
206 spin_unlock(&res->spinlock);
207 }
208
209 /* reset any lvb flags on the lksb */
210 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
211}
212
213void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
214 struct dlm_lock *lock)
215{
216 dlm_astlockfunc_t *fn;
217 struct dlm_lockstatus *lksb;
218
219 mlog_entry_void();
220
221 lksb = lock->lksb;
222 fn = lock->ast;
223 BUG_ON(lock->ml.node != dlm->node_num);
224
225 dlm_update_lvb(dlm, res, lock);
226 (*fn)(lock->astdata);
227}
228
229
230int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
231 struct dlm_lock *lock)
232{
233 int ret;
234 struct dlm_lockstatus *lksb;
235 int lksbflags;
236
237 mlog_entry_void();
238
239 lksb = lock->lksb;
240 BUG_ON(lock->ml.node == dlm->node_num);
241
242 lksbflags = lksb->flags;
243 dlm_update_lvb(dlm, res, lock);
244
245 /* lock request came from another node
246 * go do the ast over there */
247 ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
248 return ret;
249}
250
251void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
252 struct dlm_lock *lock, int blocked_type)
253{
254 dlm_bastlockfunc_t *fn = lock->bast;
255
256 mlog_entry_void();
257 BUG_ON(lock->ml.node != dlm->node_num);
258
259 (*fn)(lock->astdata, blocked_type);
260}
261
262
263
264int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
265{
266 int ret;
267 unsigned int locklen;
268 struct dlm_ctxt *dlm = data;
269 struct dlm_lock_resource *res = NULL;
270 struct dlm_lock *lock = NULL;
271 struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
272 char *name;
273 struct list_head *iter, *head=NULL;
274 u64 cookie;
275 u32 flags;
276
277 if (!dlm_grab(dlm)) {
278 dlm_error(DLM_REJECTED);
279 return DLM_REJECTED;
280 }
281
282 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
283 "Domain %s not fully joined!\n", dlm->name);
284
285 name = past->name;
286 locklen = past->namelen;
287 cookie = be64_to_cpu(past->cookie);
288 flags = be32_to_cpu(past->flags);
289
290 if (locklen > DLM_LOCKID_NAME_MAX) {
291 ret = DLM_IVBUFLEN;
292 mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
293 goto leave;
294 }
295
296 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
297 (LKM_PUT_LVB|LKM_GET_LVB)) {
298 mlog(ML_ERROR, "both PUT and GET lvb specified\n");
299 ret = DLM_BADARGS;
300 goto leave;
301 }
302
303 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
304 (flags & LKM_GET_LVB ? "get lvb" : "none"));
305
306 mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
307
308 if (past->type != DLM_AST &&
309 past->type != DLM_BAST) {
310 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", "
311 "name=%.*s\n", past->type, cookie, locklen, name);
312 ret = DLM_IVLOCKID;
313 goto leave;
314 }
315
316 res = dlm_lookup_lockres(dlm, name, locklen);
317 if (!res) {
318 mlog(ML_ERROR, "got %sast for unknown lockres! "
319 "cookie=%"MLFu64", name=%.*s, namelen=%u\n",
320 past->type == DLM_AST ? "" : "b",
321 cookie, locklen, name, locklen);
322 ret = DLM_IVLOCKID;
323 goto leave;
324 }
325
326 /* cannot get a proxy ast message if this node owns it */
327 BUG_ON(res->owner == dlm->node_num);
328
329 mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
330
331 spin_lock(&res->spinlock);
332 if (res->state & DLM_LOCK_RES_RECOVERING) {
333 mlog(0, "responding with DLM_RECOVERING!\n");
334 ret = DLM_RECOVERING;
335 goto unlock_out;
336 }
337 if (res->state & DLM_LOCK_RES_MIGRATING) {
338 mlog(0, "responding with DLM_MIGRATING!\n");
339 ret = DLM_MIGRATING;
340 goto unlock_out;
341 }
342 /* try convert queue for both ast/bast */
343 head = &res->converting;
344 lock = NULL;
345 list_for_each(iter, head) {
346 lock = list_entry (iter, struct dlm_lock, list);
347 if (be64_to_cpu(lock->ml.cookie) == cookie)
348 goto do_ast;
349 }
350
351 /* if not on convert, try blocked for ast, granted for bast */
352 if (past->type == DLM_AST)
353 head = &res->blocked;
354 else
355 head = &res->granted;
356
357 list_for_each(iter, head) {
358 lock = list_entry (iter, struct dlm_lock, list);
359 if (be64_to_cpu(lock->ml.cookie) == cookie)
360 goto do_ast;
361 }
362
363 mlog(ML_ERROR, "got %sast for unknown lock! cookie=%"MLFu64", "
364 "name=%.*s, namelen=%u\n",
365 past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen);
366
367 ret = DLM_NORMAL;
368unlock_out:
369 spin_unlock(&res->spinlock);
370 goto leave;
371
372do_ast:
373 ret = DLM_NORMAL;
374 if (past->type == DLM_AST) {
375 /* do not alter lock refcount. switching lists. */
376 list_del_init(&lock->list);
377 list_add_tail(&lock->list, &res->granted);
378 mlog(0, "ast: adding to granted list... type=%d, "
379 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
380 if (lock->ml.convert_type != LKM_IVMODE) {
381 lock->ml.type = lock->ml.convert_type;
382 lock->ml.convert_type = LKM_IVMODE;
383 } else {
384 // should already be there....
385 }
386
387 lock->lksb->status = DLM_NORMAL;
388
389 /* if we requested the lvb, fetch it into our lksb now */
390 if (flags & LKM_GET_LVB) {
391 BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
392 memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
393 }
394 }
395 spin_unlock(&res->spinlock);
396
397 if (past->type == DLM_AST)
398 dlm_do_local_ast(dlm, res, lock);
399 else
400 dlm_do_local_bast(dlm, res, lock, past->blocked_type);
401
402leave:
403
404 if (res)
405 dlm_lockres_put(res);
406
407 dlm_put(dlm);
408 return ret;
409}
410
411
412
413int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
414 struct dlm_lock *lock, int msg_type,
415 int blocked_type, int flags)
416{
417 int ret = 0;
418 struct dlm_proxy_ast past;
419 struct kvec vec[2];
420 size_t veclen = 1;
421 int status;
422
423 mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
424 res->lockname.len, res->lockname.name, lock->ml.node,
425 msg_type, blocked_type);
426
427 memset(&past, 0, sizeof(struct dlm_proxy_ast));
428 past.node_idx = dlm->node_num;
429 past.type = msg_type;
430 past.blocked_type = blocked_type;
431 past.namelen = res->lockname.len;
432 memcpy(past.name, res->lockname.name, past.namelen);
433 past.cookie = lock->ml.cookie;
434
435 vec[0].iov_len = sizeof(struct dlm_proxy_ast);
436 vec[0].iov_base = &past;
437 if (flags & DLM_LKSB_GET_LVB) {
438 mlog(0, "returning requested LVB data\n");
439 be32_add_cpu(&past.flags, LKM_GET_LVB);
440 vec[1].iov_len = DLM_LVB_LEN;
441 vec[1].iov_base = lock->lksb->lvb;
442 veclen++;
443 }
444
445 ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
446 lock->ml.node, &status);
447 if (ret < 0)
448 mlog_errno(ret);
449 else {
450 if (status == DLM_RECOVERING) {
451 mlog(ML_ERROR, "sent AST to node %u, it thinks this "
452 "node is dead!\n", lock->ml.node);
453 BUG();
454 } else if (status == DLM_MIGRATING) {
455 mlog(ML_ERROR, "sent AST to node %u, it returned "
456 "DLM_MIGRATING!\n", lock->ml.node);
457 BUG();
458 } else if (status != DLM_NORMAL) {
459 mlog(ML_ERROR, "AST to node %u returned %d!\n",
460 lock->ml.node, status);
461 /* ignore it */
462 }
463 ret = 0;
464 }
465 return ret;
466}
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
new file mode 100644
index 000000000000..3fecba0a6023
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -0,0 +1,884 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmcommon.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMCOMMON_H
26#define DLMCOMMON_H
27
28#include <linux/kref.h>
29
30#define DLM_HB_NODE_DOWN_PRI (0xf000000)
31#define DLM_HB_NODE_UP_PRI (0x8000000)
32
33#define DLM_LOCKID_NAME_MAX 32
34
35#define DLM_DOMAIN_NAME_MAX_LEN 255
36#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES
37#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
38#define DLM_THREAD_MS 200 // flush at least every 200 ms
39
40#define DLM_HASH_BITS 7
41#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
42#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
43
44enum dlm_ast_type {
45 DLM_AST = 0,
46 DLM_BAST,
47 DLM_ASTUNLOCK
48};
49
50
51#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
52 LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
53 LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
54
55#define DLM_RECOVERY_LOCK_NAME "$RECOVERY"
56#define DLM_RECOVERY_LOCK_NAME_LEN 9
57
58static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
59{
60 if (name_len == DLM_RECOVERY_LOCK_NAME_LEN &&
61 memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0)
62 return 1;
63 return 0;
64}
65
66#define DLM_RECO_STATE_ACTIVE 0x0001
67
68struct dlm_recovery_ctxt
69{
70 struct list_head resources;
71 struct list_head received;
72 struct list_head node_data;
73 u8 new_master;
74 u8 dead_node;
75 u16 state;
76 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
77 wait_queue_head_t event;
78};
79
80enum dlm_ctxt_state {
81 DLM_CTXT_NEW = 0,
82 DLM_CTXT_JOINED,
83 DLM_CTXT_IN_SHUTDOWN,
84 DLM_CTXT_LEAVING,
85};
86
87struct dlm_ctxt
88{
89 struct list_head list;
90 struct list_head *resources;
91 struct list_head dirty_list;
92 struct list_head purge_list;
93 struct list_head pending_asts;
94 struct list_head pending_basts;
95 unsigned int purge_count;
96 spinlock_t spinlock;
97 spinlock_t ast_lock;
98 char *name;
99 u8 node_num;
100 u32 key;
101 u8 joining_node;
102 wait_queue_head_t dlm_join_events;
103 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
104 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
105 unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
106 struct dlm_recovery_ctxt reco;
107 spinlock_t master_lock;
108 struct list_head master_list;
109 struct list_head mle_hb_events;
110
111 /* these give a really vague idea of the system load */
112 atomic_t local_resources;
113 atomic_t remote_resources;
114 atomic_t unknown_resources;
115
116 /* NOTE: Next three are protected by dlm_domain_lock */
117 struct kref dlm_refs;
118 enum dlm_ctxt_state dlm_state;
119 unsigned int num_joins;
120
121 struct o2hb_callback_func dlm_hb_up;
122 struct o2hb_callback_func dlm_hb_down;
123 struct task_struct *dlm_thread_task;
124 struct task_struct *dlm_reco_thread_task;
125 wait_queue_head_t dlm_thread_wq;
126 wait_queue_head_t dlm_reco_thread_wq;
127 wait_queue_head_t ast_wq;
128 wait_queue_head_t migration_wq;
129
130 struct work_struct dispatched_work;
131 struct list_head work_list;
132 spinlock_t work_lock;
133 struct list_head dlm_domain_handlers;
134 struct list_head dlm_eviction_callbacks;
135};
136
137/* these keventd work queue items are for less-frequently
138 * called functions that cannot be directly called from the
139 * net message handlers for some reason, usually because
140 * they need to send net messages of their own. */
141void dlm_dispatch_work(void *data);
142
143struct dlm_lock_resource;
144struct dlm_work_item;
145
146typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *);
147
148struct dlm_request_all_locks_priv
149{
150 u8 reco_master;
151 u8 dead_node;
152};
153
154struct dlm_mig_lockres_priv
155{
156 struct dlm_lock_resource *lockres;
157 u8 real_master;
158};
159
160struct dlm_assert_master_priv
161{
162 struct dlm_lock_resource *lockres;
163 u8 request_from;
164 u32 flags;
165 unsigned ignore_higher:1;
166};
167
168
169struct dlm_work_item
170{
171 struct list_head list;
172 dlm_workfunc_t *func;
173 struct dlm_ctxt *dlm;
174 void *data;
175 union {
176 struct dlm_request_all_locks_priv ral;
177 struct dlm_mig_lockres_priv ml;
178 struct dlm_assert_master_priv am;
179 } u;
180};
181
182static inline void dlm_init_work_item(struct dlm_ctxt *dlm,
183 struct dlm_work_item *i,
184 dlm_workfunc_t *f, void *data)
185{
186 memset(i, 0, sizeof(*i));
187 i->func = f;
188 INIT_LIST_HEAD(&i->list);
189 i->data = data;
190 i->dlm = dlm; /* must have already done a dlm_grab on this! */
191}
192
193
194
195static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
196 u8 node)
197{
198 assert_spin_locked(&dlm->spinlock);
199
200 dlm->joining_node = node;
201 wake_up(&dlm->dlm_join_events);
202}
203
204#define DLM_LOCK_RES_UNINITED 0x00000001
205#define DLM_LOCK_RES_RECOVERING 0x00000002
206#define DLM_LOCK_RES_READY 0x00000004
207#define DLM_LOCK_RES_DIRTY 0x00000008
208#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
209#define DLM_LOCK_RES_MIGRATING 0x00000020
210
211#define DLM_PURGE_INTERVAL_MS (8 * 1000)
212
213struct dlm_lock_resource
214{
215 /* WARNING: Please see the comment in dlm_init_lockres before
216 * adding fields here. */
217 struct list_head list;
218 struct kref refs;
219
220 /* please keep these next 3 in this order
221 * some funcs want to iterate over all lists */
222 struct list_head granted;
223 struct list_head converting;
224 struct list_head blocked;
225
226 struct list_head dirty;
227 struct list_head recovering; // dlm_recovery_ctxt.resources list
228
229 /* unused lock resources have their last_used stamped and are
230 * put on a list for the dlm thread to run. */
231 struct list_head purge;
232 unsigned long last_used;
233
234 unsigned migration_pending:1;
235 atomic_t asts_reserved;
236 spinlock_t spinlock;
237 wait_queue_head_t wq;
238 u8 owner; //node which owns the lock resource, or unknown
239 u16 state;
240 struct qstr lockname;
241 char lvb[DLM_LVB_LEN];
242};
243
244struct dlm_migratable_lock
245{
246 __be64 cookie;
247
248 /* these 3 are just padding for the in-memory structure, but
249 * list and flags are actually used when sent over the wire */
250 __be16 pad1;
251 u8 list; // 0=granted, 1=converting, 2=blocked
252 u8 flags;
253
254 s8 type;
255 s8 convert_type;
256 s8 highest_blocked;
257 u8 node;
258}; // 16 bytes
259
260struct dlm_lock
261{
262 struct dlm_migratable_lock ml;
263
264 struct list_head list;
265 struct list_head ast_list;
266 struct list_head bast_list;
267 struct dlm_lock_resource *lockres;
268 spinlock_t spinlock;
269 struct kref lock_refs;
270
271 // ast and bast must be callable while holding a spinlock!
272 dlm_astlockfunc_t *ast;
273 dlm_bastlockfunc_t *bast;
274 void *astdata;
275 struct dlm_lockstatus *lksb;
276 unsigned ast_pending:1,
277 bast_pending:1,
278 convert_pending:1,
279 lock_pending:1,
280 cancel_pending:1,
281 unlock_pending:1,
282 lksb_kernel_allocated:1;
283};
284
285
286#define DLM_LKSB_UNUSED1 0x01
287#define DLM_LKSB_PUT_LVB 0x02
288#define DLM_LKSB_GET_LVB 0x04
289#define DLM_LKSB_UNUSED2 0x08
290#define DLM_LKSB_UNUSED3 0x10
291#define DLM_LKSB_UNUSED4 0x20
292#define DLM_LKSB_UNUSED5 0x40
293#define DLM_LKSB_UNUSED6 0x80
294
295
296enum dlm_lockres_list {
297 DLM_GRANTED_LIST = 0,
298 DLM_CONVERTING_LIST,
299 DLM_BLOCKED_LIST
300};
301
302static inline struct list_head *
303dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
304{
305 struct list_head *ret = NULL;
306 if (idx == DLM_GRANTED_LIST)
307 ret = &res->granted;
308 else if (idx == DLM_CONVERTING_LIST)
309 ret = &res->converting;
310 else if (idx == DLM_BLOCKED_LIST)
311 ret = &res->blocked;
312 else
313 BUG();
314 return ret;
315}
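/* Illustrative only: because granted/converting/blocked are kept
 * consecutive in struct dlm_lock_resource (see the comment there),
 * code can walk all three queues by index, e.g.:
 *
 *	for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {
 *		queue = dlm_list_idx_to_ptr(res, i);
 *		...
 *	}
 */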
316
317
318
319
320struct dlm_node_iter
321{
322 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
323 int curnode;
324};
325
326
327enum {
328 DLM_MASTER_REQUEST_MSG = 500,
329 DLM_UNUSED_MSG1, /* 501 */
330 DLM_ASSERT_MASTER_MSG, /* 502 */
331 DLM_CREATE_LOCK_MSG, /* 503 */
332 DLM_CONVERT_LOCK_MSG, /* 504 */
333 DLM_PROXY_AST_MSG, /* 505 */
334 DLM_UNLOCK_LOCK_MSG, /* 506 */
335 DLM_UNUSED_MSG2, /* 507 */
336 DLM_MIGRATE_REQUEST_MSG, /* 508 */
337 DLM_MIG_LOCKRES_MSG, /* 509 */
338 DLM_QUERY_JOIN_MSG, /* 510 */
339 DLM_ASSERT_JOINED_MSG, /* 511 */
340 DLM_CANCEL_JOIN_MSG, /* 512 */
341 DLM_EXIT_DOMAIN_MSG, /* 513 */
342 DLM_MASTER_REQUERY_MSG, /* 514 */
343 DLM_LOCK_REQUEST_MSG, /* 515 */
344 DLM_RECO_DATA_DONE_MSG, /* 516 */
345 DLM_BEGIN_RECO_MSG, /* 517 */
346 DLM_FINALIZE_RECO_MSG /* 518 */
347};
348
349struct dlm_reco_node_data
350{
351 int state;
352 u8 node_num;
353 struct list_head list;
354};
355
356enum {
357 DLM_RECO_NODE_DATA_DEAD = -1,
358 DLM_RECO_NODE_DATA_INIT = 0,
359 DLM_RECO_NODE_DATA_REQUESTING,
360 DLM_RECO_NODE_DATA_REQUESTED,
361 DLM_RECO_NODE_DATA_RECEIVING,
362 DLM_RECO_NODE_DATA_DONE,
363 DLM_RECO_NODE_DATA_FINALIZE_SENT,
364};
365
366
367enum {
368 DLM_MASTER_RESP_NO = 0,
369 DLM_MASTER_RESP_YES,
370 DLM_MASTER_RESP_MAYBE,
371 DLM_MASTER_RESP_ERROR
372};
373
374
375struct dlm_master_request
376{
377 u8 node_idx;
378 u8 namelen;
379 __be16 pad1;
380 __be32 flags;
381
382 u8 name[O2NM_MAX_NAME_LEN];
383};
384
385#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001
386#define DLM_ASSERT_MASTER_REQUERY 0x00000002
387#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
388struct dlm_assert_master
389{
390 u8 node_idx;
391 u8 namelen;
392 __be16 pad1;
393 __be32 flags;
394
395 u8 name[O2NM_MAX_NAME_LEN];
396};
397
398struct dlm_migrate_request
399{
400 u8 master;
401 u8 new_master;
402 u8 namelen;
403 u8 pad1;
404 __be32 pad2;
405 u8 name[O2NM_MAX_NAME_LEN];
406};
407
408struct dlm_master_requery
409{
410 u8 pad1;
411 u8 pad2;
412 u8 node_idx;
413 u8 namelen;
414 __be32 pad3;
415 u8 name[O2NM_MAX_NAME_LEN];
416};
417
418#define DLM_MRES_RECOVERY 0x01
419#define DLM_MRES_MIGRATION 0x02
420#define DLM_MRES_ALL_DONE 0x04
421
422/*
423 * We would like to get one whole lockres into a single network
424 * message whenever possible. Generally speaking, there will be
425 * at most one dlm_lock on a lockres for each node in the cluster,
426 * plus (infrequently) any additional locks coming in from userdlm.
427 *
428 * struct _dlm_lockres_page
429 * {
430 * dlm_migratable_lockres mres;
431 * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS];
432 * u8 pad[DLM_MIG_LOCKRES_RESERVED];
433 * };
434 *
435 * from ../cluster/tcp.h
436 * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
437 * (roughly 4080 bytes)
438 * and sizeof(dlm_migratable_lockres) = 112 bytes
439 * and sizeof(dlm_migratable_lock) = 16 bytes
440 *
441 * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and
442 * DLM_MIG_LOCKRES_RESERVED=128 means we have this:
443 *
444 * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) +
445 * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED =
446 * NET_MAX_PAYLOAD_BYTES
447 * (240 * 16) + 112 + 128 = 4080
448 *
449 * So a lockres would need more than 240 locks before it would
450 * use more than one network packet to recover. Not too bad.
451 */
452#define DLM_MAX_MIGRATABLE_LOCKS 240
453
454struct dlm_migratable_lockres
455{
456 u8 master;
457 u8 lockname_len;
458 u8 num_locks; // locks sent in this structure
459 u8 flags;
460 __be32 total_locks; // locks to be sent for this migration cookie
461 __be64 mig_cookie; // cookie for this lockres migration
462 // or zero if not needed
463 // 16 bytes
464 u8 lockname[DLM_LOCKID_NAME_MAX];
465 // 48 bytes
466 u8 lvb[DLM_LVB_LEN];
467 // 112 bytes
468 struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112
469};
470#define DLM_MIG_LOCKRES_MAX_LEN \
471 (sizeof(struct dlm_migratable_lockres) + \
472 (sizeof(struct dlm_migratable_lock) * \
473 DLM_MAX_MIGRATABLE_LOCKS) )
474
475/* from above, 128 bytes
476 * for some undetermined future use */
477#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
478 DLM_MIG_LOCKRES_MAX_LEN)
479
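/* Illustrative only: the sizing argument above relies on the two wire
 * structures having exactly the sizes quoted in the comment.  A
 * hypothetical compile-time check (not in the original) might read: */
static inline void dlm_mig_lockres_size_sanity(void)
{
	BUILD_BUG_ON(sizeof(struct dlm_migratable_lock) != 16);
	BUILD_BUG_ON(sizeof(struct dlm_migratable_lockres) != 112);
}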
480struct dlm_create_lock
481{
482 __be64 cookie;
483
484 __be32 flags;
485 u8 pad1;
486 u8 node_idx;
487 s8 requested_type;
488 u8 namelen;
489
490 u8 name[O2NM_MAX_NAME_LEN];
491};
492
493struct dlm_convert_lock
494{
495 __be64 cookie;
496
497 __be32 flags;
498 u8 pad1;
499 u8 node_idx;
500 s8 requested_type;
501 u8 namelen;
502
503 u8 name[O2NM_MAX_NAME_LEN];
504
505 s8 lvb[0];
506};
507#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
508
509struct dlm_unlock_lock
510{
511 __be64 cookie;
512
513 __be32 flags;
514 __be16 pad1;
515 u8 node_idx;
516 u8 namelen;
517
518 u8 name[O2NM_MAX_NAME_LEN];
519
520 s8 lvb[0];
521};
522#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
523
524struct dlm_proxy_ast
525{
526 __be64 cookie;
527
528 __be32 flags;
529 u8 node_idx;
530 u8 type;
531 u8 blocked_type;
532 u8 namelen;
533
534 u8 name[O2NM_MAX_NAME_LEN];
535
536 s8 lvb[0];
537};
538#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
539
540#define DLM_MOD_KEY (0x666c6172)
541enum dlm_query_join_response {
542 JOIN_DISALLOW = 0,
543 JOIN_OK,
544 JOIN_OK_NO_MAP,
545};
546
547struct dlm_lock_request
548{
549 u8 node_idx;
550 u8 dead_node;
551 __be16 pad1;
552 __be32 pad2;
553};
554
555struct dlm_reco_data_done
556{
557 u8 node_idx;
558 u8 dead_node;
559 __be16 pad1;
560 __be32 pad2;
561
562 /* unused for now */
563 /* eventually we can use this to attempt
564 * lvb recovery based on each node's info */
565 u8 reco_lvb[DLM_LVB_LEN];
566};
567
568struct dlm_begin_reco
569{
570 u8 node_idx;
571 u8 dead_node;
572 __be16 pad1;
573 __be32 pad2;
574};
575
576
577struct dlm_query_join_request
578{
579 u8 node_idx;
580 u8 pad1[2];
581 u8 name_len;
582 u8 domain[O2NM_MAX_NAME_LEN];
583};
584
585struct dlm_assert_joined
586{
587 u8 node_idx;
588 u8 pad1[2];
589 u8 name_len;
590 u8 domain[O2NM_MAX_NAME_LEN];
591};
592
593struct dlm_cancel_join
594{
595 u8 node_idx;
596 u8 pad1[2];
597 u8 name_len;
598 u8 domain[O2NM_MAX_NAME_LEN];
599};
600
601struct dlm_exit_domain
602{
603 u8 node_idx;
604 u8 pad1[3];
605};
606
607struct dlm_finalize_reco
608{
609 u8 node_idx;
610 u8 dead_node;
611 __be16 pad1;
612 __be32 pad2;
613};
614
615static inline enum dlm_status
616__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
617{
618 enum dlm_status status = DLM_NORMAL;
619
620 assert_spin_locked(&res->spinlock);
621
622 if (res->state & DLM_LOCK_RES_RECOVERING)
623 status = DLM_RECOVERING;
624 else if (res->state & DLM_LOCK_RES_MIGRATING)
625 status = DLM_MIGRATING;
626 else if (res->state & DLM_LOCK_RES_IN_PROGRESS)
627 status = DLM_FORWARD;
628
629 return status;
630}
631
632struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
633 struct dlm_lockstatus *lksb);
634void dlm_lock_get(struct dlm_lock *lock);
635void dlm_lock_put(struct dlm_lock *lock);
636
637void dlm_lock_attach_lockres(struct dlm_lock *lock,
638 struct dlm_lock_resource *res);
639
640int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
641int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
642int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
643
644void dlm_revert_pending_convert(struct dlm_lock_resource *res,
645 struct dlm_lock *lock);
646void dlm_revert_pending_lock(struct dlm_lock_resource *res,
647 struct dlm_lock *lock);
648
649int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
650void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
651 struct dlm_lock *lock);
652void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
653 struct dlm_lock *lock);
654
655int dlm_launch_thread(struct dlm_ctxt *dlm);
656void dlm_complete_thread(struct dlm_ctxt *dlm);
657int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
658void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
659void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
660
661void dlm_put(struct dlm_ctxt *dlm);
662struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
663int dlm_domain_fully_joined(struct dlm_ctxt *dlm);
664
665void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
666 struct dlm_lock_resource *res);
667void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
668 struct dlm_lock_resource *res);
669void dlm_purge_lockres(struct dlm_ctxt *dlm,
670 struct dlm_lock_resource *lockres);
671void dlm_lockres_get(struct dlm_lock_resource *res);
672void dlm_lockres_put(struct dlm_lock_resource *res);
673void __dlm_unhash_lockres(struct dlm_lock_resource *res);
674void __dlm_insert_lockres(struct dlm_ctxt *dlm,
675 struct dlm_lock_resource *res);
676struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
677 const char *name,
678 unsigned int len);
679struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
680 const char *name,
681 unsigned int len);
682
683int dlm_is_host_down(int errno);
684void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
685 struct dlm_lock_resource *res,
686 u8 owner);
687struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
688 const char *lockid,
689 int flags);
690struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
691 const char *name,
692 unsigned int namelen);
693
694void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
695void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
696void dlm_do_local_ast(struct dlm_ctxt *dlm,
697 struct dlm_lock_resource *res,
698 struct dlm_lock *lock);
699int dlm_do_remote_ast(struct dlm_ctxt *dlm,
700 struct dlm_lock_resource *res,
701 struct dlm_lock *lock);
702void dlm_do_local_bast(struct dlm_ctxt *dlm,
703 struct dlm_lock_resource *res,
704 struct dlm_lock *lock,
705 int blocked_type);
706int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm,
707 struct dlm_lock_resource *res,
708 struct dlm_lock *lock,
709 int msg_type,
710 int blocked_type, int flags);
711static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm,
712 struct dlm_lock_resource *res,
713 struct dlm_lock *lock,
714 int blocked_type)
715{
716 return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST,
717 blocked_type, 0);
718}
719
720static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
721 struct dlm_lock_resource *res,
722 struct dlm_lock *lock,
723 int flags)
724{
725 return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST,
726 0, flags);
727}
728
729void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
730void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
731
732u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
733void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
734void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
735
736
737int dlm_nm_init(struct dlm_ctxt *dlm);
738int dlm_heartbeat_init(struct dlm_ctxt *dlm);
739void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
740void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
741
742int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
743int dlm_migrate_lockres(struct dlm_ctxt *dlm,
744 struct dlm_lock_resource *res,
745 u8 target);
746int dlm_finish_migration(struct dlm_ctxt *dlm,
747 struct dlm_lock_resource *res,
748 u8 old_master);
749void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
750 struct dlm_lock_resource *res);
751void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
752
753int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
754int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
755int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
756int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
757int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
758int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
759int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
760int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
761int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
762
763int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
764 struct dlm_lock_resource *res,
765 int ignore_higher,
766 u8 request_from,
767 u32 flags);
768
769
770int dlm_send_one_lockres(struct dlm_ctxt *dlm,
771 struct dlm_lock_resource *res,
772 struct dlm_migratable_lockres *mres,
773 u8 send_to,
774 u8 flags);
775void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
776 struct dlm_lock_resource *res);
777
778/* will exit holding res->spinlock, but may drop in function */
779void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
780void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
781
782/* will exit holding res->spinlock, but may drop in function */
783static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
784{
785 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
786 DLM_LOCK_RES_RECOVERING|
787 DLM_LOCK_RES_MIGRATING));
788}
789
790
791int dlm_init_mle_cache(void);
792void dlm_destroy_mle_cache(void);
793void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
794void dlm_clean_master_list(struct dlm_ctxt *dlm,
795 u8 dead_node);
796int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
797
798
799static inline const char * dlm_lock_mode_name(int mode)
800{
801 switch (mode) {
802 case LKM_EXMODE:
803 return "EX";
804 case LKM_PRMODE:
805 return "PR";
806 case LKM_NLMODE:
807 return "NL";
808 }
809 return "UNKNOWN";
810}
811
812
813static inline int dlm_lock_compatible(int existing, int request)
814{
815 /* NO_LOCK compatible with all */
816 if (request == LKM_NLMODE ||
817 existing == LKM_NLMODE)
818 return 1;
819
820 /* EX incompatible with all non-NO_LOCK */
821 if (request == LKM_EXMODE)
822 return 0;
823
824 /* request must be PR, which is compatible with PR */
825 if (existing == LKM_PRMODE)
826 return 1;
827
828 return 0;
829}
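/* For reference, the compatibility matrix implemented above
 * (1 = compatible):
 *
 *                request
 *   existing    NL  PR  EX
 *       NL       1   1   1
 *       PR       1   1   0
 *       EX       1   0   0
 */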
830
831static inline int dlm_lock_on_list(struct list_head *head,
832 struct dlm_lock *lock)
833{
834 struct list_head *iter;
835 struct dlm_lock *tmplock;
836
837 list_for_each(iter, head) {
838 tmplock = list_entry(iter, struct dlm_lock, list);
839 if (tmplock == lock)
840 return 1;
841 }
842 return 0;
843}
844
845
846static inline enum dlm_status dlm_err_to_dlm_status(int err)
847{
848 enum dlm_status ret;
849 if (err == -ENOMEM)
850 ret = DLM_SYSERR;
851 else if (err == -ETIMEDOUT || o2net_link_down(err, NULL))
852 ret = DLM_NOLOCKMGR;
853 else if (err == -EINVAL)
854 ret = DLM_BADPARAM;
855 else if (err == -ENAMETOOLONG)
856 ret = DLM_IVBUFLEN;
857 else
858 ret = DLM_BADARGS;
859 return ret;
860}
861
862
863static inline void dlm_node_iter_init(unsigned long *map,
864 struct dlm_node_iter *iter)
865{
866 memcpy(iter->node_map, map, sizeof(iter->node_map));
867 iter->curnode = -1;
868}
869
870static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
871{
872 int bit;
873 bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1);
874 if (bit >= O2NM_MAX_NODES) {
875 iter->curnode = O2NM_MAX_NODES;
876 return -ENOENT;
877 }
878 iter->curnode = bit;
879 return bit;
880}
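/* Illustrative only: a typical caller snapshots a node bitmap under
 * dlm->spinlock and then iterates locklessly, which is safe because
 * dlm_node_iter_init() copies the map.  The helper name is
 * hypothetical. */
static inline void dlm_node_iter_sketch(struct dlm_ctxt *dlm)
{
	struct dlm_node_iter iter;
	int node;

	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	spin_unlock(&dlm->spinlock);

	while ((node = dlm_node_iter_next(&iter)) >= 0) {
		/* ... contact 'node' here ... */
	}
}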
881
882
883
884#endif /* DLMCOMMON_H */
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
new file mode 100644
index 000000000000..6001b22a997d
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -0,0 +1,530 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmconvert.c
5 *
6 * underlying calls for lock conversion
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46
47#include "dlmapi.h"
48#include "dlmcommon.h"
49
50#include "dlmconvert.h"
51
52#define MLOG_MASK_PREFIX ML_DLM
53#include "cluster/masklog.h"
54
55/* NOTE: __dlmconvert_master is the only function in here that
56 * needs a spinlock held on entry (res->spinlock) and it is the
57 * only one that holds a lock on exit (res->spinlock).
58 * All other functions in here need no locks and drop all of
59 * the locks that they acquire. */
60static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
61 struct dlm_lock_resource *res,
62 struct dlm_lock *lock, int flags,
63 int type, int *call_ast,
64 int *kick_thread);
65static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
66 struct dlm_lock_resource *res,
67 struct dlm_lock *lock, int flags, int type);
68
69/*
70 * this is only called directly by dlmlock(), and only when the
71 * local node is the owner of the lockres
72 * locking:
73 * caller needs: none
74 * taken: takes and drops res->spinlock
75 * held on exit: none
76 * returns: see __dlmconvert_master
77 */
78enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
79 struct dlm_lock_resource *res,
80 struct dlm_lock *lock, int flags, int type)
81{
82 int call_ast = 0, kick_thread = 0;
83 enum dlm_status status;
84
85 spin_lock(&res->spinlock);
86 /* we are not in a network handler, this is fine */
87 __dlm_wait_on_lockres(res);
88 __dlm_lockres_reserve_ast(res);
89 res->state |= DLM_LOCK_RES_IN_PROGRESS;
90
91 status = __dlmconvert_master(dlm, res, lock, flags, type,
92 &call_ast, &kick_thread);
93
94 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
95 spin_unlock(&res->spinlock);
96 wake_up(&res->wq);
97 if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
98 dlm_error(status);
99
100 /* either queue the ast or release it */
101 if (call_ast)
102 dlm_queue_ast(dlm, lock);
103 else
104 dlm_lockres_release_ast(dlm, res);
105
106 if (kick_thread)
107 dlm_kick_thread(dlm, res);
108
109 return status;
110}
111
112/* performs lock conversion at the lockres master site
113 * locking:
114 * caller needs: res->spinlock
115 * taken: takes and drops lock->spinlock
116 * held on exit: res->spinlock
117 * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
118 * call_ast: whether ast should be called for this lock
119 * kick_thread: whether dlm_kick_thread should be called
120 */
121static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
122 struct dlm_lock_resource *res,
123 struct dlm_lock *lock, int flags,
124 int type, int *call_ast,
125 int *kick_thread)
126{
127 enum dlm_status status = DLM_NORMAL;
128 struct list_head *iter;
129 struct dlm_lock *tmplock=NULL;
130
131 assert_spin_locked(&res->spinlock);
132
133 mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
134 lock->ml.type, lock->ml.convert_type, type);
135
136 spin_lock(&lock->spinlock);
137
138 /* already converting? */
139 if (lock->ml.convert_type != LKM_IVMODE) {
140 mlog(ML_ERROR, "attempted to convert a lock with a lock "
141 "conversion pending\n");
142 status = DLM_DENIED;
143 goto unlock_exit;
144 }
145
146 /* must be on grant queue to convert */
147 if (!dlm_lock_on_list(&res->granted, lock)) {
148 mlog(ML_ERROR, "attempted to convert a lock not on grant "
149 "queue\n");
150 status = DLM_DENIED;
151 goto unlock_exit;
152 }
153
154 if (flags & LKM_VALBLK) {
155 switch (lock->ml.type) {
156 case LKM_EXMODE:
157 /* EX + LKM_VALBLK + convert == set lvb */
158 mlog(0, "will set lvb: converting %s->%s\n",
159 dlm_lock_mode_name(lock->ml.type),
160 dlm_lock_mode_name(type));
161 lock->lksb->flags |= DLM_LKSB_PUT_LVB;
162 break;
163 case LKM_PRMODE:
164 case LKM_NLMODE:
165 /* refetch if new level is not NL */
166 if (type > LKM_NLMODE) {
167 mlog(0, "will fetch new value into "
168 "lvb: converting %s->%s\n",
169 dlm_lock_mode_name(lock->ml.type),
170 dlm_lock_mode_name(type));
171 lock->lksb->flags |= DLM_LKSB_GET_LVB;
172 } else {
173 mlog(0, "will NOT fetch new value "
174 "into lvb: converting %s->%s\n",
175 dlm_lock_mode_name(lock->ml.type),
176 dlm_lock_mode_name(type));
177 flags &= ~(LKM_VALBLK);
178 }
179 break;
180 }
181 }
182
183
184 /* in-place downconvert? */
185 if (type <= lock->ml.type)
186 goto grant;
187
188 /* upconvert from here on */
189 status = DLM_NORMAL;
190 list_for_each(iter, &res->granted) {
191 tmplock = list_entry(iter, struct dlm_lock, list);
192 if (tmplock == lock)
193 continue;
194 if (!dlm_lock_compatible(tmplock->ml.type, type))
195 goto switch_queues;
196 }
197
198 list_for_each(iter, &res->converting) {
199 tmplock = list_entry(iter, struct dlm_lock, list);
200 if (!dlm_lock_compatible(tmplock->ml.type, type))
201 goto switch_queues;
202 /* existing conversion requests take precedence */
203 if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
204 goto switch_queues;
205 }
206
207 /* fall thru to grant */
208
209grant:
210 mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
211 res->lockname.name, dlm_lock_mode_name(type));
212 /* immediately grant the new lock type */
213 lock->lksb->status = DLM_NORMAL;
214 if (lock->ml.node == dlm->node_num)
215 mlog(0, "doing in-place convert for nonlocal lock\n");
216 lock->ml.type = type;
217 status = DLM_NORMAL;
218 *call_ast = 1;
219 goto unlock_exit;
220
221switch_queues:
222 if (flags & LKM_NOQUEUE) {
223 mlog(0, "failed to convert NOQUEUE lock %.*s from "
224 "%d to %d...\n", res->lockname.len, res->lockname.name,
225 lock->ml.type, type);
226 status = DLM_NOTQUEUED;
227 goto unlock_exit;
228 }
229 mlog(0, "res %.*s, queueing...\n", res->lockname.len,
230 res->lockname.name);
231
232 lock->ml.convert_type = type;
233 /* do not alter lock refcount. switching lists. */
234 list_del_init(&lock->list);
235 list_add_tail(&lock->list, &res->converting);
236
237unlock_exit:
238 spin_unlock(&lock->spinlock);
239 if (status == DLM_DENIED) {
240 __dlm_print_one_lock_resource(res);
241 }
242 if (status == DLM_NORMAL)
243 *kick_thread = 1;
244 return status;
245}
246
247void dlm_revert_pending_convert(struct dlm_lock_resource *res,
248 struct dlm_lock *lock)
249{
250 /* do not alter lock refcount. switching lists. */
251 list_del_init(&lock->list);
252 list_add_tail(&lock->list, &res->granted);
253 lock->ml.convert_type = LKM_IVMODE;
254 lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
255}
256
257/* messages the master site to do lock conversion
258 * locking:
259 * caller needs: none
260 * taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
261 * held on exit: none
262 * returns: DLM_NORMAL, DLM_RECOVERING, DLM_DENIED, status from remote node
263 */
264enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
265 struct dlm_lock_resource *res,
266 struct dlm_lock *lock, int flags, int type)
267{
268 enum dlm_status status;
269
270 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
271 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
272
273 spin_lock(&res->spinlock);
274 if (res->state & DLM_LOCK_RES_RECOVERING) {
275 mlog(0, "bailing out early since res is RECOVERING "
276 "on secondary queue\n");
277 /* __dlm_print_one_lock_resource(res); */
278 status = DLM_RECOVERING;
279 goto bail;
280 }
281 /* will exit this call with spinlock held */
282 __dlm_wait_on_lockres(res);
283
284 if (lock->ml.convert_type != LKM_IVMODE) {
285 __dlm_print_one_lock_resource(res);
286 mlog(ML_ERROR, "converting a remote lock that is already "
287 "converting! (cookie=%"MLFu64", conv=%d)\n",
288 lock->ml.cookie, lock->ml.convert_type);
289 status = DLM_DENIED;
290 goto bail;
291 }
292 res->state |= DLM_LOCK_RES_IN_PROGRESS;
293 /* move lock to local convert queue */
294 /* do not alter lock refcount. switching lists. */
295 list_del_init(&lock->list);
296 list_add_tail(&lock->list, &res->converting);
297 lock->convert_pending = 1;
298 lock->ml.convert_type = type;
299
300 if (flags & LKM_VALBLK) {
301 if (lock->ml.type == LKM_EXMODE) {
302 flags |= LKM_PUT_LVB;
303 lock->lksb->flags |= DLM_LKSB_PUT_LVB;
304 } else {
305 if (lock->ml.convert_type == LKM_NLMODE)
306 flags &= ~LKM_VALBLK;
307 else {
308 flags |= LKM_GET_LVB;
309 lock->lksb->flags |= DLM_LKSB_GET_LVB;
310 }
311 }
312 }
313 spin_unlock(&res->spinlock);
314
315 /* no locks held here.
316 * need to wait for a reply as to whether it got queued or not. */
317 status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
318
319 spin_lock(&res->spinlock);
320 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
321 lock->convert_pending = 0;
322 /* if it failed, move it back to granted queue */
323 if (status != DLM_NORMAL) {
324 if (status != DLM_NOTQUEUED)
325 dlm_error(status);
326 dlm_revert_pending_convert(res, lock);
327 }
328bail:
329 spin_unlock(&res->spinlock);
330
331 /* TODO: should this be a wake_one? */
332 /* wake up any IN_PROGRESS waiters */
333 wake_up(&res->wq);
334
335 return status;
336}
337
338/* sends DLM_CONVERT_LOCK_MSG to master site
339 * locking:
340 * caller needs: none
341 * taken: none
342 * held on exit: none
343 * returns: DLM_RECOVERING, status mapped by dlm_err_to_dlm_status, or
344 *          status from remote node
344 */
345static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
346 struct dlm_lock_resource *res,
347 struct dlm_lock *lock, int flags, int type)
348{
349 struct dlm_convert_lock convert;
350 int tmpret;
351 enum dlm_status ret;
352 int status = 0;
353 struct kvec vec[2];
354 size_t veclen = 1;
355
356 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
357
358 memset(&convert, 0, sizeof(struct dlm_convert_lock));
359 convert.node_idx = dlm->node_num;
360 convert.requested_type = type;
361 convert.cookie = lock->ml.cookie;
362 convert.namelen = res->lockname.len;
363 convert.flags = cpu_to_be32(flags);
364 memcpy(convert.name, res->lockname.name, convert.namelen);
365
366 vec[0].iov_len = sizeof(struct dlm_convert_lock);
367 vec[0].iov_base = &convert;
368
369 if (flags & LKM_PUT_LVB) {
370 /* extra data to send if we are updating lvb */
371 vec[1].iov_len = DLM_LVB_LEN;
372 vec[1].iov_base = lock->lksb->lvb;
373 veclen++;
374 }
375
376 tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
377 vec, veclen, res->owner, &status);
378 if (tmpret >= 0) {
379 // successfully sent and received
380 ret = status; // this is already a dlm_status
381 if (ret == DLM_RECOVERING) {
382 mlog(0, "node %u returned DLM_RECOVERING from convert "
383 "message!\n", res->owner);
384 } else if (ret == DLM_MIGRATING) {
385 mlog(0, "node %u returned DLM_MIGRATING from convert "
386 "message!\n", res->owner);
387 } else if (ret == DLM_FORWARD) {
388 mlog(0, "node %u returned DLM_FORWARD from convert "
389 "message!\n", res->owner);
390 } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
391 dlm_error(ret);
392 } else {
393 mlog_errno(tmpret);
394 if (dlm_is_host_down(tmpret)) {
395 ret = DLM_RECOVERING;
396 mlog(0, "node %u died so returning DLM_RECOVERING "
397 "from convert message!\n", res->owner);
398 } else {
399 ret = dlm_err_to_dlm_status(tmpret);
400 }
401 }
402
403 return ret;
404}
405
406/* handler for DLM_CONVERT_LOCK_MSG on master site
407 * locking:
408 * caller needs: none
409 * taken: takes and drops res->spinlock
410 * held on exit: none
411 * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
412 * status from __dlmconvert_master
413 */
414int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
415{
416 struct dlm_ctxt *dlm = data;
417 struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
418 struct dlm_lock_resource *res = NULL;
419 struct list_head *iter;
420 struct dlm_lock *lock = NULL;
421 struct dlm_lockstatus *lksb;
422 enum dlm_status status = DLM_NORMAL;
423 u32 flags;
424 int call_ast = 0, kick_thread = 0;
425
426 if (!dlm_grab(dlm)) {
427 dlm_error(DLM_REJECTED);
428 return DLM_REJECTED;
429 }
430
431 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
432 "Domain %s not fully joined!\n", dlm->name);
433
434 if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
435 status = DLM_IVBUFLEN;
436 dlm_error(status);
437 goto leave;
438 }
439
440 flags = be32_to_cpu(cnv->flags);
441
442 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
443 (LKM_PUT_LVB|LKM_GET_LVB)) {
444 mlog(ML_ERROR, "both PUT and GET lvb specified\n");
445 status = DLM_BADARGS;
446 goto leave;
447 }
448
449 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
450 (flags & LKM_GET_LVB ? "get lvb" : "none"));
451
452 status = DLM_IVLOCKID;
453 res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
454 if (!res) {
455 dlm_error(status);
456 goto leave;
457 }
458
459 spin_lock(&res->spinlock);
460 list_for_each(iter, &res->granted) {
461 lock = list_entry(iter, struct dlm_lock, list);
462 if (lock->ml.cookie == cnv->cookie &&
463 lock->ml.node == cnv->node_idx) {
464 dlm_lock_get(lock);
465 break;
466 }
467 lock = NULL;
468 }
469 spin_unlock(&res->spinlock);
470 if (!lock) {
471 status = DLM_IVLOCKID;
472 dlm_error(status);
473 goto leave;
474 }
475
476 /* found the lock */
477 lksb = lock->lksb;
478
479 /* see if caller needed to get/put lvb */
480 if (flags & LKM_PUT_LVB) {
481 BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
482 lksb->flags |= DLM_LKSB_PUT_LVB;
483 memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
484 } else if (flags & LKM_GET_LVB) {
485 BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
486 lksb->flags |= DLM_LKSB_GET_LVB;
487 }
488
489 spin_lock(&res->spinlock);
490 status = __dlm_lockres_state_to_status(res);
491 if (status == DLM_NORMAL) {
492 __dlm_lockres_reserve_ast(res);
493 res->state |= DLM_LOCK_RES_IN_PROGRESS;
494 status = __dlmconvert_master(dlm, res, lock, flags,
495 cnv->requested_type,
496 &call_ast, &kick_thread);
497 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
498 }
499 spin_unlock(&res->spinlock);
500
501 if (status != DLM_NORMAL) {
502 if (status != DLM_NOTQUEUED)
503 dlm_error(status);
504 lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
505 }
506
507leave:
508 if (!lock)
509 mlog(ML_ERROR, "did not find lock to convert on grant queue! "
510 "cookie=%"MLFu64"\n",
511 cnv->cookie);
512 else
513 dlm_lock_put(lock);
514
515 /* either queue the ast or release it */
516 if (call_ast)
517 dlm_queue_ast(dlm, lock);
518 else
519 dlm_lockres_release_ast(dlm, res);
520
521 if (kick_thread)
522 dlm_kick_thread(dlm, res);
523
524 if (res)
525 dlm_lockres_put(res);
526
527 dlm_put(dlm);
528
529 return status;
530}
diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h
new file mode 100644
index 000000000000..b2e3677df878
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmconvert.h
@@ -0,0 +1,35 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmconvert.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 021110-1307, USA.
22 *
23 */
24
25#ifndef DLMCONVERT_H
26#define DLMCONVERT_H
27
28enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
29 struct dlm_lock_resource *res,
30 struct dlm_lock *lock, int flags, int type);
31enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
32 struct dlm_lock_resource *res,
33 struct dlm_lock *lock, int flags, int type);
34
35#endif
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
new file mode 100644
index 000000000000..f339fe27975a
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -0,0 +1,246 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.c
5 *
6 * debug functionality for the dlm
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/utsname.h>
31#include <linux/sysctl.h>
32#include <linux/spinlock.h>
33
34#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h"
36#include "cluster/tcp.h"
37
38#include "dlmapi.h"
39#include "dlmcommon.h"
40#include "dlmdebug.h"
41
42#include "dlmdomain.h"
43#include "dlmdebug.h"
44
45#define MLOG_MASK_PREFIX ML_DLM
46#include "cluster/masklog.h"
47
48void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
49{
50 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
51 res->lockname.len, res->lockname.name,
52 res->owner, res->state);
53 spin_lock(&res->spinlock);
54 __dlm_print_one_lock_resource(res);
55 spin_unlock(&res->spinlock);
56}
57
58void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
59{
60 struct list_head *iter2;
61 struct dlm_lock *lock;
62
63 assert_spin_locked(&res->spinlock);
64
65 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
66 res->lockname.len, res->lockname.name,
67 res->owner, res->state);
68 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n",
69 res->last_used, list_empty(&res->purge) ? "no" : "yes");
70	mlog(ML_NOTICE, "  granted queue:\n");
71 list_for_each(iter2, &res->granted) {
72 lock = list_entry(iter2, struct dlm_lock, list);
73 spin_lock(&lock->spinlock);
74 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
75 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
76 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
77 list_empty(&lock->ast_list) ? 'y' : 'n',
78 lock->ast_pending ? 'y' : 'n',
79 list_empty(&lock->bast_list) ? 'y' : 'n',
80 lock->bast_pending ? 'y' : 'n');
81 spin_unlock(&lock->spinlock);
82 }
83	mlog(ML_NOTICE, "  converting queue:\n");
84 list_for_each(iter2, &res->converting) {
85 lock = list_entry(iter2, struct dlm_lock, list);
86 spin_lock(&lock->spinlock);
87 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
88 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
89 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
90 list_empty(&lock->ast_list) ? 'y' : 'n',
91 lock->ast_pending ? 'y' : 'n',
92 list_empty(&lock->bast_list) ? 'y' : 'n',
93 lock->bast_pending ? 'y' : 'n');
94 spin_unlock(&lock->spinlock);
95 }
96	mlog(ML_NOTICE, "  blocked queue:\n");
97 list_for_each(iter2, &res->blocked) {
98 lock = list_entry(iter2, struct dlm_lock, list);
99 spin_lock(&lock->spinlock);
100 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
101 "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
102 lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
103 list_empty(&lock->ast_list) ? 'y' : 'n',
104 lock->ast_pending ? 'y' : 'n',
105 list_empty(&lock->bast_list) ? 'y' : 'n',
106 lock->bast_pending ? 'y' : 'n');
107 spin_unlock(&lock->spinlock);
108 }
109}
110
111void dlm_print_one_lock(struct dlm_lock *lockid)
112{
113 dlm_print_one_lock_resource(lockid->lockres);
114}
115EXPORT_SYMBOL_GPL(dlm_print_one_lock);
116
117void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
118{
119 struct dlm_lock_resource *res;
120 struct list_head *iter;
121 struct list_head *bucket;
122 int i;
123
124 mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
125 dlm->name, dlm->node_num, dlm->key);
126 if (!dlm || !dlm->name) {
127 mlog(ML_ERROR, "dlm=%p\n", dlm);
128 return;
129 }
130
131 spin_lock(&dlm->spinlock);
132 for (i=0; i<DLM_HASH_SIZE; i++) {
133 bucket = &(dlm->resources[i]);
134 list_for_each(iter, bucket) {
135 res = list_entry(iter, struct dlm_lock_resource, list);
136 dlm_print_one_lock_resource(res);
137 }
138 }
139 spin_unlock(&dlm->spinlock);
140}
141
142static const char *dlm_errnames[] = {
143 [DLM_NORMAL] = "DLM_NORMAL",
144 [DLM_GRANTED] = "DLM_GRANTED",
145 [DLM_DENIED] = "DLM_DENIED",
146 [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS",
147 [DLM_WORKING] = "DLM_WORKING",
148 [DLM_BLOCKED] = "DLM_BLOCKED",
149 [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN",
150 [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD",
151 [DLM_SYSERR] = "DLM_SYSERR",
152 [DLM_NOSUPPORT] = "DLM_NOSUPPORT",
153 [DLM_CANCELGRANT] = "DLM_CANCELGRANT",
154 [DLM_IVLOCKID] = "DLM_IVLOCKID",
155 [DLM_SYNC] = "DLM_SYNC",
156 [DLM_BADTYPE] = "DLM_BADTYPE",
157 [DLM_BADRESOURCE] = "DLM_BADRESOURCE",
158 [DLM_MAXHANDLES] = "DLM_MAXHANDLES",
159 [DLM_NOCLINFO] = "DLM_NOCLINFO",
160 [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR",
161 [DLM_NOPURGED] = "DLM_NOPURGED",
162 [DLM_BADARGS] = "DLM_BADARGS",
163 [DLM_VOID] = "DLM_VOID",
164 [DLM_NOTQUEUED] = "DLM_NOTQUEUED",
165 [DLM_IVBUFLEN] = "DLM_IVBUFLEN",
166 [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT",
167 [DLM_BADPARAM] = "DLM_BADPARAM",
168 [DLM_VALNOTVALID] = "DLM_VALNOTVALID",
169 [DLM_REJECTED] = "DLM_REJECTED",
170 [DLM_ABORT] = "DLM_ABORT",
171 [DLM_CANCEL] = "DLM_CANCEL",
172 [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE",
173 [DLM_DEADLOCK] = "DLM_DEADLOCK",
174 [DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS",
175 [DLM_FORWARD] = "DLM_FORWARD",
176 [DLM_TIMEOUT] = "DLM_TIMEOUT",
177 [DLM_IVGROUPID] = "DLM_IVGROUPID",
178 [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT",
179 [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH",
180 [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION",
181	[DLM_NO_CONTROL_DEVICE] =	"DLM_NO_CONTROL_DEVICE",
182 [DLM_RECOVERING] = "DLM_RECOVERING",
183 [DLM_MIGRATING] = "DLM_MIGRATING",
184 [DLM_MAXSTATS] = "DLM_MAXSTATS",
185};
186
187static const char *dlm_errmsgs[] = {
188 [DLM_NORMAL] = "request in progress",
189 [DLM_GRANTED] = "request granted",
190 [DLM_DENIED] = "request denied",
191 [DLM_DENIED_NOLOCKS] = "request denied, out of system resources",
192 [DLM_WORKING] = "async request in progress",
193 [DLM_BLOCKED] = "lock request blocked",
194	[DLM_BLOCKED_ORPHAN] =		"lock request blocked by an orphan lock",
195 [DLM_DENIED_GRACE_PERIOD] = "topological change in progress",
196 [DLM_SYSERR] = "system error",
197 [DLM_NOSUPPORT] = "unsupported",
198 [DLM_CANCELGRANT] = "can't cancel convert: already granted",
199 [DLM_IVLOCKID] = "bad lockid",
200 [DLM_SYNC] = "synchronous request granted",
201 [DLM_BADTYPE] = "bad resource type",
202 [DLM_BADRESOURCE] = "bad resource handle",
203 [DLM_MAXHANDLES] = "no more resource handles",
204 [DLM_NOCLINFO] = "can't contact cluster manager",
205 [DLM_NOLOCKMGR] = "can't contact lock manager",
206 [DLM_NOPURGED] = "can't contact purge daemon",
207 [DLM_BADARGS] = "bad api args",
208 [DLM_VOID] = "no status",
209 [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed",
210 [DLM_IVBUFLEN] = "invalid resource name length",
211 [DLM_CVTUNGRANT] = "attempted to convert ungranted lock",
212 [DLM_BADPARAM] = "invalid lock mode specified",
213 [DLM_VALNOTVALID] = "value block has been invalidated",
214 [DLM_REJECTED] = "request rejected, unrecognized client",
215 [DLM_ABORT] = "blocked lock request cancelled",
216 [DLM_CANCEL] = "conversion request cancelled",
217 [DLM_IVRESHANDLE] = "invalid resource handle",
218 [DLM_DEADLOCK] = "deadlock recovery refused this request",
219 [DLM_DENIED_NOASTS] = "failed to allocate AST",
220 [DLM_FORWARD] = "request must wait for primary's response",
221 [DLM_TIMEOUT] = "timeout value for lock has expired",
222 [DLM_IVGROUPID] = "invalid group specification",
223 [DLM_VERS_CONFLICT] = "version conflicts prevent request handling",
224 [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong",
225 [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device",
226	[DLM_NO_CONTROL_DEVICE] =	"Cannot set options on opened device",
227 [DLM_RECOVERING] = "lock resource being recovered",
228 [DLM_MIGRATING] = "lock resource being migrated",
229 [DLM_MAXSTATS] = "invalid error number",
230};
231
232const char *dlm_errmsg(enum dlm_status err)
233{
234 if (err >= DLM_MAXSTATS || err < 0)
235 return dlm_errmsgs[DLM_MAXSTATS];
236 return dlm_errmsgs[err];
237}
238EXPORT_SYMBOL_GPL(dlm_errmsg);
239
240const char *dlm_errname(enum dlm_status err)
241{
242 if (err >= DLM_MAXSTATS || err < 0)
243 return dlm_errnames[DLM_MAXSTATS];
244 return dlm_errnames[err];
245}
246EXPORT_SYMBOL_GPL(dlm_errname);
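/* Illustrative only: callers typically combine the two lookup tables
 * when reporting a failure, roughly:
 *
 *	mlog(ML_ERROR, "convert failed: %s (%s)\n",
 *	     dlm_errname(status), dlm_errmsg(status));
 */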
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 000000000000..6858510c3ccd
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,30 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 021110-1307, USA.
22 *
23 */
24
25#ifndef DLMDEBUG_H
26#define DLMDEBUG_H
27
28void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
29
30#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
new file mode 100644
index 000000000000..da3c22045f89
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -0,0 +1,1469 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdomain.c
5 *
6 * defines domain join / leave apis
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/delay.h>
35#include <linux/err.h>
36
37#include "cluster/heartbeat.h"
38#include "cluster/nodemanager.h"
39#include "cluster/tcp.h"
40
41#include "dlmapi.h"
42#include "dlmcommon.h"
43
44#include "dlmdebug.h"
45#include "dlmdomain.h"
46
47#include "dlmver.h"
48
49#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
50#include "cluster/masklog.h"
51
52/*
53 *
54 * spinlock lock ordering: if multiple locks are needed, obey this ordering:
55 * dlm_domain_lock
56 * struct dlm_ctxt->spinlock
57 * struct dlm_lock_resource->spinlock
58 * struct dlm_ctxt->master_lock
59 * struct dlm_ctxt->ast_lock
60 * dlm_master_list_entry->spinlock
61 * dlm_lock->spinlock
62 *
63 */
64
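/* Illustrative only: per the ordering above, code needing both the
 * domain lock and a per-ctxt lock must take the outermost lock first,
 * e.g.:
 *
 *	spin_lock(&dlm_domain_lock);
 *	spin_lock(&dlm->spinlock);
 *	...
 *	spin_unlock(&dlm->spinlock);
 *	spin_unlock(&dlm_domain_lock);
 *
 * dlm_mark_domain_leaving() below follows exactly this pattern.
 */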
65spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
66LIST_HEAD(dlm_domains);
67static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
68
69#define DLM_DOMAIN_BACKOFF_MS 200
70
71static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data);
72static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data);
73static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data);
74static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data);
75
76static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
77
78void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
79{
80 list_del_init(&lockres->list);
81 dlm_lockres_put(lockres);
82}
83
84void __dlm_insert_lockres(struct dlm_ctxt *dlm,
85 struct dlm_lock_resource *res)
86{
87 struct list_head *bucket;
88 struct qstr *q;
89
90 assert_spin_locked(&dlm->spinlock);
91
92 q = &res->lockname;
93 q->hash = full_name_hash(q->name, q->len);
94 bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
95
96 /* get a reference for our hashtable */
97 dlm_lockres_get(res);
98
99 list_add_tail(&res->list, bucket);
100}
101
102struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
103 const char *name,
104 unsigned int len)
105{
106 unsigned int hash;
107 struct list_head *iter;
108 struct dlm_lock_resource *tmpres=NULL;
109 struct list_head *bucket;
110
111 mlog_entry("%.*s\n", len, name);
112
113 assert_spin_locked(&dlm->spinlock);
114
115 hash = full_name_hash(name, len);
116
117 bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
118
119 /* check for pre-existing lock */
120 list_for_each(iter, bucket) {
121 tmpres = list_entry(iter, struct dlm_lock_resource, list);
122 if (tmpres->lockname.len == len &&
123 memcmp(tmpres->lockname.name, name, len) == 0) {
124 dlm_lockres_get(tmpres);
125 break;
126 }
127
128 tmpres = NULL;
129 }
130 return tmpres;
131}
132
133struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
134 const char *name,
135 unsigned int len)
136{
137 struct dlm_lock_resource *res;
138
139 spin_lock(&dlm->spinlock);
140 res = __dlm_lookup_lockres(dlm, name, len);
141 spin_unlock(&dlm->spinlock);
142 return res;
143}
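/* Illustrative only: a successful lookup returns with an extra
 * reference (taken in __dlm_lookup_lockres above), so the usual
 * calling pattern is:
 *
 *	res = dlm_lookup_lockres(dlm, name, len);
 *	if (res) {
 *		...
 *		dlm_lockres_put(res);
 *	}
 */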
144
145static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len)
146{
147 struct dlm_ctxt *tmp = NULL;
148 struct list_head *iter;
149
150 assert_spin_locked(&dlm_domain_lock);
151
152 /* tmp->name here is always NULL terminated,
153 * but domain may not be! */
154 list_for_each(iter, &dlm_domains) {
155 tmp = list_entry (iter, struct dlm_ctxt, list);
156 if (strlen(tmp->name) == len &&
157 memcmp(tmp->name, domain, len)==0)
158 break;
159 tmp = NULL;
160 }
161
162 return tmp;
163}
164
165/* For null terminated domain strings ONLY */
166static struct dlm_ctxt * __dlm_lookup_domain(const char *domain)
167{
168 assert_spin_locked(&dlm_domain_lock);
169
170 return __dlm_lookup_domain_full(domain, strlen(domain));
171}
172
173
174/* returns true on one of two conditions:
175 * 1) the domain does not exist
176 * 2) the domain exists and its state is "joined" */
177static int dlm_wait_on_domain_helper(const char *domain)
178{
179 int ret = 0;
180 struct dlm_ctxt *tmp = NULL;
181
182 spin_lock(&dlm_domain_lock);
183
184 tmp = __dlm_lookup_domain(domain);
185 if (!tmp)
186 ret = 1;
187 else if (tmp->dlm_state == DLM_CTXT_JOINED)
188 ret = 1;
189
190 spin_unlock(&dlm_domain_lock);
191 return ret;
192}
193
194static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
195{
196 if (dlm->resources)
197 free_page((unsigned long) dlm->resources);
198
199 if (dlm->name)
200 kfree(dlm->name);
201
202 kfree(dlm);
203}
204
205/* A little strange - this function will be called while holding
206 * dlm_domain_lock and is expected to be holding it on the way out. We
207 * will however drop and reacquire it multiple times */
208static void dlm_ctxt_release(struct kref *kref)
209{
210 struct dlm_ctxt *dlm;
211
212 dlm = container_of(kref, struct dlm_ctxt, dlm_refs);
213
214 BUG_ON(dlm->num_joins);
215 BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED);
216
217 /* we may still be in the list if we hit an error during join. */
218 list_del_init(&dlm->list);
219
220 spin_unlock(&dlm_domain_lock);
221
222 mlog(0, "freeing memory from domain %s\n", dlm->name);
223
224 wake_up(&dlm_domain_events);
225
226 dlm_free_ctxt_mem(dlm);
227
228 spin_lock(&dlm_domain_lock);
229}
230
231void dlm_put(struct dlm_ctxt *dlm)
232{
233 spin_lock(&dlm_domain_lock);
234 kref_put(&dlm->dlm_refs, dlm_ctxt_release);
235 spin_unlock(&dlm_domain_lock);
236}
237
238static void __dlm_get(struct dlm_ctxt *dlm)
239{
240 kref_get(&dlm->dlm_refs);
241}
242
243/* given a questionable reference to a dlm object, gets a reference if
244 * it can find it in the list, otherwise returns NULL in which case
245 * you shouldn't trust your pointer. */
246struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm)
247{
248 struct list_head *iter;
249 struct dlm_ctxt *target = NULL;
250
251 spin_lock(&dlm_domain_lock);
252
253 list_for_each(iter, &dlm_domains) {
254 target = list_entry (iter, struct dlm_ctxt, list);
255
256 if (target == dlm) {
257 __dlm_get(target);
258 break;
259 }
260
261 target = NULL;
262 }
263
264 spin_unlock(&dlm_domain_lock);
265
266 return target;
267}
268
269int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
270{
271 int ret;
272
273 spin_lock(&dlm_domain_lock);
274 ret = (dlm->dlm_state == DLM_CTXT_JOINED) ||
275 (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
276 spin_unlock(&dlm_domain_lock);
277
278 return ret;
279}
280
281static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
282{
283 dlm_unregister_domain_handlers(dlm);
284 dlm_complete_thread(dlm);
285 dlm_complete_recovery_thread(dlm);
286
287 /* We've left the domain. Now we can take ourselves out of the
288 * list and allow the kref stuff to help us free the
289 * memory. */
290 spin_lock(&dlm_domain_lock);
291 list_del_init(&dlm->list);
292 spin_unlock(&dlm_domain_lock);
293
294 /* Wake up anyone waiting for us to remove this domain */
295 wake_up(&dlm_domain_events);
296}
297
298static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
299{
300 int i;
301 struct dlm_lock_resource *res;
302
303 mlog(0, "Migrating locks from domain %s\n", dlm->name);
304restart:
305 spin_lock(&dlm->spinlock);
306 for (i=0; i<DLM_HASH_SIZE; i++) {
307 while (!list_empty(&dlm->resources[i])) {
308 res = list_entry(dlm->resources[i].next,
309 struct dlm_lock_resource, list);
310 /* need reference when manually grabbing lockres */
311 dlm_lockres_get(res);
312 /* this should unhash the lockres
313 * and exit with dlm->spinlock */
314 mlog(0, "purging res=%p\n", res);
315 if (dlm_lockres_is_dirty(dlm, res)) {
316 /* HACK! this should absolutely go.
317 * need to figure out why some empty
318 * lockreses are still marked dirty */
319 mlog(ML_ERROR, "lockres %.*s dirty!\n",
320 res->lockname.len, res->lockname.name);
321
322 spin_unlock(&dlm->spinlock);
323 dlm_kick_thread(dlm, res);
324 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
325 dlm_lockres_put(res);
326 goto restart;
327 }
328 dlm_purge_lockres(dlm, res);
329 dlm_lockres_put(res);
330 }
331 }
332 spin_unlock(&dlm->spinlock);
333
334 mlog(0, "DONE Migrating locks from domain %s\n", dlm->name);
335}
336
337static int dlm_no_joining_node(struct dlm_ctxt *dlm)
338{
339 int ret;
340
341 spin_lock(&dlm->spinlock);
342 ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN;
343 spin_unlock(&dlm->spinlock);
344
345 return ret;
346}
347
348static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm)
349{
350 /* Yikes, a double spinlock! I need domain_lock for the dlm
351 * state and the dlm spinlock for join state... Sorry! */
352again:
353 spin_lock(&dlm_domain_lock);
354 spin_lock(&dlm->spinlock);
355
356 if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
357 mlog(0, "Node %d is joining, we wait on it.\n",
358 dlm->joining_node);
359 spin_unlock(&dlm->spinlock);
360 spin_unlock(&dlm_domain_lock);
361
362 wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm));
363 goto again;
364 }
365
366 dlm->dlm_state = DLM_CTXT_LEAVING;
367 spin_unlock(&dlm->spinlock);
368 spin_unlock(&dlm_domain_lock);
369}
370
371static void __dlm_print_nodes(struct dlm_ctxt *dlm)
372{
373 int node = -1;
374
375 assert_spin_locked(&dlm->spinlock);
376
377 mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
378
379 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
380 node + 1)) < O2NM_MAX_NODES) {
381 mlog(ML_NOTICE, " node %d\n", node);
382 }
383}
384
385static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
386{
387 struct dlm_ctxt *dlm = data;
388 unsigned int node;
389 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
390
391	mlog_entry("%p %u %p\n", msg, len, data);
392
393 if (!dlm_grab(dlm))
394 return 0;
395
396 node = exit_msg->node_idx;
397
398 mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
399
400 spin_lock(&dlm->spinlock);
401 clear_bit(node, dlm->domain_map);
402 __dlm_print_nodes(dlm);
403
404 /* notify anything attached to the heartbeat events */
405 dlm_hb_event_notify_attached(dlm, node, 0);
406
407 spin_unlock(&dlm->spinlock);
408
409 dlm_put(dlm);
410
411 return 0;
412}
413
414static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm,
415 unsigned int node)
416{
417 int status;
418 struct dlm_exit_domain leave_msg;
419
420 mlog(0, "Asking node %u if we can leave the domain %s me = %u\n",
421 node, dlm->name, dlm->node_num);
422
423 memset(&leave_msg, 0, sizeof(leave_msg));
424 leave_msg.node_idx = dlm->node_num;
425
426 status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key,
427 &leave_msg, sizeof(leave_msg), node,
428 NULL);
429
430 mlog(0, "status return %d from o2net_send_message\n", status);
431
432 return status;
433}
434
435
436static void dlm_leave_domain(struct dlm_ctxt *dlm)
437{
438 int node, clear_node, status;
439
440 /* At this point we've migrated away all our locks and won't
441 * accept mastership of new ones. The dlm is responsible for
442 * almost nothing now. We make sure not to confuse any joining
443 * nodes and then commence shutdown procedure. */
444
445 spin_lock(&dlm->spinlock);
446 /* Clear ourselves from the domain map */
447 clear_bit(dlm->node_num, dlm->domain_map);
448 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
449 0)) < O2NM_MAX_NODES) {
450 /* Drop the dlm spinlock. This is safe wrt the domain_map.
451 * -nodes cannot be added now as the
452	 * query_join handler knows to respond with OK_NO_MAP
453 * -we catch the right network errors if a node is
454 * removed from the map while we're sending him the
455 * exit message. */
456 spin_unlock(&dlm->spinlock);
457
458 clear_node = 1;
459
460 status = dlm_send_one_domain_exit(dlm, node);
461 if (status < 0 &&
462 status != -ENOPROTOOPT &&
463 status != -ENOTCONN) {
464 mlog(ML_NOTICE, "Error %d sending domain exit message "
465 "to node %d\n", status, node);
466
467	 /* Not sure what to do here, but let's sleep for
468 * a bit in case this was a transient
469 * error... */
470 msleep(DLM_DOMAIN_BACKOFF_MS);
471 clear_node = 0;
472 }
473
474 spin_lock(&dlm->spinlock);
475 /* If we're not clearing the node bit then we intend
476 * to loop back around to try again. */
477 if (clear_node)
478 clear_bit(node, dlm->domain_map);
479 }
480 spin_unlock(&dlm->spinlock);
481}
482
483int dlm_joined(struct dlm_ctxt *dlm)
484{
485 int ret = 0;
486
487 spin_lock(&dlm_domain_lock);
488
489 if (dlm->dlm_state == DLM_CTXT_JOINED)
490 ret = 1;
491
492 spin_unlock(&dlm_domain_lock);
493
494 return ret;
495}
496
497int dlm_shutting_down(struct dlm_ctxt *dlm)
498{
499 int ret = 0;
500
501 spin_lock(&dlm_domain_lock);
502
503 if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
504 ret = 1;
505
506 spin_unlock(&dlm_domain_lock);
507
508 return ret;
509}
510
511void dlm_unregister_domain(struct dlm_ctxt *dlm)
512{
513 int leave = 0;
514
515 spin_lock(&dlm_domain_lock);
516 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
517 BUG_ON(!dlm->num_joins);
518
519 dlm->num_joins--;
520 if (!dlm->num_joins) {
521 /* We mark it "in shutdown" now so new register
522 * requests wait until we've completely left the
523 * domain. Don't use DLM_CTXT_LEAVING yet as we still
524 * want new domain joins to communicate with us at
525 * least until we've completed migration of our
526 * resources. */
527 dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
528 leave = 1;
529 }
530 spin_unlock(&dlm_domain_lock);
531
532 if (leave) {
533 mlog(0, "shutting down domain %s\n", dlm->name);
534
535 /* We changed dlm state, notify the thread */
536 dlm_kick_thread(dlm, NULL);
537
538 dlm_migrate_all_locks(dlm);
539 dlm_mark_domain_leaving(dlm);
540 dlm_leave_domain(dlm);
541 dlm_complete_dlm_shutdown(dlm);
542 }
543 dlm_put(dlm);
544}
545EXPORT_SYMBOL_GPL(dlm_unregister_domain);
546
547static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
548{
549 struct dlm_query_join_request *query;
550 enum dlm_query_join_response response;
551 struct dlm_ctxt *dlm = NULL;
552
553 query = (struct dlm_query_join_request *) msg->buf;
554
555 mlog(0, "node %u wants to join domain %s\n", query->node_idx,
556 query->domain);
557
558 /*
559 * If heartbeat doesn't consider the node live, tell it
560 * to back off and try again. This gives heartbeat a chance
561 * to catch up.
562 */
563 if (!o2hb_check_node_heartbeating(query->node_idx)) {
564 mlog(0, "node %u is not in our live map yet\n",
565 query->node_idx);
566
567 response = JOIN_DISALLOW;
568 goto respond;
569 }
570
571 response = JOIN_OK_NO_MAP;
572
573 spin_lock(&dlm_domain_lock);
574 dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
575 /* Once the dlm ctxt is marked as leaving then we don't want
576 * to be put in someone's domain map. */
577 if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
578 spin_lock(&dlm->spinlock);
579
580 if (dlm->dlm_state == DLM_CTXT_NEW &&
581 dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) {
582	 /* If this is a brand new context and we
583 * haven't started our join process yet, then
584 * the other node won the race. */
585 response = JOIN_OK_NO_MAP;
586 } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
587 /* Disallow parallel joins. */
588 response = JOIN_DISALLOW;
589 } else {
590	 /* Alright, we're fully a part of this domain,
591	 * so we keep some state as to who's joining
592	 * and indicate to him that his state needs
593	 * to be fixed up. */
594 response = JOIN_OK;
595 __dlm_set_joining_node(dlm, query->node_idx);
596 }
597
598 spin_unlock(&dlm->spinlock);
599 }
600 spin_unlock(&dlm_domain_lock);
601
602respond:
603 mlog(0, "We respond with %u\n", response);
604
605 return response;
606}
607
608static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
609{
610 struct dlm_assert_joined *assert;
611 struct dlm_ctxt *dlm = NULL;
612
613 assert = (struct dlm_assert_joined *) msg->buf;
614
615 mlog(0, "node %u asserts join on domain %s\n", assert->node_idx,
616 assert->domain);
617
618 spin_lock(&dlm_domain_lock);
619 dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len);
620 /* XXX should we consider no dlm ctxt an error? */
621 if (dlm) {
622 spin_lock(&dlm->spinlock);
623
624 /* Alright, this node has officially joined our
625 * domain. Set him in the map and clean up our
626 * leftover join state. */
627 BUG_ON(dlm->joining_node != assert->node_idx);
628 set_bit(assert->node_idx, dlm->domain_map);
629 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
630
631 __dlm_print_nodes(dlm);
632
633 /* notify anything attached to the heartbeat events */
634 dlm_hb_event_notify_attached(dlm, assert->node_idx, 1);
635
636 spin_unlock(&dlm->spinlock);
637 }
638 spin_unlock(&dlm_domain_lock);
639
640 return 0;
641}
642
643static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data)
644{
645 struct dlm_cancel_join *cancel;
646 struct dlm_ctxt *dlm = NULL;
647
648 cancel = (struct dlm_cancel_join *) msg->buf;
649
650 mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx,
651 cancel->domain);
652
653 spin_lock(&dlm_domain_lock);
654 dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len);
655
656 if (dlm) {
657 spin_lock(&dlm->spinlock);
658
659 /* Yikes, this guy wants to cancel his join. No
660	 * problem, we simply clean up our join state. */
661 BUG_ON(dlm->joining_node != cancel->node_idx);
662 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
663
664 spin_unlock(&dlm->spinlock);
665 }
666 spin_unlock(&dlm_domain_lock);
667
668 return 0;
669}
670
671static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm,
672 unsigned int node)
673{
674 int status;
675 struct dlm_cancel_join cancel_msg;
676
677 memset(&cancel_msg, 0, sizeof(cancel_msg));
678 cancel_msg.node_idx = dlm->node_num;
679 cancel_msg.name_len = strlen(dlm->name);
680 memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len);
681
682 status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
683 &cancel_msg, sizeof(cancel_msg), node,
684 NULL);
685 if (status < 0) {
686 mlog_errno(status);
687 goto bail;
688 }
689
690bail:
691 return status;
692}
693
694/* map_size should be in bytes. */
695static int dlm_send_join_cancels(struct dlm_ctxt *dlm,
696 unsigned long *node_map,
697 unsigned int map_size)
698{
699 int status, tmpstat;
700 unsigned int node;
701
702 if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) *
703 sizeof(unsigned long))) {
704 mlog(ML_ERROR,
705 "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n",
706 map_size, BITS_TO_LONGS(O2NM_MAX_NODES));
707 return -EINVAL;
708 }
709
710 status = 0;
711 node = -1;
712 while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
713 node + 1)) < O2NM_MAX_NODES) {
714 if (node == dlm->node_num)
715 continue;
716
717 tmpstat = dlm_send_one_join_cancel(dlm, node);
718 if (tmpstat) {
719 mlog(ML_ERROR, "Error return %d cancelling join on "
720 "node %d\n", tmpstat, node);
721 if (!status)
722 status = tmpstat;
723 }
724 }
725
726 if (status)
727 mlog_errno(status);
728 return status;
729}
730
731static int dlm_request_join(struct dlm_ctxt *dlm,
732 int node,
733 enum dlm_query_join_response *response)
734{
735 int status, retval;
736 struct dlm_query_join_request join_msg;
737
738 mlog(0, "querying node %d\n", node);
739
740 memset(&join_msg, 0, sizeof(join_msg));
741 join_msg.node_idx = dlm->node_num;
742 join_msg.name_len = strlen(dlm->name);
743 memcpy(join_msg.domain, dlm->name, join_msg.name_len);
744
745 status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
746 sizeof(join_msg), node, &retval);
747 if (status < 0 && status != -ENOPROTOOPT) {
748 mlog_errno(status);
749 goto bail;
750 }
751
752 /* -ENOPROTOOPT from the net code means the other side isn't
753 listening for our message type -- that's fine, it means
754 his dlm isn't up, so we can consider him a 'yes' but not
755 joined into the domain. */
756 if (status == -ENOPROTOOPT) {
757 status = 0;
758 *response = JOIN_OK_NO_MAP;
759 } else if (retval == JOIN_DISALLOW ||
760 retval == JOIN_OK ||
761 retval == JOIN_OK_NO_MAP) {
762 *response = retval;
763 } else {
764 status = -EINVAL;
765 mlog(ML_ERROR, "invalid response %d from node %u\n", retval,
766 node);
767 }
768
769 mlog(0, "status %d, node %d response is %d\n", status, node,
770 *response);
771
772bail:
773 return status;
774}
775
776static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
777 unsigned int node)
778{
779 int status;
780 struct dlm_assert_joined assert_msg;
781
782 mlog(0, "Sending join assert to node %u\n", node);
783
784 memset(&assert_msg, 0, sizeof(assert_msg));
785 assert_msg.node_idx = dlm->node_num;
786 assert_msg.name_len = strlen(dlm->name);
787 memcpy(assert_msg.domain, dlm->name, assert_msg.name_len);
788
789 status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
790 &assert_msg, sizeof(assert_msg), node,
791 NULL);
792 if (status < 0)
793 mlog_errno(status);
794
795 return status;
796}
797
798static void dlm_send_join_asserts(struct dlm_ctxt *dlm,
799 unsigned long *node_map)
800{
801 int status, node, live;
802
803 status = 0;
804 node = -1;
805 while ((node = find_next_bit(node_map, O2NM_MAX_NODES,
806 node + 1)) < O2NM_MAX_NODES) {
807 if (node == dlm->node_num)
808 continue;
809
810 do {
811 /* It is very important that this message be
812 * received so we spin until either the node
813 * has died or it gets the message. */
814 status = dlm_send_one_join_assert(dlm, node);
815
816 spin_lock(&dlm->spinlock);
817 live = test_bit(node, dlm->live_nodes_map);
818 spin_unlock(&dlm->spinlock);
819
820 if (status) {
821 mlog(ML_ERROR, "Error return %d asserting "
822 "join on node %d\n", status, node);
823
824 /* give us some time between errors... */
825 if (live)
826 msleep(DLM_DOMAIN_BACKOFF_MS);
827 }
828 } while (status && live);
829 }
830}
831
832struct domain_join_ctxt {
833 unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
834 unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
835};
836
837static int dlm_should_restart_join(struct dlm_ctxt *dlm,
838 struct domain_join_ctxt *ctxt,
839 enum dlm_query_join_response response)
840{
841 int ret;
842
843 if (response == JOIN_DISALLOW) {
844 mlog(0, "Latest response of disallow -- should restart\n");
845 return 1;
846 }
847
848 spin_lock(&dlm->spinlock);
849 /* For now, we restart the process if the node maps have
850 * changed at all */
851 ret = memcmp(ctxt->live_map, dlm->live_nodes_map,
852 sizeof(dlm->live_nodes_map));
853 spin_unlock(&dlm->spinlock);
854
855 if (ret)
856 mlog(0, "Node maps changed -- should restart\n");
857
858 return ret;
859}
860
861static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
862{
863 int status = 0, tmpstat, node;
864 struct domain_join_ctxt *ctxt;
865 enum dlm_query_join_response response;
866
867 mlog_entry("%p", dlm);
868
869 ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL);
870 if (!ctxt) {
871 status = -ENOMEM;
872 mlog_errno(status);
873 goto bail;
874 }
875
876 /* group sem locking should work for us here -- we're already
877 * registered for heartbeat events so filling this should be
878 * atomic wrt getting those handlers called. */
879 o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map));
880
881 spin_lock(&dlm->spinlock);
882 memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map));
883
884 __dlm_set_joining_node(dlm, dlm->node_num);
885
886 spin_unlock(&dlm->spinlock);
887
888 node = -1;
889 while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES,
890 node + 1)) < O2NM_MAX_NODES) {
891 if (node == dlm->node_num)
892 continue;
893
894 status = dlm_request_join(dlm, node, &response);
895 if (status < 0) {
896 mlog_errno(status);
897 goto bail;
898 }
899
900 /* Ok, either we got a response or the node doesn't have a
901 * dlm up. */
902 if (response == JOIN_OK)
903 set_bit(node, ctxt->yes_resp_map);
904
905 if (dlm_should_restart_join(dlm, ctxt, response)) {
906 status = -EAGAIN;
907 goto bail;
908 }
909 }
910
911 mlog(0, "Yay, done querying nodes!\n");
912
913	 /* Yay, everyone agrees we can join the domain. My domain
914	 * comprises all the nodes that were put in the
915	 * yes_resp_map. Copy that into our domain map and send a join
916	 * assert message to clean up everyone else's state. */
917 spin_lock(&dlm->spinlock);
918 memcpy(dlm->domain_map, ctxt->yes_resp_map,
919 sizeof(ctxt->yes_resp_map));
920 set_bit(dlm->node_num, dlm->domain_map);
921 spin_unlock(&dlm->spinlock);
922
923 dlm_send_join_asserts(dlm, ctxt->yes_resp_map);
924
925 /* Joined state *must* be set before the joining node
926 * information, otherwise the query_join handler may read no
927 * current joiner but a state of NEW and tell joining nodes
928 * we're not in the domain. */
929 spin_lock(&dlm_domain_lock);
930 dlm->dlm_state = DLM_CTXT_JOINED;
931 dlm->num_joins++;
932 spin_unlock(&dlm_domain_lock);
933
934bail:
935 spin_lock(&dlm->spinlock);
936 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
937 if (!status)
938 __dlm_print_nodes(dlm);
939 spin_unlock(&dlm->spinlock);
940
941 if (ctxt) {
942 /* Do we need to send a cancel message to any nodes? */
943 if (status < 0) {
944 tmpstat = dlm_send_join_cancels(dlm,
945 ctxt->yes_resp_map,
946 sizeof(ctxt->yes_resp_map));
947 if (tmpstat < 0)
948 mlog_errno(tmpstat);
949 }
950 kfree(ctxt);
951 }
952
953 mlog(0, "returning %d\n", status);
954 return status;
955}
956
957static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
958{
959 o2hb_unregister_callback(&dlm->dlm_hb_up);
960 o2hb_unregister_callback(&dlm->dlm_hb_down);
961 o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
962}
963
964static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
965{
966 int status;
967
968 mlog(0, "registering handlers.\n");
969
970 o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
971 dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
972 status = o2hb_register_callback(&dlm->dlm_hb_down);
973 if (status)
974 goto bail;
975
976 o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
977 dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
978 status = o2hb_register_callback(&dlm->dlm_hb_up);
979 if (status)
980 goto bail;
981
982 status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key,
983 sizeof(struct dlm_master_request),
984 dlm_master_request_handler,
985 dlm, &dlm->dlm_domain_handlers);
986 if (status)
987 goto bail;
988
989 status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
990 sizeof(struct dlm_assert_master),
991 dlm_assert_master_handler,
992 dlm, &dlm->dlm_domain_handlers);
993 if (status)
994 goto bail;
995
996 status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key,
997 sizeof(struct dlm_create_lock),
998 dlm_create_lock_handler,
999 dlm, &dlm->dlm_domain_handlers);
1000 if (status)
1001 goto bail;
1002
1003 status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key,
1004 DLM_CONVERT_LOCK_MAX_LEN,
1005 dlm_convert_lock_handler,
1006 dlm, &dlm->dlm_domain_handlers);
1007 if (status)
1008 goto bail;
1009
1010 status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key,
1011 DLM_UNLOCK_LOCK_MAX_LEN,
1012 dlm_unlock_lock_handler,
1013 dlm, &dlm->dlm_domain_handlers);
1014 if (status)
1015 goto bail;
1016
1017 status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key,
1018 DLM_PROXY_AST_MAX_LEN,
1019 dlm_proxy_ast_handler,
1020 dlm, &dlm->dlm_domain_handlers);
1021 if (status)
1022 goto bail;
1023
1024 status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key,
1025 sizeof(struct dlm_exit_domain),
1026 dlm_exit_domain_handler,
1027 dlm, &dlm->dlm_domain_handlers);
1028 if (status)
1029 goto bail;
1030
1031 status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key,
1032 sizeof(struct dlm_migrate_request),
1033 dlm_migrate_request_handler,
1034 dlm, &dlm->dlm_domain_handlers);
1035 if (status)
1036 goto bail;
1037
1038 status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key,
1039 DLM_MIG_LOCKRES_MAX_LEN,
1040 dlm_mig_lockres_handler,
1041 dlm, &dlm->dlm_domain_handlers);
1042 if (status)
1043 goto bail;
1044
1045 status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key,
1046 sizeof(struct dlm_master_requery),
1047 dlm_master_requery_handler,
1048 dlm, &dlm->dlm_domain_handlers);
1049 if (status)
1050 goto bail;
1051
1052 status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key,
1053 sizeof(struct dlm_lock_request),
1054 dlm_request_all_locks_handler,
1055 dlm, &dlm->dlm_domain_handlers);
1056 if (status)
1057 goto bail;
1058
1059 status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key,
1060 sizeof(struct dlm_reco_data_done),
1061 dlm_reco_data_done_handler,
1062 dlm, &dlm->dlm_domain_handlers);
1063 if (status)
1064 goto bail;
1065
1066 status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key,
1067 sizeof(struct dlm_begin_reco),
1068 dlm_begin_reco_handler,
1069 dlm, &dlm->dlm_domain_handlers);
1070 if (status)
1071 goto bail;
1072
1073 status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key,
1074 sizeof(struct dlm_finalize_reco),
1075 dlm_finalize_reco_handler,
1076 dlm, &dlm->dlm_domain_handlers);
1077 if (status)
1078 goto bail;
1079
1080bail:
1081 if (status)
1082 dlm_unregister_domain_handlers(dlm);
1083
1084 return status;
1085}
1086
1087static int dlm_join_domain(struct dlm_ctxt *dlm)
1088{
1089 int status;
1090
1091 BUG_ON(!dlm);
1092
1093 mlog(0, "Join domain %s\n", dlm->name);
1094
1095 status = dlm_register_domain_handlers(dlm);
1096 if (status) {
1097 mlog_errno(status);
1098 goto bail;
1099 }
1100
1101 status = dlm_launch_thread(dlm);
1102 if (status < 0) {
1103 mlog_errno(status);
1104 goto bail;
1105 }
1106
1107 status = dlm_launch_recovery_thread(dlm);
1108 if (status < 0) {
1109 mlog_errno(status);
1110 goto bail;
1111 }
1112
1113 do {
1114 unsigned int backoff;
1115 status = dlm_try_to_join_domain(dlm);
1116
1117 /* If we're racing another node to the join, then we
1118 * need to back off temporarily and let them
1119 * complete. */
1120 if (status == -EAGAIN) {
1121 if (signal_pending(current)) {
1122 status = -ERESTARTSYS;
1123 goto bail;
1124 }
1125
1126 /*
1127 * <chip> After you!
1128 * <dale> No, after you!
1129 * <chip> I insist!
1130 * <dale> But you first!
1131 * ...
1132 */
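			/* pseudo-random backoff: 0 to 3 units of
			 * DLM_DOMAIN_BACKOFF_MS, taken from the low
			 * bits of jiffies */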
1133 backoff = (unsigned int)(jiffies & 0x3);
1134 backoff *= DLM_DOMAIN_BACKOFF_MS;
1135 mlog(0, "backoff %d\n", backoff);
1136 msleep(backoff);
1137 }
1138 } while (status == -EAGAIN);
1139
1140 if (status < 0) {
1141 mlog_errno(status);
1142 goto bail;
1143 }
1144
1145 status = 0;
1146bail:
1147 wake_up(&dlm_domain_events);
1148
1149 if (status) {
1150 dlm_unregister_domain_handlers(dlm);
1151 dlm_complete_thread(dlm);
1152 dlm_complete_recovery_thread(dlm);
1153 }
1154
1155 return status;
1156}
1157
1158static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1159 u32 key)
1160{
1161 int i;
1162 struct dlm_ctxt *dlm = NULL;
1163
1164 dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL);
1165 if (!dlm) {
1166 mlog_errno(-ENOMEM);
1167 goto leave;
1168 }
1169
1170 dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL);
1171 if (dlm->name == NULL) {
1172 mlog_errno(-ENOMEM);
1173 kfree(dlm);
1174 dlm = NULL;
1175 goto leave;
1176 }
1177
1178 dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
1179 if (!dlm->resources) {
1180 mlog_errno(-ENOMEM);
1181 kfree(dlm->name);
1182 kfree(dlm);
1183 dlm = NULL;
1184 goto leave;
1185 }
1186 memset(dlm->resources, 0, PAGE_SIZE);
1187
1188 for (i=0; i<DLM_HASH_SIZE; i++)
1189 INIT_LIST_HEAD(&dlm->resources[i]);
1190
1191 strcpy(dlm->name, domain);
1192 dlm->key = key;
1193 dlm->node_num = o2nm_this_node();
1194
1195 spin_lock_init(&dlm->spinlock);
1196 spin_lock_init(&dlm->master_lock);
1197 spin_lock_init(&dlm->ast_lock);
1198 INIT_LIST_HEAD(&dlm->list);
1199 INIT_LIST_HEAD(&dlm->dirty_list);
1200 INIT_LIST_HEAD(&dlm->reco.resources);
1201 INIT_LIST_HEAD(&dlm->reco.received);
1202 INIT_LIST_HEAD(&dlm->reco.node_data);
1203 INIT_LIST_HEAD(&dlm->purge_list);
1204 INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
1205 dlm->reco.state = 0;
1206
1207 INIT_LIST_HEAD(&dlm->pending_asts);
1208 INIT_LIST_HEAD(&dlm->pending_basts);
1209
1210 mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n",
1211 dlm->recovery_map, &(dlm->recovery_map[0]));
1212
1213 memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map));
1214 memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map));
1215 memset(dlm->domain_map, 0, sizeof(dlm->domain_map));
1216
1217 dlm->dlm_thread_task = NULL;
1218 dlm->dlm_reco_thread_task = NULL;
1219 init_waitqueue_head(&dlm->dlm_thread_wq);
1220 init_waitqueue_head(&dlm->dlm_reco_thread_wq);
1221 init_waitqueue_head(&dlm->reco.event);
1222 init_waitqueue_head(&dlm->ast_wq);
1223 init_waitqueue_head(&dlm->migration_wq);
1224 INIT_LIST_HEAD(&dlm->master_list);
1225 INIT_LIST_HEAD(&dlm->mle_hb_events);
1226
1227 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
1228 init_waitqueue_head(&dlm->dlm_join_events);
1229
1230 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
1231 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
1232 atomic_set(&dlm->local_resources, 0);
1233 atomic_set(&dlm->remote_resources, 0);
1234 atomic_set(&dlm->unknown_resources, 0);
1235
1236 spin_lock_init(&dlm->work_lock);
1237 INIT_LIST_HEAD(&dlm->work_list);
1238 INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm);
1239
1240 kref_init(&dlm->dlm_refs);
1241 dlm->dlm_state = DLM_CTXT_NEW;
1242
1243 INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
1244
1245 mlog(0, "context init: refcount %u\n",
1246 atomic_read(&dlm->dlm_refs.refcount));
1247
1248leave:
1249 return dlm;
1250}
1251
1252/*
1253 * dlm_register_domain: one-time setup per "domain"
1254 */
1255struct dlm_ctxt * dlm_register_domain(const char *domain,
1256 u32 key)
1257{
1258 int ret;
1259 struct dlm_ctxt *dlm = NULL;
1260 struct dlm_ctxt *new_ctxt = NULL;
1261
1262 if (strlen(domain) > O2NM_MAX_NAME_LEN) {
1263 ret = -ENAMETOOLONG;
1264		mlog(ML_ERROR, "domain name too long\n");
1265 goto leave;
1266 }
1267
1268 if (!o2hb_check_local_node_heartbeating()) {
1269 mlog(ML_ERROR, "the local node has not been configured, or is "
1270 "not heartbeating\n");
1271 ret = -EPROTO;
1272 goto leave;
1273 }
1274
1275 mlog(0, "register called for domain \"%s\"\n", domain);
1276
1277retry:
1278 dlm = NULL;
1279 if (signal_pending(current)) {
1280 ret = -ERESTARTSYS;
1281 mlog_errno(ret);
1282 goto leave;
1283 }
1284
1285 spin_lock(&dlm_domain_lock);
1286
1287 dlm = __dlm_lookup_domain(domain);
1288 if (dlm) {
1289 if (dlm->dlm_state != DLM_CTXT_JOINED) {
1290 spin_unlock(&dlm_domain_lock);
1291
1292 mlog(0, "This ctxt is not joined yet!\n");
1293 wait_event_interruptible(dlm_domain_events,
1294 dlm_wait_on_domain_helper(
1295 domain));
1296 goto retry;
1297 }
1298
1299 __dlm_get(dlm);
1300 dlm->num_joins++;
1301
1302 spin_unlock(&dlm_domain_lock);
1303
1304 ret = 0;
1305 goto leave;
1306 }
1307
1308 /* doesn't exist */
1309 if (!new_ctxt) {
1310 spin_unlock(&dlm_domain_lock);
1311
1312 new_ctxt = dlm_alloc_ctxt(domain, key);
1313 if (new_ctxt)
1314 goto retry;
1315
1316 ret = -ENOMEM;
1317 mlog_errno(ret);
1318 goto leave;
1319 }
1320
1321 /* a little variable switch-a-roo here... */
1322 dlm = new_ctxt;
1323 new_ctxt = NULL;
1324
1325 /* add the new domain */
1326 list_add_tail(&dlm->list, &dlm_domains);
1327 spin_unlock(&dlm_domain_lock);
1328
1329 ret = dlm_join_domain(dlm);
1330 if (ret) {
1331 mlog_errno(ret);
1332 dlm_put(dlm);
1333 goto leave;
1334 }
1335
1336 ret = 0;
1337leave:
1338 if (new_ctxt)
1339 dlm_free_ctxt_mem(new_ctxt);
1340
1341 if (ret < 0)
1342 dlm = ERR_PTR(ret);
1343
1344 return dlm;
1345}
1346EXPORT_SYMBOL_GPL(dlm_register_domain);
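
/*
 * A minimal caller sketch (illustrative only; example_use_domain,
 * "my_domain" and MY_KEY are hypothetical names). dlm_register_domain()
 * returns a referenced, joined context or an ERR_PTR, and every
 * successful registration must be balanced by a matching
 * dlm_unregister_domain().
 */
#if 0
static int example_use_domain(void)
{
	struct dlm_ctxt *dlm;

	dlm = dlm_register_domain("my_domain", MY_KEY);
	if (IS_ERR(dlm))
		return PTR_ERR(dlm);

	/* ... create and use locks against this domain ... */

	dlm_unregister_domain(dlm);	/* drops our join and reference */
	return 0;
}
#endif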
1347
1348static LIST_HEAD(dlm_join_handlers);
1349
1350static void dlm_unregister_net_handlers(void)
1351{
1352 o2net_unregister_handler_list(&dlm_join_handlers);
1353}
1354
1355static int dlm_register_net_handlers(void)
1356{
1357 int status = 0;
1358
1359 status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY,
1360 sizeof(struct dlm_query_join_request),
1361 dlm_query_join_handler,
1362 NULL, &dlm_join_handlers);
1363 if (status)
1364 goto bail;
1365
1366 status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
1367 sizeof(struct dlm_assert_joined),
1368 dlm_assert_joined_handler,
1369 NULL, &dlm_join_handlers);
1370 if (status)
1371 goto bail;
1372
1373 status = o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY,
1374 sizeof(struct dlm_cancel_join),
1375 dlm_cancel_join_handler,
1376 NULL, &dlm_join_handlers);
1377
1378bail:
1379 if (status < 0)
1380 dlm_unregister_net_handlers();
1381
1382 return status;
1383}
1384
1385/* Domain eviction callback handling.
1386 *
1387 * The file system requires notification of node death *before* the
1388 * dlm completes it's recovery work, otherwise it may be able to
1389 * acquire locks on resources requiring recovery. Since the dlm can
1390 * evict a node from it's domain *before* heartbeat fires, a similar
1391 * mechanism is required. */
1392
1393/* Eviction is not expected to happen often, so a per-domain lock is
1394 * not necessary. Eviction callbacks are allowed to sleep for short
1395 * periods of time. */
1396static DECLARE_RWSEM(dlm_callback_sem);
1397
1398void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
1399 int node_num)
1400{
1401 struct list_head *iter;
1402 struct dlm_eviction_cb *cb;
1403
1404 down_read(&dlm_callback_sem);
1405 list_for_each(iter, &dlm->dlm_eviction_callbacks) {
1406 cb = list_entry(iter, struct dlm_eviction_cb, ec_item);
1407
1408 cb->ec_func(node_num, cb->ec_data);
1409 }
1410 up_read(&dlm_callback_sem);
1411}
1412
1413void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
1414 dlm_eviction_func *f,
1415 void *data)
1416{
1417 INIT_LIST_HEAD(&cb->ec_item);
1418 cb->ec_func = f;
1419 cb->ec_data = data;
1420}
1421EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb);
1422
1423void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
1424 struct dlm_eviction_cb *cb)
1425{
1426 down_write(&dlm_callback_sem);
1427 list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks);
1428 up_write(&dlm_callback_sem);
1429}
1430EXPORT_SYMBOL_GPL(dlm_register_eviction_cb);
1431
1432void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb)
1433{
1434 down_write(&dlm_callback_sem);
1435 list_del_init(&cb->ec_item);
1436 up_write(&dlm_callback_sem);
1437}
1438EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb);
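
/*
 * Sketch of the intended usage (illustrative; example_evict_func and
 * its NULL data pointer are hypothetical). A client initializes a
 * dlm_eviction_cb with dlm_setup_eviction_cb() and attaches it with
 * dlm_register_eviction_cb(); ec_func then runs from
 * dlm_fire_domain_eviction_callbacks() when a node is evicted, before
 * dlm recovery completes.
 */
#if 0
static void example_evict_func(int node_num, void *data)
{
	/* react to node_num's eviction, e.g. invalidate local state */
}

static struct dlm_eviction_cb example_cb;

static void example_attach(struct dlm_ctxt *dlm)
{
	dlm_setup_eviction_cb(&example_cb, example_evict_func, NULL);
	dlm_register_eviction_cb(dlm, &example_cb);
	/* later: dlm_unregister_eviction_cb(&example_cb); */
}
#endif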
1439
1440static int __init dlm_init(void)
1441{
1442 int status;
1443
1444 dlm_print_version();
1445
1446 status = dlm_init_mle_cache();
1447 if (status)
1448 return -1;
1449
1450 status = dlm_register_net_handlers();
1451 if (status) {
1452 dlm_destroy_mle_cache();
1453 return -1;
1454 }
1455
1456 return 0;
1457}
1458
1459static void __exit dlm_exit (void)
1460{
1461 dlm_unregister_net_handlers();
1462 dlm_destroy_mle_cache();
1463}
1464
1465MODULE_AUTHOR("Oracle");
1466MODULE_LICENSE("GPL");
1467
1468module_init(dlm_init);
1469module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
new file mode 100644
index 000000000000..2f7f60bfeb3b
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -0,0 +1,36 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdomain.h
5 *
6 * Copyright (C) 2004 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 021110-1307, USA.
22 *
23 */
24
25#ifndef DLMDOMAIN_H
26#define DLMDOMAIN_H
27
28extern spinlock_t dlm_domain_lock;
29extern struct list_head dlm_domains;
30
31int dlm_joined(struct dlm_ctxt *dlm);
32int dlm_shutting_down(struct dlm_ctxt *dlm);
33void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
34 int node_num);
35
36#endif
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
new file mode 100644
index 000000000000..dd2d24dc25e0
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -0,0 +1,640 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfs.c
5 *
6 * Code which implements the kernel side of a minimal userspace
7 * interface to our DLM. This file handles the virtual file system
8 * used for communication with userspace. Credit should go to ramfs,
9 * which was a template for the fs side of this module.
10 *
11 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public
15 * License as published by the Free Software Foundation; either
16 * version 2 of the License, or (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public
24 * License along with this program; if not, write to the
25 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26 * Boston, MA 021110-1307, USA.
27 */
28
29/* Simple VFS hooks based on: */
30/*
31 * Resizable simple ram filesystem for Linux.
32 *
33 * Copyright (C) 2000 Linus Torvalds.
34 * 2000 Transmeta Corp.
35 */
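
/*
 * From userspace the whole interface is ordinary VFS calls. The
 * sketch below is illustrative only -- the "/dlm" mount point and the
 * names are made up, and the 64-byte buffer assumes DLM_LVB_LEN:
 * mkdir() at the top level registers a dlm domain, open() of a file
 * inside it takes a cluster lock (O_RDONLY -> shared, O_WRONLY/O_RDWR
 * -> exclusive, O_NONBLOCK -> trylock), read()/write() access the
 * lock value block, and the final close() drops the lock.
 */
#if 0	/* userspace example, not kernel code */
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

int example(void)
{
	char lvb[64] = "hello";	/* lock value block contents */
	int fd;

	mkdir("/dlm/mydomain", 0755);		/* register a domain */
	fd = open("/dlm/mydomain/mylock",
		  O_CREAT | O_WRONLY, 0644);	/* take an EX lock */
	if (fd < 0)
		return -1;
	write(fd, lvb, sizeof(lvb));		/* update the LVB */
	close(fd);				/* drop the lock */
	return 0;
}
#endif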
36
37#include <linux/module.h>
38#include <linux/fs.h>
39#include <linux/pagemap.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43#include <linux/init.h>
44#include <linux/string.h>
45#include <linux/smp_lock.h>
46#include <linux/backing-dev.h>
47
48#include <asm/uaccess.h>
49
50
51#include "cluster/nodemanager.h"
52#include "cluster/heartbeat.h"
53#include "cluster/tcp.h"
54
55#include "dlmapi.h"
56
57#include "userdlm.h"
58
59#include "dlmfsver.h"
60
61#define MLOG_MASK_PREFIX ML_DLMFS
62#include "cluster/masklog.h"
63
64static struct super_operations dlmfs_ops;
65static struct file_operations dlmfs_file_operations;
66static struct inode_operations dlmfs_dir_inode_operations;
67static struct inode_operations dlmfs_root_inode_operations;
68static struct inode_operations dlmfs_file_inode_operations;
69static kmem_cache_t *dlmfs_inode_cache;
70
71struct workqueue_struct *user_dlm_worker;
72
73/*
74 * decodes a set of open flags into a valid lock level and a set of flags.
75 * returns < 0 if we have invalid flags
76 * flags which mean something to us:
77 * O_RDONLY -> PRMODE level
78 * O_WRONLY -> EXMODE level
79 *
80 * O_NONBLOCK -> LKM_NOQUEUE
81 */
82static int dlmfs_decode_open_flags(int open_flags,
83 int *level,
84 int *flags)
85{
86 if (open_flags & (O_WRONLY|O_RDWR))
87 *level = LKM_EXMODE;
88 else
89 *level = LKM_PRMODE;
90
91 *flags = 0;
92 if (open_flags & O_NONBLOCK)
93 *flags |= LKM_NOQUEUE;
94
95 return 0;
96}
97
98static int dlmfs_file_open(struct inode *inode,
99 struct file *file)
100{
101 int status, level, flags;
102 struct dlmfs_filp_private *fp = NULL;
103 struct dlmfs_inode_private *ip;
104
105 if (S_ISDIR(inode->i_mode))
106 BUG();
107
108 mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
109 file->f_flags);
110
111 status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
112 if (status < 0)
113 goto bail;
114
115 /* We don't want to honor O_APPEND at read/write time as it
116 * doesn't make sense for LVB writes. */
117 file->f_flags &= ~O_APPEND;
118
119 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
120 if (!fp) {
121 status = -ENOMEM;
122 goto bail;
123 }
124 fp->fp_lock_level = level;
125
126 ip = DLMFS_I(inode);
127
128 status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
129 if (status < 0) {
130		/* this is a strange error to return here, but I want
131		 * userspace to be able to distinguish a
132 * valid lock request from one that simply couldn't be
133 * granted. */
134 if (flags & LKM_NOQUEUE && status == -EAGAIN)
135 status = -ETXTBSY;
136 kfree(fp);
137 goto bail;
138 }
139
140 file->private_data = fp;
141bail:
142 return status;
143}
144
145static int dlmfs_file_release(struct inode *inode,
146 struct file *file)
147{
148 int level, status;
149 struct dlmfs_inode_private *ip = DLMFS_I(inode);
150 struct dlmfs_filp_private *fp =
151 (struct dlmfs_filp_private *) file->private_data;
152
153 if (S_ISDIR(inode->i_mode))
154 BUG();
155
156 mlog(0, "close called on inode %lu\n", inode->i_ino);
157
158 status = 0;
159 if (fp) {
160 level = fp->fp_lock_level;
161 if (level != LKM_IVMODE)
162 user_dlm_cluster_unlock(&ip->ip_lockres, level);
163
164 kfree(fp);
165 file->private_data = NULL;
166 }
167
168 return 0;
169}
170
171static ssize_t dlmfs_file_read(struct file *filp,
172 char __user *buf,
173 size_t count,
174 loff_t *ppos)
175{
176 int bytes_left;
177 ssize_t readlen;
178 char *lvb_buf;
179 struct inode *inode = filp->f_dentry->d_inode;
180
181 mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
182 inode->i_ino, count, *ppos);
183
184 if (*ppos >= i_size_read(inode))
185 return 0;
186
187 if (!count)
188 return 0;
189
190 if (!access_ok(VERIFY_WRITE, buf, count))
191 return -EFAULT;
192
193 /* don't read past the lvb */
194 if ((count + *ppos) > i_size_read(inode))
195 readlen = i_size_read(inode) - *ppos;
196 else
197		readlen = count;
198
199 lvb_buf = kmalloc(readlen, GFP_KERNEL);
200 if (!lvb_buf)
201 return -ENOMEM;
202
203 user_dlm_read_lvb(inode, lvb_buf, readlen);
204 bytes_left = __copy_to_user(buf, lvb_buf, readlen);
205 readlen -= bytes_left;
206
207 kfree(lvb_buf);
208
209 *ppos = *ppos + readlen;
210
211 mlog(0, "read %zd bytes\n", readlen);
212 return readlen;
213}
214
215static ssize_t dlmfs_file_write(struct file *filp,
216 const char __user *buf,
217 size_t count,
218 loff_t *ppos)
219{
220 int bytes_left;
221 ssize_t writelen;
222 char *lvb_buf;
223 struct inode *inode = filp->f_dentry->d_inode;
224
225 mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
226 inode->i_ino, count, *ppos);
227
228 if (*ppos >= i_size_read(inode))
229 return -ENOSPC;
230
231 if (!count)
232 return 0;
233
234 if (!access_ok(VERIFY_READ, buf, count))
235 return -EFAULT;
236
237 /* don't write past the lvb */
238 if ((count + *ppos) > i_size_read(inode))
239 writelen = i_size_read(inode) - *ppos;
240 else
241		writelen = count;
242
243 lvb_buf = kmalloc(writelen, GFP_KERNEL);
244 if (!lvb_buf)
245 return -ENOMEM;
246
247 bytes_left = copy_from_user(lvb_buf, buf, writelen);
248 writelen -= bytes_left;
249 if (writelen)
250 user_dlm_write_lvb(inode, lvb_buf, writelen);
251
252 kfree(lvb_buf);
253
254 *ppos = *ppos + writelen;
255 mlog(0, "wrote %zd bytes\n", writelen);
256 return writelen;
257}
258
259static void dlmfs_init_once(void *foo,
260 kmem_cache_t *cachep,
261 unsigned long flags)
262{
263 struct dlmfs_inode_private *ip =
264 (struct dlmfs_inode_private *) foo;
265
266 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
267 SLAB_CTOR_CONSTRUCTOR) {
268 ip->ip_dlm = NULL;
269 ip->ip_parent = NULL;
270
271 inode_init_once(&ip->ip_vfs_inode);
272 }
273}
274
275static struct inode *dlmfs_alloc_inode(struct super_block *sb)
276{
277 struct dlmfs_inode_private *ip;
278
279 ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
280 if (!ip)
281 return NULL;
282
283 return &ip->ip_vfs_inode;
284}
285
286static void dlmfs_destroy_inode(struct inode *inode)
287{
288 kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
289}
290
291static void dlmfs_clear_inode(struct inode *inode)
292{
293 int status;
294 struct dlmfs_inode_private *ip;
295
296 if (!inode)
297 return;
298
299 mlog(0, "inode %lu\n", inode->i_ino);
300
301 ip = DLMFS_I(inode);
302
303 if (S_ISREG(inode->i_mode)) {
304 status = user_dlm_destroy_lock(&ip->ip_lockres);
305 if (status < 0)
306 mlog_errno(status);
307 iput(ip->ip_parent);
308 goto clear_fields;
309 }
310
311 mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
312	/* we must be a directory. If required, let's unregister the
313 * dlm context now. */
314 if (ip->ip_dlm)
315 user_dlm_unregister_context(ip->ip_dlm);
316clear_fields:
317 ip->ip_parent = NULL;
318 ip->ip_dlm = NULL;
319}
320
321static struct backing_dev_info dlmfs_backing_dev_info = {
322 .ra_pages = 0, /* No readahead */
323 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
324};
325
326static struct inode *dlmfs_get_root_inode(struct super_block *sb)
327{
328 struct inode *inode = new_inode(sb);
329 int mode = S_IFDIR | 0755;
330 struct dlmfs_inode_private *ip;
331
332 if (inode) {
333 ip = DLMFS_I(inode);
334
335 inode->i_mode = mode;
336 inode->i_uid = current->fsuid;
337 inode->i_gid = current->fsgid;
338 inode->i_blksize = PAGE_CACHE_SIZE;
339 inode->i_blocks = 0;
340 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
341 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
342 inode->i_nlink++;
343
344 inode->i_fop = &simple_dir_operations;
345 inode->i_op = &dlmfs_root_inode_operations;
346 }
347
348 return inode;
349}
350
351static struct inode *dlmfs_get_inode(struct inode *parent,
352 struct dentry *dentry,
353 int mode)
354{
355 struct super_block *sb = parent->i_sb;
356 struct inode * inode = new_inode(sb);
357 struct dlmfs_inode_private *ip;
358
359 if (!inode)
360 return NULL;
361
362 inode->i_mode = mode;
363 inode->i_uid = current->fsuid;
364 inode->i_gid = current->fsgid;
365 inode->i_blksize = PAGE_CACHE_SIZE;
366 inode->i_blocks = 0;
367 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
368 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
369
370 ip = DLMFS_I(inode);
371 ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
372
373 switch (mode & S_IFMT) {
374 default:
375 /* for now we don't support anything other than
376 * directories and regular files. */
377 BUG();
378 break;
379 case S_IFREG:
380 inode->i_op = &dlmfs_file_inode_operations;
381 inode->i_fop = &dlmfs_file_operations;
382
383 i_size_write(inode, DLM_LVB_LEN);
384
385 user_dlm_lock_res_init(&ip->ip_lockres, dentry);
386
387		/* released at clear_inode time, this ensures that we
388 * get to drop the dlm reference on each lock *before*
389 * we call the unregister code for releasing parent
390 * directories. */
391 ip->ip_parent = igrab(parent);
392 BUG_ON(!ip->ip_parent);
393 break;
394 case S_IFDIR:
395 inode->i_op = &dlmfs_dir_inode_operations;
396 inode->i_fop = &simple_dir_operations;
397
398 /* directory inodes start off with i_nlink ==
399 * 2 (for "." entry) */
400 inode->i_nlink++;
401 break;
402 }
403
404 if (parent->i_mode & S_ISGID) {
405 inode->i_gid = parent->i_gid;
406 if (S_ISDIR(mode))
407 inode->i_mode |= S_ISGID;
408 }
409
410 return inode;
411}
412
413/*
414 * File creation. Allocate an inode, and we're done.
415 */
416/* SMP-safe */
417static int dlmfs_mkdir(struct inode * dir,
418 struct dentry * dentry,
419 int mode)
420{
421 int status;
422 struct inode *inode = NULL;
423 struct qstr *domain = &dentry->d_name;
424 struct dlmfs_inode_private *ip;
425 struct dlm_ctxt *dlm;
426
427 mlog(0, "mkdir %.*s\n", domain->len, domain->name);
428
429 /* verify that we have a proper domain */
430 if (domain->len >= O2NM_MAX_NAME_LEN) {
431 status = -EINVAL;
432 mlog(ML_ERROR, "invalid domain name for directory.\n");
433 goto bail;
434 }
435
436 inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
437 if (!inode) {
438 status = -ENOMEM;
439 mlog_errno(status);
440 goto bail;
441 }
442
443 ip = DLMFS_I(inode);
444
445 dlm = user_dlm_register_context(domain);
446 if (IS_ERR(dlm)) {
447 status = PTR_ERR(dlm);
448 mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
449 status, domain->len, domain->name);
450 goto bail;
451 }
452 ip->ip_dlm = dlm;
453
454 dir->i_nlink++;
455 d_instantiate(dentry, inode);
456 dget(dentry); /* Extra count - pin the dentry in core */
457
458 status = 0;
459bail:
460 if (status < 0)
461 iput(inode);
462 return status;
463}
464
465static int dlmfs_create(struct inode *dir,
466 struct dentry *dentry,
467 int mode,
468 struct nameidata *nd)
469{
470 int status = 0;
471 struct inode *inode;
472 struct qstr *name = &dentry->d_name;
473
474 mlog(0, "create %.*s\n", name->len, name->name);
475
476 /* verify name is valid and doesn't contain any dlm reserved
477 * characters */
478 if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
479 name->name[0] == '$') {
480 status = -EINVAL;
481 mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
482 name->name);
483 goto bail;
484 }
485
486 inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
487 if (!inode) {
488 status = -ENOMEM;
489 mlog_errno(status);
490 goto bail;
491 }
492
493 d_instantiate(dentry, inode);
494 dget(dentry); /* Extra count - pin the dentry in core */
495bail:
496 return status;
497}
498
499static int dlmfs_unlink(struct inode *dir,
500 struct dentry *dentry)
501{
502 int status;
503 struct inode *inode = dentry->d_inode;
504
505 mlog(0, "unlink inode %lu\n", inode->i_ino);
506
507	/* if there are no current holders and none waiting
508	 * to acquire the lock, this basically destroys our lockres. */
509 status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
510 if (status < 0) {
511 mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
512 dentry->d_name.len, dentry->d_name.name, status);
513 goto bail;
514 }
515 status = simple_unlink(dir, dentry);
516bail:
517 return status;
518}
519
520static int dlmfs_fill_super(struct super_block * sb,
521 void * data,
522 int silent)
523{
524 struct inode * inode;
525 struct dentry * root;
526
527 sb->s_maxbytes = MAX_LFS_FILESIZE;
528 sb->s_blocksize = PAGE_CACHE_SIZE;
529 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
530 sb->s_magic = DLMFS_MAGIC;
531 sb->s_op = &dlmfs_ops;
532 inode = dlmfs_get_root_inode(sb);
533 if (!inode)
534 return -ENOMEM;
535
536 root = d_alloc_root(inode);
537 if (!root) {
538 iput(inode);
539 return -ENOMEM;
540 }
541 sb->s_root = root;
542 return 0;
543}
544
545static struct file_operations dlmfs_file_operations = {
546 .open = dlmfs_file_open,
547 .release = dlmfs_file_release,
548 .read = dlmfs_file_read,
549 .write = dlmfs_file_write,
550};
551
552static struct inode_operations dlmfs_dir_inode_operations = {
553 .create = dlmfs_create,
554 .lookup = simple_lookup,
555 .unlink = dlmfs_unlink,
556};
557
558/* this way we can restrict mkdir to only the toplevel of the fs. */
559static struct inode_operations dlmfs_root_inode_operations = {
560 .lookup = simple_lookup,
561 .mkdir = dlmfs_mkdir,
562 .rmdir = simple_rmdir,
563};
564
565static struct super_operations dlmfs_ops = {
566 .statfs = simple_statfs,
567 .alloc_inode = dlmfs_alloc_inode,
568 .destroy_inode = dlmfs_destroy_inode,
569 .clear_inode = dlmfs_clear_inode,
570 .drop_inode = generic_delete_inode,
571};
572
573static struct inode_operations dlmfs_file_inode_operations = {
574 .getattr = simple_getattr,
575};
576
577static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
578 int flags, const char *dev_name, void *data)
579{
580 return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
581}
582
583static struct file_system_type dlmfs_fs_type = {
584 .owner = THIS_MODULE,
585 .name = "ocfs2_dlmfs",
586 .get_sb = dlmfs_get_sb,
587 .kill_sb = kill_litter_super,
588};
589
590static int __init init_dlmfs_fs(void)
591{
592 int status;
593 int cleanup_inode = 0, cleanup_worker = 0;
594
595 dlmfs_print_version();
596
597 dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
598 sizeof(struct dlmfs_inode_private),
599 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
600 dlmfs_init_once, NULL);
601 if (!dlmfs_inode_cache)
602 return -ENOMEM;
603 cleanup_inode = 1;
604
605 user_dlm_worker = create_singlethread_workqueue("user_dlm");
606 if (!user_dlm_worker) {
607 status = -ENOMEM;
608 goto bail;
609 }
610 cleanup_worker = 1;
611
612 status = register_filesystem(&dlmfs_fs_type);
613bail:
614 if (status) {
615 if (cleanup_inode)
616 kmem_cache_destroy(dlmfs_inode_cache);
617 if (cleanup_worker)
618 destroy_workqueue(user_dlm_worker);
619 } else
620		printk(KERN_INFO "OCFS2 User DLM kernel interface loaded\n");
621 return status;
622}
623
624static void __exit exit_dlmfs_fs(void)
625{
626 unregister_filesystem(&dlmfs_fs_type);
627
628 flush_workqueue(user_dlm_worker);
629 destroy_workqueue(user_dlm_worker);
630
631 if (kmem_cache_destroy(dlmfs_inode_cache))
632 printk(KERN_INFO "dlmfs_inode_cache: not all structures "
633 "were freed\n");
634}
635
636MODULE_AUTHOR("Oracle");
637MODULE_LICENSE("GPL");
638
639module_init(init_dlmfs_fs)
640module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c
new file mode 100644
index 000000000000..d2be3ad841f9
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmfsver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmfsver.h"
30
31#define DLM_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
34
35void dlmfs_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h
new file mode 100644
index 000000000000..f35eadbed25c
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmfsver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef DLMFS_VER_H
27#define DLMFS_VER_H
28
29void dlmfs_print_version(void);
30
31#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
new file mode 100644
index 000000000000..d1a0038557a3
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -0,0 +1,676 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmlock.c
5 *
6 * underlying calls for lock creation
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50
51#include "dlmconvert.h"
52
53#define MLOG_MASK_PREFIX ML_DLM
54#include "cluster/masklog.h"
55
56static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
57static u64 dlm_next_cookie = 1;
58
59static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
60 struct dlm_lock_resource *res,
61 struct dlm_lock *lock, int flags);
62static void dlm_init_lock(struct dlm_lock *newlock, int type,
63 u8 node, u64 cookie);
64static void dlm_lock_release(struct kref *kref);
65static void dlm_lock_detach_lockres(struct dlm_lock *lock);
66
67/* Tell us whether we can grant a new lock request.
68 * locking:
69 * caller needs: res->spinlock
70 * taken: none
71 * held on exit: none
72 * returns: 1 if the lock can be granted, 0 otherwise.
73 */
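/* For example, assuming the standard DLM mode semantics, a new PR
 * request is grantable alongside already-granted PR locks, but not
 * while an EX lock is granted or a convert to EX is queued; the
 * exact compatibility matrix lives in dlm_lock_compatible(). */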
74static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
75 struct dlm_lock *lock)
76{
77 struct list_head *iter;
78 struct dlm_lock *tmplock;
79
80 list_for_each(iter, &res->granted) {
81 tmplock = list_entry(iter, struct dlm_lock, list);
82
83 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
84 return 0;
85 }
86
87 list_for_each(iter, &res->converting) {
88 tmplock = list_entry(iter, struct dlm_lock, list);
89
90 if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
91 return 0;
92 }
93
94 return 1;
95}
96
97/* performs lock creation at the lockres master site
98 * locking:
99 * caller needs: none
100 * taken: takes and drops res->spinlock
101 * held on exit: none
102 * returns: DLM_NORMAL, DLM_NOTQUEUED
103 */
104static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
105 struct dlm_lock_resource *res,
106 struct dlm_lock *lock, int flags)
107{
108 int call_ast = 0, kick_thread = 0;
109 enum dlm_status status = DLM_NORMAL;
110
111 mlog_entry("type=%d\n", lock->ml.type);
112
113 spin_lock(&res->spinlock);
114 /* if called from dlm_create_lock_handler, need to
115 * ensure it will not sleep in dlm_wait_on_lockres */
116 status = __dlm_lockres_state_to_status(res);
117 if (status != DLM_NORMAL &&
118 lock->ml.node != dlm->node_num) {
119 /* erf. state changed after lock was dropped. */
120 spin_unlock(&res->spinlock);
121 dlm_error(status);
122 return status;
123 }
124 __dlm_wait_on_lockres(res);
125 __dlm_lockres_reserve_ast(res);
126
127 if (dlm_can_grant_new_lock(res, lock)) {
128 mlog(0, "I can grant this lock right away\n");
129 /* got it right away */
130 lock->lksb->status = DLM_NORMAL;
131 status = DLM_NORMAL;
132 dlm_lock_get(lock);
133 list_add_tail(&lock->list, &res->granted);
134
135 /* for the recovery lock, we can't allow the ast
136 * to be queued since the dlmthread is already
137 * frozen. but the recovery lock is always locked
138 * with LKM_NOQUEUE so we do not need the ast in
139 * this special case */
140 if (!dlm_is_recovery_lock(res->lockname.name,
141 res->lockname.len)) {
142 kick_thread = 1;
143 call_ast = 1;
144 }
145 } else {
146 /* for NOQUEUE request, unless we get the
147 * lock right away, return DLM_NOTQUEUED */
148 if (flags & LKM_NOQUEUE)
149 status = DLM_NOTQUEUED;
150 else {
151 dlm_lock_get(lock);
152 list_add_tail(&lock->list, &res->blocked);
153 kick_thread = 1;
154 }
155 }
156
157 spin_unlock(&res->spinlock);
158 wake_up(&res->wq);
159
160 /* either queue the ast or release it */
161 if (call_ast)
162 dlm_queue_ast(dlm, lock);
163 else
164 dlm_lockres_release_ast(dlm, res);
165
166 dlm_lockres_calc_usage(dlm, res);
167 if (kick_thread)
168 dlm_kick_thread(dlm, res);
169
170 return status;
171}
172
173void dlm_revert_pending_lock(struct dlm_lock_resource *res,
174 struct dlm_lock *lock)
175{
176 /* remove from local queue if it failed */
177 list_del_init(&lock->list);
178 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
179}
180
181
182/*
183 * locking:
184 * caller needs: none
185 * taken: takes and drops res->spinlock
186 * held on exit: none
187 * returns: DLM_DENIED, DLM_RECOVERING, or net status
188 */
189static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
190 struct dlm_lock_resource *res,
191 struct dlm_lock *lock, int flags)
192{
193 enum dlm_status status = DLM_DENIED;
194
195 mlog_entry("type=%d\n", lock->ml.type);
196 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
197 res->lockname.name, flags);
198
199 spin_lock(&res->spinlock);
200
201 /* will exit this call with spinlock held */
202 __dlm_wait_on_lockres(res);
203 res->state |= DLM_LOCK_RES_IN_PROGRESS;
204
205 /* add lock to local (secondary) queue */
206 dlm_lock_get(lock);
207 list_add_tail(&lock->list, &res->blocked);
208 lock->lock_pending = 1;
209 spin_unlock(&res->spinlock);
210
211 /* spec seems to say that you will get DLM_NORMAL when the lock
212 * has been queued, meaning we need to wait for a reply here. */
213 status = dlm_send_remote_lock_request(dlm, res, lock, flags);
214
215 spin_lock(&res->spinlock);
216 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
217 lock->lock_pending = 0;
218 if (status != DLM_NORMAL) {
219 if (status != DLM_NOTQUEUED)
220 dlm_error(status);
221 dlm_revert_pending_lock(res, lock);
222 dlm_lock_put(lock);
223 }
224 spin_unlock(&res->spinlock);
225
226 dlm_lockres_calc_usage(dlm, res);
227
228 wake_up(&res->wq);
229 return status;
230}
231
232
233/* for remote lock creation.
234 * locking:
235 * caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
236 * taken: none
237 * held on exit: none
238 * returns: DLM_NOLOCKMGR, or net status
239 */
240static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
241 struct dlm_lock_resource *res,
242 struct dlm_lock *lock, int flags)
243{
244 struct dlm_create_lock create;
245 int tmpret, status = 0;
246 enum dlm_status ret;
247
248 mlog_entry_void();
249
250 memset(&create, 0, sizeof(create));
251 create.node_idx = dlm->node_num;
252 create.requested_type = lock->ml.type;
253 create.cookie = lock->ml.cookie;
254 create.namelen = res->lockname.len;
255 create.flags = cpu_to_be32(flags);
256 memcpy(create.name, res->lockname.name, create.namelen);
257
258 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
259 sizeof(create), res->owner, &status);
260 if (tmpret >= 0) {
261 // successfully sent and received
262 ret = status; // this is already a dlm_status
263 } else {
264 mlog_errno(tmpret);
265 if (dlm_is_host_down(tmpret)) {
266 ret = DLM_RECOVERING;
267 mlog(0, "node %u died so returning DLM_RECOVERING "
268 "from lock message!\n", res->owner);
269 } else {
270 ret = dlm_err_to_dlm_status(tmpret);
271 }
272 }
273
274 return ret;
275}
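/* Note that o2net_send_message() hands the remote handler's own
 * return value back through its status out-parameter, so on a
 * successful send the value assigned above is whatever
 * dlm_create_lock_handler() returned on the owner node -- already a
 * dlm_status, as the inline comment says. */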
276
277void dlm_lock_get(struct dlm_lock *lock)
278{
279 kref_get(&lock->lock_refs);
280}
281
282void dlm_lock_put(struct dlm_lock *lock)
283{
284 kref_put(&lock->lock_refs, dlm_lock_release);
285}
286
287static void dlm_lock_release(struct kref *kref)
288{
289 struct dlm_lock *lock;
290
291 lock = container_of(kref, struct dlm_lock, lock_refs);
292
293 BUG_ON(!list_empty(&lock->list));
294 BUG_ON(!list_empty(&lock->ast_list));
295 BUG_ON(!list_empty(&lock->bast_list));
296 BUG_ON(lock->ast_pending);
297 BUG_ON(lock->bast_pending);
298
299 dlm_lock_detach_lockres(lock);
300
301 if (lock->lksb_kernel_allocated) {
302 mlog(0, "freeing kernel-allocated lksb\n");
303 kfree(lock->lksb);
304 }
305 kfree(lock);
306}
307
 308/* associate a lock with its lockres, getting a ref on the lockres */
309void dlm_lock_attach_lockres(struct dlm_lock *lock,
310 struct dlm_lock_resource *res)
311{
312 dlm_lockres_get(res);
313 lock->lockres = res;
314}
315
316/* drop ref on lockres, if there is still one associated with lock */
317static void dlm_lock_detach_lockres(struct dlm_lock *lock)
318{
319 struct dlm_lock_resource *res;
320
321 res = lock->lockres;
322 if (res) {
323 lock->lockres = NULL;
324 mlog(0, "removing lock's lockres reference\n");
325 dlm_lockres_put(res);
326 }
327}
328
329static void dlm_init_lock(struct dlm_lock *newlock, int type,
330 u8 node, u64 cookie)
331{
332 INIT_LIST_HEAD(&newlock->list);
333 INIT_LIST_HEAD(&newlock->ast_list);
334 INIT_LIST_HEAD(&newlock->bast_list);
335 spin_lock_init(&newlock->spinlock);
336 newlock->ml.type = type;
337 newlock->ml.convert_type = LKM_IVMODE;
338 newlock->ml.highest_blocked = LKM_IVMODE;
339 newlock->ml.node = node;
340 newlock->ml.pad1 = 0;
341 newlock->ml.list = 0;
342 newlock->ml.flags = 0;
343 newlock->ast = NULL;
344 newlock->bast = NULL;
345 newlock->astdata = NULL;
346 newlock->ml.cookie = cpu_to_be64(cookie);
347 newlock->ast_pending = 0;
348 newlock->bast_pending = 0;
349 newlock->convert_pending = 0;
350 newlock->lock_pending = 0;
351 newlock->unlock_pending = 0;
352 newlock->cancel_pending = 0;
353 newlock->lksb_kernel_allocated = 0;
354
355 kref_init(&newlock->lock_refs);
356}
357
358struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
359 struct dlm_lockstatus *lksb)
360{
361 struct dlm_lock *lock;
362 int kernel_allocated = 0;
363
364 lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
365 if (!lock)
366 return NULL;
367
368 if (!lksb) {
369 /* zero memory only if kernel-allocated */
370 lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
371 if (!lksb) {
372 kfree(lock);
373 return NULL;
374 }
375 kernel_allocated = 1;
376 }
377
378 dlm_init_lock(lock, type, node, cookie);
379 if (kernel_allocated)
380 lock->lksb_kernel_allocated = 1;
381 lock->lksb = lksb;
382 lksb->lockid = lock;
383 return lock;
384}
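/* Usage sketch: the local lock path in dlmlock() below calls
 *	dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb)
 * with the caller's own lksb, while dlm_create_lock_handler()
 * passes a NULL lksb so that one is kernel-allocated here (and
 * later freed in dlm_lock_release). */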
385
386/* handler for lock creation net message
387 * locking:
388 * caller needs: none
389 * taken: takes and drops res->spinlock
390 * held on exit: none
391 * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
392 */
393int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
394{
395 struct dlm_ctxt *dlm = data;
396 struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
397 struct dlm_lock_resource *res = NULL;
398 struct dlm_lock *newlock = NULL;
399 struct dlm_lockstatus *lksb = NULL;
400 enum dlm_status status = DLM_NORMAL;
401 char *name;
402 unsigned int namelen;
403
404 BUG_ON(!dlm);
405
406 mlog_entry_void();
407
408 if (!dlm_grab(dlm))
409 return DLM_REJECTED;
410
411 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
412 "Domain %s not fully joined!\n", dlm->name);
413
414 name = create->name;
415 namelen = create->namelen;
416
417 status = DLM_IVBUFLEN;
418 if (namelen > DLM_LOCKID_NAME_MAX) {
419 dlm_error(status);
420 goto leave;
421 }
422
423 status = DLM_SYSERR;
424 newlock = dlm_new_lock(create->requested_type,
425 create->node_idx,
426 be64_to_cpu(create->cookie), NULL);
427 if (!newlock) {
428 dlm_error(status);
429 goto leave;
430 }
431
432 lksb = newlock->lksb;
433
434 if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
435 lksb->flags |= DLM_LKSB_GET_LVB;
436 mlog(0, "set DLM_LKSB_GET_LVB flag\n");
437 }
438
439 status = DLM_IVLOCKID;
440 res = dlm_lookup_lockres(dlm, name, namelen);
441 if (!res) {
442 dlm_error(status);
443 goto leave;
444 }
445
446 spin_lock(&res->spinlock);
447 status = __dlm_lockres_state_to_status(res);
448 spin_unlock(&res->spinlock);
449
450 if (status != DLM_NORMAL) {
451 mlog(0, "lockres recovering/migrating/in-progress\n");
452 goto leave;
453 }
454
455 dlm_lock_attach_lockres(newlock, res);
456
457 status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
458leave:
459 if (status != DLM_NORMAL)
460 if (newlock)
461 dlm_lock_put(newlock);
462
463 if (res)
464 dlm_lockres_put(res);
465
466 dlm_put(dlm);
467
468 return status;
469}
470
471
472/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
473static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
474{
475 u64 tmpnode = node_num;
476
477 /* shift single byte of node num into top 8 bits */
478 tmpnode <<= 56;
479
480 spin_lock(&dlm_cookie_lock);
481 *cookie = (dlm_next_cookie | tmpnode);
482 if (++dlm_next_cookie & 0xff00000000000000ull) {
483 mlog(0, "This node's cookie will now wrap!\n");
484 dlm_next_cookie = 1;
485 }
486 spin_unlock(&dlm_cookie_lock);
487}
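/* Worked example (illustrative): on node 3 with dlm_next_cookie == 1,
 * tmpnode becomes 0x0300000000000000 and *cookie comes back as
 * 0x0300000000000001 -- the node number lives in the top byte, the
 * 56-bit counter in the rest. */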
488
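/* Hypothetical caller sketch (not part of this change): given a
 * joined domain, a dlm_lockstatus lksb, an ast/bast pair, and
 * private data priv, a caller might do
 *	status = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_VALBLK,
 *			 "my_resource", my_ast, priv, my_bast);
 * where a DLM_NORMAL return means the request was accepted and the
 * eventual grant is signalled through the ast callback. */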
489enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
490 struct dlm_lockstatus *lksb, int flags,
491 const char *name, dlm_astlockfunc_t *ast, void *data,
492 dlm_bastlockfunc_t *bast)
493{
494 enum dlm_status status;
495 struct dlm_lock_resource *res = NULL;
496 struct dlm_lock *lock = NULL;
497 int convert = 0, recovery = 0;
498
499 /* yes this function is a mess.
500 * TODO: clean this up. lots of common code in the
501 * lock and convert paths, especially in the retry blocks */
502 if (!lksb) {
503 dlm_error(DLM_BADARGS);
504 return DLM_BADARGS;
505 }
506
507 status = DLM_BADPARAM;
508 if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
509 dlm_error(status);
510 goto error;
511 }
512
513 if (flags & ~LKM_VALID_FLAGS) {
514 dlm_error(status);
515 goto error;
516 }
517
518 convert = (flags & LKM_CONVERT);
519 recovery = (flags & LKM_RECOVERY);
520
521 if (recovery &&
522 (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) {
523 dlm_error(status);
524 goto error;
525 }
526 if (convert && (flags & LKM_LOCAL)) {
527 mlog(ML_ERROR, "strange LOCAL convert request!\n");
528 goto error;
529 }
530
531 if (convert) {
532 /* CONVERT request */
533
534 /* if converting, must pass in a valid dlm_lock */
535 lock = lksb->lockid;
536 if (!lock) {
537 mlog(ML_ERROR, "NULL lock pointer in convert "
538 "request\n");
539 goto error;
540 }
541
542 res = lock->lockres;
543 if (!res) {
544 mlog(ML_ERROR, "NULL lockres pointer in convert "
545 "request\n");
546 goto error;
547 }
548 dlm_lockres_get(res);
549
550 /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
551 * static after the original lock call. convert requests will
552 * ensure that everything is the same, or return DLM_BADARGS.
553 * this means that DLM_DENIED_NOASTS will never be returned.
554 */
555 if (lock->lksb != lksb || lock->ast != ast ||
556 lock->bast != bast || lock->astdata != data) {
557 status = DLM_BADARGS;
558 mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, "
559 "astdata=%p\n", lksb, ast, bast, data);
560 mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
561 "astdata=%p\n", lock->lksb, lock->ast,
562 lock->bast, lock->astdata);
563 goto error;
564 }
565retry_convert:
566 dlm_wait_for_recovery(dlm);
567
568 if (res->owner == dlm->node_num)
569 status = dlmconvert_master(dlm, res, lock, flags, mode);
570 else
571 status = dlmconvert_remote(dlm, res, lock, flags, mode);
572 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
573 status == DLM_FORWARD) {
 574 /* for now, just sleep briefly and retry.
 575 * I suspect the reco or migration will
 576 * complete fast enough that no further
 577 * waiting will be necessary */
578 mlog(0, "retrying convert with migration/recovery/"
579 "in-progress\n");
580 msleep(100);
581 goto retry_convert;
582 }
583 } else {
584 u64 tmpcookie;
585
586 /* LOCK request */
587 status = DLM_BADARGS;
588 if (!name) {
589 dlm_error(status);
590 goto error;
591 }
592
593 status = DLM_IVBUFLEN;
594 if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) {
595 dlm_error(status);
596 goto error;
597 }
598
599 dlm_get_next_cookie(dlm->node_num, &tmpcookie);
600 lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
601 if (!lock) {
602 dlm_error(status);
603 goto error;
604 }
605
606 if (!recovery)
607 dlm_wait_for_recovery(dlm);
608
609 /* find or create the lock resource */
610 res = dlm_get_lock_resource(dlm, name, flags);
611 if (!res) {
612 status = DLM_IVLOCKID;
613 dlm_error(status);
614 goto error;
615 }
616
617 mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
618 mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
619
620 dlm_lock_attach_lockres(lock, res);
621 lock->ast = ast;
622 lock->bast = bast;
623 lock->astdata = data;
624
625retry_lock:
626 if (flags & LKM_VALBLK) {
627 mlog(0, "LKM_VALBLK passed by caller\n");
628
629 /* LVB requests for non PR, PW or EX locks are
630 * ignored. */
631 if (mode < LKM_PRMODE)
632 flags &= ~LKM_VALBLK;
633 else {
634 flags |= LKM_GET_LVB;
635 lock->lksb->flags |= DLM_LKSB_GET_LVB;
636 }
637 }
638
639 if (res->owner == dlm->node_num)
640 status = dlmlock_master(dlm, res, lock, flags);
641 else
642 status = dlmlock_remote(dlm, res, lock, flags);
643
644 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
645 status == DLM_FORWARD) {
646 mlog(0, "retrying lock with migration/"
647 "recovery/in progress\n");
648 msleep(100);
649 dlm_wait_for_recovery(dlm);
650 goto retry_lock;
651 }
652
653 if (status != DLM_NORMAL) {
654 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
655 if (status != DLM_NOTQUEUED)
656 dlm_error(status);
657 goto error;
658 }
659 }
660
661error:
662 if (status != DLM_NORMAL) {
663 if (lock && !convert)
664 dlm_lock_put(lock);
665 // this is kind of unnecessary
666 lksb->status = status;
667 }
668
669 /* put lockres ref from the convert path
670 * or from dlm_get_lock_resource */
671 if (res)
672 dlm_lockres_put(res);
673
674 return status;
675}
676EXPORT_SYMBOL_GPL(dlmlock);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
new file mode 100644
index 000000000000..27e984f7e4cd
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -0,0 +1,2664 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
 4 * dlmmaster.c
5 *
6 * standalone DLM module
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdebug.h"
51#include "dlmdomain.h"
52
53#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
54#include "cluster/masklog.h"
55
56enum dlm_mle_type {
57 DLM_MLE_BLOCK,
58 DLM_MLE_MASTER,
59 DLM_MLE_MIGRATION
60};
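/* BLOCK: some other node is mastering this lock name and we must
 * wait; MASTER: this node owns (or is winning mastery of) the
 * attached lockres; MIGRATION: ownership is moving from master to
 * new_master. */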
61
62struct dlm_lock_name
63{
64 u8 len;
65 u8 name[DLM_LOCKID_NAME_MAX];
66};
67
68struct dlm_master_list_entry
69{
70 struct list_head list;
71 struct list_head hb_events;
72 struct dlm_ctxt *dlm;
73 spinlock_t spinlock;
74 wait_queue_head_t wq;
75 atomic_t woken;
76 struct kref mle_refs;
77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
80 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
81 u8 master;
82 u8 new_master;
83 enum dlm_mle_type type;
84 struct o2hb_callback_func mle_hb_up;
85 struct o2hb_callback_func mle_hb_down;
86 union {
87 struct dlm_lock_resource *res;
88 struct dlm_lock_name name;
89 } u;
90};
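/* The union mirrors the entry's lifetime: MASTER entries point at a
 * real lockres through u.res, while BLOCK and MIGRATION entries may
 * predate any local lockres and carry only the name in u.name (see
 * dlm_init_mle and dlm_mle_equal below). */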
91
92static void dlm_mle_node_down(struct dlm_ctxt *dlm,
93 struct dlm_master_list_entry *mle,
94 struct o2nm_node *node,
95 int idx);
96static void dlm_mle_node_up(struct dlm_ctxt *dlm,
97 struct dlm_master_list_entry *mle,
98 struct o2nm_node *node,
99 int idx);
100
101static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
102static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
103 unsigned int namelen, void *nodemap,
104 u32 flags);
105
106static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
107 struct dlm_master_list_entry *mle,
108 const char *name,
109 unsigned int namelen)
110{
111 struct dlm_lock_resource *res;
112
113 if (dlm != mle->dlm)
114 return 0;
115
116 if (mle->type == DLM_MLE_BLOCK ||
117 mle->type == DLM_MLE_MIGRATION) {
118 if (namelen != mle->u.name.len ||
119 memcmp(name, mle->u.name.name, namelen)!=0)
120 return 0;
121 } else {
122 res = mle->u.res;
123 if (namelen != res->lockname.len ||
124 memcmp(res->lockname.name, name, namelen) != 0)
125 return 0;
126 }
127 return 1;
128}
129
130#if 0
 131/* This code is normally compiled out (#if 0) but kept because it aids debugging */
132
133void dlm_print_one_mle(struct dlm_master_list_entry *mle)
134{
135 int i = 0, refs;
136 char *type;
137 char attached;
138 u8 master;
139 unsigned int namelen;
140 const char *name;
141 struct kref *k;
142
143 k = &mle->mle_refs;
144 if (mle->type == DLM_MLE_BLOCK)
145 type = "BLK";
146 else if (mle->type == DLM_MLE_MASTER)
147 type = "MAS";
148 else
149 type = "MIG";
150 refs = atomic_read(&k->refcount);
151 master = mle->master;
152 attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
153
154 if (mle->type != DLM_MLE_MASTER) {
155 namelen = mle->u.name.len;
156 name = mle->u.name.name;
157 } else {
158 namelen = mle->u.res->lockname.len;
159 name = mle->u.res->lockname.name;
160 }
161
162 mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n",
163 i, type, refs, master, mle->new_master, attached,
164 namelen, namelen, name);
165}
166
167static void dlm_dump_mles(struct dlm_ctxt *dlm)
168{
169 struct dlm_master_list_entry *mle;
170 struct list_head *iter;
171
172 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
173 mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
174 spin_lock(&dlm->master_lock);
175 list_for_each(iter, &dlm->master_list) {
176 mle = list_entry(iter, struct dlm_master_list_entry, list);
177 dlm_print_one_mle(mle);
178 }
179 spin_unlock(&dlm->master_lock);
180}
181
182int dlm_dump_all_mles(const char __user *data, unsigned int len)
183{
184 struct list_head *iter;
185 struct dlm_ctxt *dlm;
186
187 spin_lock(&dlm_domain_lock);
188 list_for_each(iter, &dlm_domains) {
189 dlm = list_entry (iter, struct dlm_ctxt, list);
190 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
191 dlm_dump_mles(dlm);
192 }
193 spin_unlock(&dlm_domain_lock);
194 return len;
195}
196EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
197
198#endif /* 0 */
199
200
201static kmem_cache_t *dlm_mle_cache = NULL;
202
203
204static void dlm_mle_release(struct kref *kref);
205static void dlm_init_mle(struct dlm_master_list_entry *mle,
206 enum dlm_mle_type type,
207 struct dlm_ctxt *dlm,
208 struct dlm_lock_resource *res,
209 const char *name,
210 unsigned int namelen);
211static void dlm_put_mle(struct dlm_master_list_entry *mle);
212static void __dlm_put_mle(struct dlm_master_list_entry *mle);
213static int dlm_find_mle(struct dlm_ctxt *dlm,
214 struct dlm_master_list_entry **mle,
215 char *name, unsigned int namelen);
216
217static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);
218
219
220static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
221 struct dlm_lock_resource *res,
222 struct dlm_master_list_entry *mle,
223 int *blocked);
224static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
225 struct dlm_lock_resource *res,
226 struct dlm_master_list_entry *mle,
227 int blocked);
228static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
229 struct dlm_lock_resource *res,
230 struct dlm_master_list_entry *mle,
231 struct dlm_master_list_entry **oldmle,
232 const char *name, unsigned int namelen,
233 u8 new_master, u8 master);
234
235static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
236 struct dlm_lock_resource *res);
237static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
238 struct dlm_lock_resource *res);
239static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
240 struct dlm_lock_resource *res,
241 u8 target);
242
243
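/* Classify a socket errno from o2net: returns 1 if it indicates the
 * remote node is dead or unreachable (callers then fail over to
 * recovery), 0 for any other error. */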
244int dlm_is_host_down(int errno)
245{
246 switch (errno) {
247 case -EBADF:
248 case -ECONNREFUSED:
249 case -ENOTCONN:
250 case -ECONNRESET:
251 case -EPIPE:
252 case -EHOSTDOWN:
253 case -EHOSTUNREACH:
254 case -ETIMEDOUT:
255 case -ECONNABORTED:
256 case -ENETDOWN:
257 case -ENETUNREACH:
258 case -ENETRESET:
259 case -ESHUTDOWN:
260 case -ENOPROTOOPT:
261 case -EINVAL: /* if returned from our tcp code,
262 this means there is no socket */
263 return 1;
264 }
265 return 0;
266}
267
268
269/*
270 * MASTER LIST FUNCTIONS
271 */
272
273
274/*
275 * regarding master list entries and heartbeat callbacks:
276 *
277 * in order to avoid sleeping and allocation that occurs in
278 * heartbeat, master list entries are simply attached to the
279 * dlm's established heartbeat callbacks. the mle is attached
280 * when it is created, and since the dlm->spinlock is held at
281 * that time, any heartbeat event will be properly discovered
282 * by the mle. the mle needs to be detached from the
283 * dlm->mle_hb_events list as soon as heartbeat events are no
284 * longer useful to the mle, and before the mle is freed.
285 *
286 * as a general rule, heartbeat events are no longer needed by
287 * the mle once an "answer" regarding the lock master has been
288 * received.
289 */
290static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
291 struct dlm_master_list_entry *mle)
292{
293 assert_spin_locked(&dlm->spinlock);
294
295 list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
296}
297
298
299static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
300 struct dlm_master_list_entry *mle)
301{
302 if (!list_empty(&mle->hb_events))
303 list_del_init(&mle->hb_events);
304}
305
306
307static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
308 struct dlm_master_list_entry *mle)
309{
310 spin_lock(&dlm->spinlock);
311 __dlm_mle_detach_hb_events(dlm, mle);
312 spin_unlock(&dlm->spinlock);
313}
314
315/* remove from list and free */
316static void __dlm_put_mle(struct dlm_master_list_entry *mle)
317{
318 struct dlm_ctxt *dlm;
319 dlm = mle->dlm;
320
321 assert_spin_locked(&dlm->spinlock);
322 assert_spin_locked(&dlm->master_lock);
323 BUG_ON(!atomic_read(&mle->mle_refs.refcount));
324
325 kref_put(&mle->mle_refs, dlm_mle_release);
326}
327
328
329/* must not have any spinlocks coming in */
330static void dlm_put_mle(struct dlm_master_list_entry *mle)
331{
332 struct dlm_ctxt *dlm;
333 dlm = mle->dlm;
334
335 spin_lock(&dlm->spinlock);
336 spin_lock(&dlm->master_lock);
337 __dlm_put_mle(mle);
338 spin_unlock(&dlm->master_lock);
339 spin_unlock(&dlm->spinlock);
340}
341
342static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
343{
344 kref_get(&mle->mle_refs);
345}
346
347static void dlm_init_mle(struct dlm_master_list_entry *mle,
348 enum dlm_mle_type type,
349 struct dlm_ctxt *dlm,
350 struct dlm_lock_resource *res,
351 const char *name,
352 unsigned int namelen)
353{
354 assert_spin_locked(&dlm->spinlock);
355
356 mle->dlm = dlm;
357 mle->type = type;
358 INIT_LIST_HEAD(&mle->list);
359 INIT_LIST_HEAD(&mle->hb_events);
360 memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
361 spin_lock_init(&mle->spinlock);
362 init_waitqueue_head(&mle->wq);
363 atomic_set(&mle->woken, 0);
364 kref_init(&mle->mle_refs);
365 memset(mle->response_map, 0, sizeof(mle->response_map));
366 mle->master = O2NM_MAX_NODES;
367 mle->new_master = O2NM_MAX_NODES;
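	/* O2NM_MAX_NODES doubles as "no master known yet"; mastery
	 * waiters watch for this field changing to a real node number
	 * (see dlm_wait_for_lock_mastery). */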
368
369 if (mle->type == DLM_MLE_MASTER) {
370 BUG_ON(!res);
371 mle->u.res = res;
372 } else if (mle->type == DLM_MLE_BLOCK) {
373 BUG_ON(!name);
374 memcpy(mle->u.name.name, name, namelen);
375 mle->u.name.len = namelen;
376 } else /* DLM_MLE_MIGRATION */ {
377 BUG_ON(!name);
378 memcpy(mle->u.name.name, name, namelen);
379 mle->u.name.len = namelen;
380 }
381
382 /* copy off the node_map and register hb callbacks on our copy */
383 memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
384 memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
385 clear_bit(dlm->node_num, mle->vote_map);
386 clear_bit(dlm->node_num, mle->node_map);
387
388 /* attach the mle to the domain node up/down events */
389 __dlm_mle_attach_hb_events(dlm, mle);
390}
391
392
393/* returns 1 if found, 0 if not */
394static int dlm_find_mle(struct dlm_ctxt *dlm,
395 struct dlm_master_list_entry **mle,
396 char *name, unsigned int namelen)
397{
398 struct dlm_master_list_entry *tmpmle;
399 struct list_head *iter;
400
401 assert_spin_locked(&dlm->master_lock);
402
403 list_for_each(iter, &dlm->master_list) {
404 tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
405 if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
406 continue;
407 dlm_get_mle(tmpmle);
408 *mle = tmpmle;
409 return 1;
410 }
411 return 0;
412}
413
414void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
415{
416 struct dlm_master_list_entry *mle;
417 struct list_head *iter;
418
419 assert_spin_locked(&dlm->spinlock);
420
421 list_for_each(iter, &dlm->mle_hb_events) {
422 mle = list_entry(iter, struct dlm_master_list_entry,
423 hb_events);
424 if (node_up)
425 dlm_mle_node_up(dlm, mle, NULL, idx);
426 else
427 dlm_mle_node_down(dlm, mle, NULL, idx);
428 }
429}
430
431static void dlm_mle_node_down(struct dlm_ctxt *dlm,
432 struct dlm_master_list_entry *mle,
433 struct o2nm_node *node, int idx)
434{
435 spin_lock(&mle->spinlock);
436
437 if (!test_bit(idx, mle->node_map))
438 mlog(0, "node %u already removed from nodemap!\n", idx);
439 else
440 clear_bit(idx, mle->node_map);
441
442 spin_unlock(&mle->spinlock);
443}
444
445static void dlm_mle_node_up(struct dlm_ctxt *dlm,
446 struct dlm_master_list_entry *mle,
447 struct o2nm_node *node, int idx)
448{
449 spin_lock(&mle->spinlock);
450
451 if (test_bit(idx, mle->node_map))
452 mlog(0, "node %u already in node map!\n", idx);
453 else
454 set_bit(idx, mle->node_map);
455
456 spin_unlock(&mle->spinlock);
457}
458
459
460int dlm_init_mle_cache(void)
461{
462 dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
463 sizeof(struct dlm_master_list_entry),
464 0, SLAB_HWCACHE_ALIGN,
465 NULL, NULL);
466 if (dlm_mle_cache == NULL)
467 return -ENOMEM;
468 return 0;
469}
470
471void dlm_destroy_mle_cache(void)
472{
473 if (dlm_mle_cache)
474 kmem_cache_destroy(dlm_mle_cache);
475}
476
477static void dlm_mle_release(struct kref *kref)
478{
479 struct dlm_master_list_entry *mle;
480 struct dlm_ctxt *dlm;
481
482 mlog_entry_void();
483
484 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
485 dlm = mle->dlm;
486
487 if (mle->type != DLM_MLE_MASTER) {
488 mlog(0, "calling mle_release for %.*s, type %d\n",
489 mle->u.name.len, mle->u.name.name, mle->type);
490 } else {
491 mlog(0, "calling mle_release for %.*s, type %d\n",
492 mle->u.res->lockname.len,
493 mle->u.res->lockname.name, mle->type);
494 }
495 assert_spin_locked(&dlm->spinlock);
496 assert_spin_locked(&dlm->master_lock);
497
498 /* remove from list if not already */
499 if (!list_empty(&mle->list))
500 list_del_init(&mle->list);
501
502 /* detach the mle from the domain node up/down events */
503 __dlm_mle_detach_hb_events(dlm, mle);
504
 505 /* NOTE: kmem_cache_free under spinlock here.
506 * if this is bad, we can move this to a freelist. */
507 kmem_cache_free(dlm_mle_cache, mle);
508}
509
510
511/*
512 * LOCK RESOURCE FUNCTIONS
513 */
514
515static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
516 struct dlm_lock_resource *res,
517 u8 owner)
518{
519 assert_spin_locked(&res->spinlock);
520
521 mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);
522
523 if (owner == dlm->node_num)
524 atomic_inc(&dlm->local_resources);
525 else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
526 atomic_inc(&dlm->unknown_resources);
527 else
528 atomic_inc(&dlm->remote_resources);
529
530 res->owner = owner;
531}
532
533void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
534 struct dlm_lock_resource *res, u8 owner)
535{
536 assert_spin_locked(&res->spinlock);
537
538 if (owner == res->owner)
539 return;
540
541 if (res->owner == dlm->node_num)
542 atomic_dec(&dlm->local_resources);
543 else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
544 atomic_dec(&dlm->unknown_resources);
545 else
546 atomic_dec(&dlm->remote_resources);
547
548 dlm_set_lockres_owner(dlm, res, owner);
549}
550
551
552static void dlm_lockres_release(struct kref *kref)
553{
554 struct dlm_lock_resource *res;
555
556 res = container_of(kref, struct dlm_lock_resource, refs);
557
 558 /* This should not happen -- every lockres has a name
 559 * associated with it at init time. */
560 BUG_ON(!res->lockname.name);
561
562 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
563 res->lockname.name);
564
565 /* By the time we're ready to blow this guy away, we shouldn't
566 * be on any lists. */
567 BUG_ON(!list_empty(&res->list));
568 BUG_ON(!list_empty(&res->granted));
569 BUG_ON(!list_empty(&res->converting));
570 BUG_ON(!list_empty(&res->blocked));
571 BUG_ON(!list_empty(&res->dirty));
572 BUG_ON(!list_empty(&res->recovering));
573 BUG_ON(!list_empty(&res->purge));
574
575 kfree(res->lockname.name);
576
577 kfree(res);
578}
579
580void dlm_lockres_get(struct dlm_lock_resource *res)
581{
582 kref_get(&res->refs);
583}
584
585void dlm_lockres_put(struct dlm_lock_resource *res)
586{
587 kref_put(&res->refs, dlm_lockres_release);
588}
589
590static void dlm_init_lockres(struct dlm_ctxt *dlm,
591 struct dlm_lock_resource *res,
592 const char *name, unsigned int namelen)
593{
594 char *qname;
595
596 /* If we memset here, we lose our reference to the kmalloc'd
597 * res->lockname.name, so be sure to init every field
598 * correctly! */
599
600 qname = (char *) res->lockname.name;
601 memcpy(qname, name, namelen);
602
603 res->lockname.len = namelen;
604 res->lockname.hash = full_name_hash(name, namelen);
605
606 init_waitqueue_head(&res->wq);
607 spin_lock_init(&res->spinlock);
608 INIT_LIST_HEAD(&res->list);
609 INIT_LIST_HEAD(&res->granted);
610 INIT_LIST_HEAD(&res->converting);
611 INIT_LIST_HEAD(&res->blocked);
612 INIT_LIST_HEAD(&res->dirty);
613 INIT_LIST_HEAD(&res->recovering);
614 INIT_LIST_HEAD(&res->purge);
615 atomic_set(&res->asts_reserved, 0);
616 res->migration_pending = 0;
617
618 kref_init(&res->refs);
619
620 /* just for consistency */
621 spin_lock(&res->spinlock);
622 dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
623 spin_unlock(&res->spinlock);
624
625 res->state = DLM_LOCK_RES_IN_PROGRESS;
626
627 res->last_used = 0;
628
629 memset(res->lvb, 0, DLM_LVB_LEN);
630}
631
632struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
633 const char *name,
634 unsigned int namelen)
635{
636 struct dlm_lock_resource *res;
637
638 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
639 if (!res)
640 return NULL;
641
642 res->lockname.name = kmalloc(namelen, GFP_KERNEL);
643 if (!res->lockname.name) {
644 kfree(res);
645 return NULL;
646 }
647
648 dlm_init_lockres(dlm, res, name, namelen);
649 return res;
650}
651
652/*
 653 * lookup a lock resource by name. lockid is null
 654 * terminated, and the resource may already exist
 655 * in the hashtable.
656 *
657 * if not, allocate enough for the lockres and for
658 * the temporary structure used in doing the mastering.
659 *
660 * also, do a lookup in the dlm->master_list to see
661 * if another node has begun mastering the same lock.
662 * if so, there should be a block entry in there
663 * for this name, and we should *not* attempt to master
664 * the lock here. need to wait around for that node
665 * to assert_master (or die).
666 *
667 */
668struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
669 const char *lockid,
670 int flags)
671{
672 struct dlm_lock_resource *tmpres=NULL, *res=NULL;
673 struct dlm_master_list_entry *mle = NULL;
674 struct dlm_master_list_entry *alloc_mle = NULL;
675 int blocked = 0;
676 int ret, nodenum;
677 struct dlm_node_iter iter;
678 unsigned int namelen;
679 int tries = 0;
680
681 BUG_ON(!lockid);
682
683 namelen = strlen(lockid);
684
685 mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
686
687lookup:
688 spin_lock(&dlm->spinlock);
689 tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
690 if (tmpres) {
691 spin_unlock(&dlm->spinlock);
692 mlog(0, "found in hash!\n");
693 if (res)
694 dlm_lockres_put(res);
695 res = tmpres;
696 goto leave;
697 }
698
699 if (!res) {
700 spin_unlock(&dlm->spinlock);
701 mlog(0, "allocating a new resource\n");
702 /* nothing found and we need to allocate one. */
703 alloc_mle = (struct dlm_master_list_entry *)
704 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
705 if (!alloc_mle)
706 goto leave;
707 res = dlm_new_lockres(dlm, lockid, namelen);
708 if (!res)
709 goto leave;
710 goto lookup;
711 }
712
713 mlog(0, "no lockres found, allocated our own: %p\n", res);
714
715 if (flags & LKM_LOCAL) {
716 /* caller knows it's safe to assume it's not mastered elsewhere
717 * DONE! return right away */
718 spin_lock(&res->spinlock);
719 dlm_change_lockres_owner(dlm, res, dlm->node_num);
720 __dlm_insert_lockres(dlm, res);
721 spin_unlock(&res->spinlock);
722 spin_unlock(&dlm->spinlock);
723 /* lockres still marked IN_PROGRESS */
724 goto wake_waiters;
725 }
726
727 /* check master list to see if another node has started mastering it */
728 spin_lock(&dlm->master_lock);
729
730 /* if we found a block, wait for lock to be mastered by another node */
731 blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
732 if (blocked) {
733 if (mle->type == DLM_MLE_MASTER) {
734 mlog(ML_ERROR, "master entry for nonexistent lock!\n");
735 BUG();
736 } else if (mle->type == DLM_MLE_MIGRATION) {
737 /* migration is in progress! */
738 /* the good news is that we now know the
739 * "current" master (mle->master). */
740
741 spin_unlock(&dlm->master_lock);
742 assert_spin_locked(&dlm->spinlock);
743
744 /* set the lockres owner and hash it */
745 spin_lock(&res->spinlock);
746 dlm_set_lockres_owner(dlm, res, mle->master);
747 __dlm_insert_lockres(dlm, res);
748 spin_unlock(&res->spinlock);
749 spin_unlock(&dlm->spinlock);
750
751 /* master is known, detach */
752 dlm_mle_detach_hb_events(dlm, mle);
753 dlm_put_mle(mle);
754 mle = NULL;
755 goto wake_waiters;
756 }
757 } else {
758 /* go ahead and try to master lock on this node */
759 mle = alloc_mle;
760 /* make sure this does not get freed below */
761 alloc_mle = NULL;
762 dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
763 set_bit(dlm->node_num, mle->maybe_map);
764 list_add(&mle->list, &dlm->master_list);
765 }
766
767 /* at this point there is either a DLM_MLE_BLOCK or a
768 * DLM_MLE_MASTER on the master list, so it's safe to add the
769 * lockres to the hashtable. anyone who finds the lock will
770 * still have to wait on the IN_PROGRESS. */
771
772 /* finally add the lockres to its hash bucket */
773 __dlm_insert_lockres(dlm, res);
774 /* get an extra ref on the mle in case this is a BLOCK
775 * if so, the creator of the BLOCK may try to put the last
776 * ref at this time in the assert master handler, so we
777 * need an extra one to keep from a bad ptr deref. */
778 dlm_get_mle(mle);
779 spin_unlock(&dlm->master_lock);
780 spin_unlock(&dlm->spinlock);
781
782 /* must wait for lock to be mastered elsewhere */
783 if (blocked)
784 goto wait;
785
786redo_request:
787 ret = -EINVAL;
788 dlm_node_iter_init(mle->vote_map, &iter);
789 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
790 ret = dlm_do_master_request(mle, nodenum);
791 if (ret < 0)
792 mlog_errno(ret);
793 if (mle->master != O2NM_MAX_NODES) {
794 /* found a master ! */
795 break;
796 }
797 }
798
799wait:
800 /* keep going until the response map includes all nodes */
801 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
802 if (ret < 0) {
803 mlog(0, "%s:%.*s: node map changed, redo the "
804 "master request now, blocked=%d\n",
805 dlm->name, res->lockname.len,
806 res->lockname.name, blocked);
807 if (++tries > 20) {
808 mlog(ML_ERROR, "%s:%.*s: spinning on "
809 "dlm_wait_for_lock_mastery, blocked=%d\n",
810 dlm->name, res->lockname.len,
811 res->lockname.name, blocked);
812 dlm_print_one_lock_resource(res);
813 /* dlm_print_one_mle(mle); */
814 tries = 0;
815 }
816 goto redo_request;
817 }
818
819 mlog(0, "lockres mastered by %u\n", res->owner);
820 /* make sure we never continue without this */
821 BUG_ON(res->owner == O2NM_MAX_NODES);
822
823 /* master is known, detach if not already detached */
824 dlm_mle_detach_hb_events(dlm, mle);
825 dlm_put_mle(mle);
826 /* put the extra ref */
827 dlm_put_mle(mle);
828
829wake_waiters:
830 spin_lock(&res->spinlock);
831 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
832 spin_unlock(&res->spinlock);
833 wake_up(&res->wq);
834
835leave:
836 /* need to free the unused mle */
837 if (alloc_mle)
838 kmem_cache_free(dlm_mle_cache, alloc_mle);
839
840 return res;
841}
842
843
844#define DLM_MASTERY_TIMEOUT_MS 5000
845
846static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
847 struct dlm_lock_resource *res,
848 struct dlm_master_list_entry *mle,
849 int *blocked)
850{
851 u8 m;
852 int ret, bit;
853 int map_changed, voting_done;
854 int assert, sleep;
855
856recheck:
857 ret = 0;
858 assert = 0;
859
860 /* check if another node has already become the owner */
861 spin_lock(&res->spinlock);
862 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
863 spin_unlock(&res->spinlock);
864 goto leave;
865 }
866 spin_unlock(&res->spinlock);
867
868 spin_lock(&mle->spinlock);
869 m = mle->master;
870 map_changed = (memcmp(mle->vote_map, mle->node_map,
871 sizeof(mle->vote_map)) != 0);
872 voting_done = (memcmp(mle->vote_map, mle->response_map,
873 sizeof(mle->vote_map)) == 0);
874
875 /* restart if we hit any errors */
876 if (map_changed) {
877 int b;
878 mlog(0, "%s: %.*s: node map changed, restarting\n",
879 dlm->name, res->lockname.len, res->lockname.name);
880 ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
881 b = (mle->type == DLM_MLE_BLOCK);
882 if ((*blocked && !b) || (!*blocked && b)) {
883 mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
884 dlm->name, res->lockname.len, res->lockname.name,
885 *blocked, b);
886 *blocked = b;
887 }
888 spin_unlock(&mle->spinlock);
889 if (ret < 0) {
890 mlog_errno(ret);
891 goto leave;
892 }
893 mlog(0, "%s:%.*s: restart lock mastery succeeded, "
894 "rechecking now\n", dlm->name, res->lockname.len,
895 res->lockname.name);
896 goto recheck;
897 }
898
899 if (m != O2NM_MAX_NODES) {
900 /* another node has done an assert!
901 * all done! */
902 sleep = 0;
903 } else {
904 sleep = 1;
905 /* have all nodes responded? */
906 if (voting_done && !*blocked) {
907 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
908 if (dlm->node_num <= bit) {
909 /* my node number is lowest.
910 * now tell other nodes that I am
911 * mastering this. */
912 mle->master = dlm->node_num;
913 assert = 1;
914 sleep = 0;
915 }
916 /* if voting is done, but we have not received
917 * an assert master yet, we must sleep */
918 }
919 }
920
921 spin_unlock(&mle->spinlock);
922
923 /* sleep if we haven't finished voting yet */
924 if (sleep) {
925 unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
926
927 /*
928 if (atomic_read(&mle->mle_refs.refcount) < 2)
929 mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
930 atomic_read(&mle->mle_refs.refcount),
931 res->lockname.len, res->lockname.name);
932 */
933 atomic_set(&mle->woken, 0);
934 (void)wait_event_timeout(mle->wq,
935 (atomic_read(&mle->woken) == 1),
936 timeo);
937 if (res->owner == O2NM_MAX_NODES) {
938 mlog(0, "waiting again\n");
939 goto recheck;
940 }
941 mlog(0, "done waiting, master is %u\n", res->owner);
942 ret = 0;
943 goto leave;
944 }
945
946 ret = 0; /* done */
947 if (assert) {
948 m = dlm->node_num;
949 mlog(0, "about to master %.*s here, this=%u\n",
950 res->lockname.len, res->lockname.name, m);
951 ret = dlm_do_assert_master(dlm, res->lockname.name,
952 res->lockname.len, mle->vote_map, 0);
953 if (ret) {
954 /* This is a failure in the network path,
955 * not in the response to the assert_master
956 * (any nonzero response is a BUG on this node).
957 * Most likely a socket just got disconnected
958 * due to node death. */
959 mlog_errno(ret);
960 }
961 /* no longer need to restart lock mastery.
962 * all living nodes have been contacted. */
963 ret = 0;
964 }
965
966 /* set the lockres owner */
967 spin_lock(&res->spinlock);
968 dlm_change_lockres_owner(dlm, res, m);
969 spin_unlock(&res->spinlock);
970
971leave:
972 return ret;
973}
974
975struct dlm_bitmap_diff_iter
976{
977 int curnode;
978 unsigned long *orig_bm;
979 unsigned long *cur_bm;
980 unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
981};
982
983enum dlm_node_state_change
984{
985 NODE_DOWN = -1,
986 NODE_NO_CHANGE = 0,
987 NODE_UP
988};
989
990static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
991 unsigned long *orig_bm,
992 unsigned long *cur_bm)
993{
994 unsigned long p1, p2;
995 int i;
996
997 iter->curnode = -1;
998 iter->orig_bm = orig_bm;
999 iter->cur_bm = cur_bm;
1000
1001 for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
1002 p1 = *(iter->orig_bm + i);
1003 p2 = *(iter->cur_bm + i);
1004 iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
1005 }
1006}
1007
1008static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
1009 enum dlm_node_state_change *state)
1010{
1011 int bit;
1012
1013 if (iter->curnode >= O2NM_MAX_NODES)
1014 return -ENOENT;
1015
1016 bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
1017 iter->curnode+1);
1018 if (bit >= O2NM_MAX_NODES) {
1019 iter->curnode = O2NM_MAX_NODES;
1020 return -ENOENT;
1021 }
1022
1023 /* if it was there in the original then this node died */
1024 if (test_bit(bit, iter->orig_bm))
1025 *state = NODE_DOWN;
1026 else
1027 *state = NODE_UP;
1028
1029 iter->curnode = bit;
1030 return bit;
1031}
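/* Illustrative run: with orig_bm = {1,3} and cur_bm = {3,5}, diff_bm
 * works out to {1,5}; iterating yields node 1 with NODE_DOWN (set
 * only in the original map) and then node 5 with NODE_UP (set only
 * in the current map). */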
1032
1033
1034static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1035 struct dlm_lock_resource *res,
1036 struct dlm_master_list_entry *mle,
1037 int blocked)
1038{
1039 struct dlm_bitmap_diff_iter bdi;
1040 enum dlm_node_state_change sc;
1041 int node;
1042 int ret = 0;
1043
1044 mlog(0, "something happened such that the "
1045 "master process may need to be restarted!\n");
1046
1047 assert_spin_locked(&mle->spinlock);
1048
1049 dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
1050 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1051 while (node >= 0) {
1052 if (sc == NODE_UP) {
1053 /* a node came up. easy. might not even need
1054 * to talk to it if its node number is higher
1055 * or if we are already blocked. */
1056 mlog(0, "node up! %d\n", node);
1057 if (blocked)
1058 goto next;
1059
1060 if (node > dlm->node_num) {
1061 mlog(0, "node > this node. skipping.\n");
1062 goto next;
1063 }
1064
1065 /* redo the master request, but only for the new node */
1066 mlog(0, "sending request to new node\n");
1067 clear_bit(node, mle->response_map);
1068 set_bit(node, mle->vote_map);
1069 } else {
1070 mlog(ML_ERROR, "node down! %d\n", node);
1071
1072 /* if the node wasn't involved in mastery skip it,
1073 * but clear it out from the maps so that it will
1074 * not affect mastery of this lockres */
1075 clear_bit(node, mle->response_map);
1076 clear_bit(node, mle->vote_map);
1077 if (!test_bit(node, mle->maybe_map))
1078 goto next;
1079
1080 /* if we're already blocked on lock mastery, and the
1081 * dead node wasn't the expected master, or there is
1082 * another node in the maybe_map, keep waiting */
1083 if (blocked) {
1084 int lowest = find_next_bit(mle->maybe_map,
1085 O2NM_MAX_NODES, 0);
1086
1087 /* act like it was never there */
1088 clear_bit(node, mle->maybe_map);
1089
1090 if (node != lowest)
1091 goto next;
1092
1093 mlog(ML_ERROR, "expected master %u died while "
1094 "this node was blocked waiting on it!\n",
1095 node);
1096 lowest = find_next_bit(mle->maybe_map,
1097 O2NM_MAX_NODES,
1098 lowest+1);
1099 if (lowest < O2NM_MAX_NODES) {
1100 mlog(0, "still blocked. waiting "
1101 "on %u now\n", lowest);
1102 goto next;
1103 }
1104
1105 /* mle is an MLE_BLOCK, but there is now
1106 * nothing left to block on. we need to return
1107 * all the way back out and try again with
1108 * an MLE_MASTER. dlm_do_local_recovery_cleanup
1109 * has already run, so the mle refcount is ok */
1110 mlog(0, "no longer blocking. we can "
1111 "try to master this here\n");
1112 mle->type = DLM_MLE_MASTER;
1113 memset(mle->maybe_map, 0,
1114 sizeof(mle->maybe_map));
1115 memset(mle->response_map, 0,
 1116 sizeof(mle->response_map));
1117 memcpy(mle->vote_map, mle->node_map,
1118 sizeof(mle->node_map));
1119 mle->u.res = res;
1120 set_bit(dlm->node_num, mle->maybe_map);
1121
1122 ret = -EAGAIN;
1123 goto next;
1124 }
1125
1126 clear_bit(node, mle->maybe_map);
1127 if (node > dlm->node_num)
1128 goto next;
1129
1130 mlog(0, "dead node in map!\n");
1131 /* yuck. go back and re-contact all nodes
1132 * in the vote_map, removing this node. */
1133 memset(mle->response_map, 0,
1134 sizeof(mle->response_map));
1135 }
1136 ret = -EAGAIN;
1137next:
1138 node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1139 }
1140 return ret;
1141}
1142
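/* A return of -EAGAIN here means the set of voting nodes changed and
 * the whole master-request round must be redone; dlm_get_lock_resource
 * reacts to the resulting negative return from
 * dlm_wait_for_lock_mastery by jumping back to redo_request. */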
1143
1144/*
1145 * DLM_MASTER_REQUEST_MSG
1146 *
1147 * returns: 0 on success,
1148 * -errno on a network error
1149 *
1150 * on error, the caller should assume the target node is "dead"
1151 *
1152 */
1153
1154static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
1155{
1156 struct dlm_ctxt *dlm = mle->dlm;
1157 struct dlm_master_request request;
1158 int ret, response=0, resend;
1159
1160 memset(&request, 0, sizeof(request));
1161 request.node_idx = dlm->node_num;
1162
1163 BUG_ON(mle->type == DLM_MLE_MIGRATION);
1164
1165 if (mle->type != DLM_MLE_MASTER) {
1166 request.namelen = mle->u.name.len;
1167 memcpy(request.name, mle->u.name.name, request.namelen);
1168 } else {
1169 request.namelen = mle->u.res->lockname.len;
1170 memcpy(request.name, mle->u.res->lockname.name,
1171 request.namelen);
1172 }
1173
1174again:
1175 ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
1176 sizeof(request), to, &response);
1177 if (ret < 0) {
1178 if (ret == -ESRCH) {
1179 /* should never happen */
1180 mlog(ML_ERROR, "TCP stack not ready!\n");
1181 BUG();
1182 } else if (ret == -EINVAL) {
1183 mlog(ML_ERROR, "bad args passed to o2net!\n");
1184 BUG();
1185 } else if (ret == -ENOMEM) {
1186 mlog(ML_ERROR, "out of memory while trying to send "
1187 "network message! retrying\n");
1188 /* this is totally crude */
1189 msleep(50);
1190 goto again;
1191 } else if (!dlm_is_host_down(ret)) {
1192 /* not a network error. bad. */
1193 mlog_errno(ret);
 1194 mlog(ML_ERROR, "unhandled error!\n");
1195 BUG();
1196 }
1197 /* all other errors should be network errors,
1198 * and likely indicate node death */
1199 mlog(ML_ERROR, "link to %d went down!\n", to);
1200 goto out;
1201 }
1202
1203 ret = 0;
1204 resend = 0;
1205 spin_lock(&mle->spinlock);
1206 switch (response) {
1207 case DLM_MASTER_RESP_YES:
1208 set_bit(to, mle->response_map);
1209 mlog(0, "node %u is the master, response=YES\n", to);
1210 mle->master = to;
1211 break;
1212 case DLM_MASTER_RESP_NO:
1213 mlog(0, "node %u not master, response=NO\n", to);
1214 set_bit(to, mle->response_map);
1215 break;
1216 case DLM_MASTER_RESP_MAYBE:
1217 mlog(0, "node %u not master, response=MAYBE\n", to);
1218 set_bit(to, mle->response_map);
1219 set_bit(to, mle->maybe_map);
1220 break;
1221 case DLM_MASTER_RESP_ERROR:
1222 mlog(0, "node %u hit an error, resending\n", to);
1223 resend = 1;
1224 response = 0;
1225 break;
1226 default:
1227 mlog(ML_ERROR, "bad response! %u\n", response);
1228 BUG();
1229 }
1230 spin_unlock(&mle->spinlock);
1231 if (resend) {
1232 /* this is also totally crude */
1233 msleep(50);
1234 goto again;
1235 }
1236
1237out:
1238 return ret;
1239}
1240
1241/*
1242 * locks that can be taken here:
1243 * dlm->spinlock
1244 * res->spinlock
1245 * mle->spinlock
1246 * dlm->master_list
1247 *
1248 * if possible, TRIM THIS DOWN!!!
1249 */
1250int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
1251{
1252 u8 response = DLM_MASTER_RESP_MAYBE;
1253 struct dlm_ctxt *dlm = data;
1254 struct dlm_lock_resource *res;
1255 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1256 struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1257 char *name;
1258 unsigned int namelen;
1259 int found, ret;
1260 int set_maybe;
1261
1262 if (!dlm_grab(dlm))
1263 return DLM_MASTER_RESP_NO;
1264
1265 if (!dlm_domain_fully_joined(dlm)) {
1266 response = DLM_MASTER_RESP_NO;
1267 goto send_response;
1268 }
1269
1270 name = request->name;
1271 namelen = request->namelen;
1272
1273 if (namelen > DLM_LOCKID_NAME_MAX) {
1274 response = DLM_IVBUFLEN;
1275 goto send_response;
1276 }
1277
1278way_up_top:
1279 spin_lock(&dlm->spinlock);
1280 res = __dlm_lookup_lockres(dlm, name, namelen);
1281 if (res) {
1282 spin_unlock(&dlm->spinlock);
1283
1284 /* take care of the easy cases up front */
1285 spin_lock(&res->spinlock);
1286 if (res->state & DLM_LOCK_RES_RECOVERING) {
1287 spin_unlock(&res->spinlock);
1288 mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1289 "being recovered\n");
1290 response = DLM_MASTER_RESP_ERROR;
1291 if (mle)
1292 kmem_cache_free(dlm_mle_cache, mle);
1293 goto send_response;
1294 }
1295
1296 if (res->owner == dlm->node_num) {
1297 u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP;
1298 spin_unlock(&res->spinlock);
1299 // mlog(0, "this node is the master\n");
1300 response = DLM_MASTER_RESP_YES;
1301 if (mle)
1302 kmem_cache_free(dlm_mle_cache, mle);
1303
1304 /* this node is the owner.
1305 * there is some extra work that needs to
1306 * happen now. the requesting node has
1307 * caused all nodes up to this one to
1308 * create mles. this node now needs to
1309 * go back and clean those up. */
1310 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1311 dlm->node_num, res->lockname.len, res->lockname.name);
1312 ret = dlm_dispatch_assert_master(dlm, res, 1,
1313 request->node_idx,
1314 flags);
1315 if (ret < 0) {
1316 mlog(ML_ERROR, "failed to dispatch assert "
1317 "master work\n");
1318 response = DLM_MASTER_RESP_ERROR;
1319 }
1320 goto send_response;
1321 } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1322 spin_unlock(&res->spinlock);
1323 // mlog(0, "node %u is the master\n", res->owner);
1324 response = DLM_MASTER_RESP_NO;
1325 if (mle)
1326 kmem_cache_free(dlm_mle_cache, mle);
1327 goto send_response;
1328 }
1329
1330 /* ok, there is no owner. either this node is
1331 * being blocked, or it is actively trying to
1332 * master this lock. */
1333 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1334 mlog(ML_ERROR, "lock with no owner should be "
1335 "in-progress!\n");
1336 BUG();
1337 }
1338
1339 // mlog(0, "lockres is in progress...\n");
1340 spin_lock(&dlm->master_lock);
1341 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1342 if (!found) {
1343 mlog(ML_ERROR, "no mle found for this lock!\n");
1344 BUG();
1345 }
1346 set_maybe = 1;
1347 spin_lock(&tmpmle->spinlock);
1348 if (tmpmle->type == DLM_MLE_BLOCK) {
1349 // mlog(0, "this node is waiting for "
1350 // "lockres to be mastered\n");
1351 response = DLM_MASTER_RESP_NO;
1352 } else if (tmpmle->type == DLM_MLE_MIGRATION) {
1353 mlog(0, "node %u is master, but trying to migrate to "
1354 "node %u.\n", tmpmle->master, tmpmle->new_master);
1355 if (tmpmle->master == dlm->node_num) {
1356 response = DLM_MASTER_RESP_YES;
1357 mlog(ML_ERROR, "no owner on lockres, but this "
1358 "node is trying to migrate it to %u?!\n",
1359 tmpmle->new_master);
1360 BUG();
1361 } else {
1362 /* the real master can respond on its own */
1363 response = DLM_MASTER_RESP_NO;
1364 }
 1365 } else if (tmpmle->master != O2NM_MAX_NODES) {
1366 set_maybe = 0;
1367 if (tmpmle->master == dlm->node_num)
1368 response = DLM_MASTER_RESP_YES;
1369 else
1370 response = DLM_MASTER_RESP_NO;
1371 } else {
1372 // mlog(0, "this node is attempting to "
1373 // "master lockres\n");
1374 response = DLM_MASTER_RESP_MAYBE;
1375 }
1376 if (set_maybe)
1377 set_bit(request->node_idx, tmpmle->maybe_map);
1378 spin_unlock(&tmpmle->spinlock);
1379
1380 spin_unlock(&dlm->master_lock);
1381 spin_unlock(&res->spinlock);
1382
1383 /* keep the mle attached to heartbeat events */
1384 dlm_put_mle(tmpmle);
1385 if (mle)
1386 kmem_cache_free(dlm_mle_cache, mle);
1387 goto send_response;
1388 }
1389
1390 /*
1391 * lockres doesn't exist on this node
1392 * if there is an MLE_BLOCK, return NO
1393 * if there is an MLE_MASTER, return MAYBE
1394 * otherwise, add an MLE_BLOCK, return NO
1395 */
1396 spin_lock(&dlm->master_lock);
1397 found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1398 if (!found) {
1399 /* this lockid has never been seen on this node yet */
1400 // mlog(0, "no mle found\n");
1401 if (!mle) {
1402 spin_unlock(&dlm->master_lock);
1403 spin_unlock(&dlm->spinlock);
1404
1405 mle = (struct dlm_master_list_entry *)
1406 kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
1407 if (!mle) {
1408 // bad bad bad... this sucks.
1409 response = DLM_MASTER_RESP_ERROR;
1410 goto send_response;
1411 }
1412 spin_lock(&dlm->spinlock);
1413 dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
1414 name, namelen);
1415 spin_unlock(&dlm->spinlock);
1416 goto way_up_top;
1417 }
1418
1419 // mlog(0, "this is second time thru, already allocated, "
1420 // "add the block.\n");
1421 set_bit(request->node_idx, mle->maybe_map);
1422 list_add(&mle->list, &dlm->master_list);
1423 response = DLM_MASTER_RESP_NO;
1424 } else {
1425 // mlog(0, "mle was found\n");
1426 set_maybe = 1;
1427 spin_lock(&tmpmle->spinlock);
1428 if (tmpmle->type == DLM_MLE_BLOCK)
1429 response = DLM_MASTER_RESP_NO;
1430 else if (tmpmle->type == DLM_MLE_MIGRATION) {
1431 mlog(0, "migration mle was found (%u->%u)\n",
1432 tmpmle->master, tmpmle->new_master);
1433 if (tmpmle->master == dlm->node_num) {
1434 mlog(ML_ERROR, "no lockres, but migration mle "
1435 "says that this node is master!\n");
1436 BUG();
1437 }
1438 /* real master can respond on its own */
1439 response = DLM_MASTER_RESP_NO;
1440 } else {
1441 if (tmpmle->master == dlm->node_num) {
1442 response = DLM_MASTER_RESP_YES;
1443 set_maybe = 0;
1444 } else
1445 response = DLM_MASTER_RESP_MAYBE;
1446 }
1447 if (set_maybe)
1448 set_bit(request->node_idx, tmpmle->maybe_map);
1449 spin_unlock(&tmpmle->spinlock);
1450 }
1451 spin_unlock(&dlm->master_lock);
1452 spin_unlock(&dlm->spinlock);
1453
1454 if (found) {
1455 /* keep the mle attached to heartbeat events */
1456 dlm_put_mle(tmpmle);
1457 }
1458send_response:
1459 dlm_put(dlm);
1460 return response;
1461}
1462
1463/*
1464 * DLM_ASSERT_MASTER_MSG
1465 */
1466
1467
1468/*
1469 * NOTE: this can be used for debugging
1470 * can periodically run all locks owned by this node
1471 * and re-assert across the cluster...
1472 */
1473static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
1474 unsigned int namelen, void *nodemap,
1475 u32 flags)
1476{
1477 struct dlm_assert_master assert;
1478 int to, tmpret;
1479 struct dlm_node_iter iter;
1480 int ret = 0;
1481
1482 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
1483
1484 /* note that if this nodemap is empty, it returns 0 */
1485 dlm_node_iter_init(nodemap, &iter);
1486 while ((to = dlm_node_iter_next(&iter)) >= 0) {
1487 int r = 0;
1488 mlog(0, "sending assert master to %d (%.*s)\n", to,
1489 namelen, lockname);
1490 memset(&assert, 0, sizeof(assert));
1491 assert.node_idx = dlm->node_num;
1492 assert.namelen = namelen;
1493 memcpy(assert.name, lockname, namelen);
1494 assert.flags = cpu_to_be32(flags);
1495
1496 tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1497 &assert, sizeof(assert), to, &r);
1498 if (tmpret < 0) {
1499 mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
1500 if (!dlm_is_host_down(tmpret)) {
1501 mlog(ML_ERROR, "unhandled error!\n");
1502 BUG();
1503 }
1504 /* a node died. finish out the rest of the nodes. */
1505 mlog(ML_ERROR, "link to %d went down!\n", to);
1506 /* any nonzero status return will do */
1507 ret = tmpret;
1508 } else if (r < 0) {
 1509 /* ok, something is horribly messed up. kill thyself. */
1510 mlog(ML_ERROR,"during assert master of %.*s to %u, "
1511 "got %d.\n", namelen, lockname, to, r);
1512 dlm_dump_lock_resources(dlm);
1513 BUG();
1514 }
1515 }
1516
1517 return ret;
1518}
1519
1520/*
1521 * locks that can be taken here:
1522 * dlm->spinlock
1523 * res->spinlock
1524 * mle->spinlock
1525 * dlm->master_list
1526 *
1527 * if possible, TRIM THIS DOWN!!!
1528 */
1529int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
1530{
1531 struct dlm_ctxt *dlm = data;
1532 struct dlm_master_list_entry *mle = NULL;
1533 struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1534 struct dlm_lock_resource *res = NULL;
1535 char *name;
1536 unsigned int namelen;
1537 u32 flags;
1538
1539 if (!dlm_grab(dlm))
1540 return 0;
1541
1542 name = assert->name;
1543 namelen = assert->namelen;
1544 flags = be32_to_cpu(assert->flags);
1545
1546 if (namelen > DLM_LOCKID_NAME_MAX) {
 1547 mlog(ML_ERROR, "Invalid name length!\n");
1548 goto done;
1549 }
1550
1551 spin_lock(&dlm->spinlock);
1552
1553 if (flags)
1554 mlog(0, "assert_master with flags: %u\n", flags);
1555
1556 /* find the MLE */
1557 spin_lock(&dlm->master_lock);
1558 if (!dlm_find_mle(dlm, &mle, name, namelen)) {
1559 /* not an error, could be master just re-asserting */
1560 mlog(0, "just got an assert_master from %u, but no "
1561 "MLE for it! (%.*s)\n", assert->node_idx,
1562 namelen, name);
1563 } else {
1564 int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
1565 if (bit >= O2NM_MAX_NODES) {
1566 /* not necessarily an error, though less likely.
1567 * could be master just re-asserting. */
1568 mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
1569 "is asserting! (%.*s)\n", assert->node_idx,
1570 namelen, name);
1571 } else if (bit != assert->node_idx) {
1572 if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1573 mlog(0, "master %u was found, %u should "
1574 "back off\n", assert->node_idx, bit);
1575 } else {
1576 /* with the fix for bug 569, a higher node
1577 * number winning the mastery will respond
1578 * YES to mastery requests, but this node
1579 * had no way of knowing. let it pass. */
1580 mlog(ML_ERROR, "%u is the lowest node, "
1581 "%u is asserting. (%.*s) %u must "
1582 "have begun after %u won.\n", bit,
1583 assert->node_idx, namelen, name, bit,
1584 assert->node_idx);
1585 }
1586 }
1587 }
1588 spin_unlock(&dlm->master_lock);
1589
1590 /* ok everything checks out with the MLE
1591 * now check to see if there is a lockres */
1592 res = __dlm_lookup_lockres(dlm, name, namelen);
1593 if (res) {
1594 spin_lock(&res->spinlock);
1595 if (res->state & DLM_LOCK_RES_RECOVERING) {
1596 mlog(ML_ERROR, "%u asserting but %.*s is "
1597 "RECOVERING!\n", assert->node_idx, namelen, name);
1598 goto kill;
1599 }
1600 if (!mle) {
1601 if (res->owner != assert->node_idx) {
1602 mlog(ML_ERROR, "assert_master from "
1603 "%u, but current owner is "
1604 "%u! (%.*s)\n",
1605 assert->node_idx, res->owner,
1606 namelen, name);
1607 goto kill;
1608 }
1609 } else if (mle->type != DLM_MLE_MIGRATION) {
1610 if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1611 /* owner is just re-asserting */
1612 if (res->owner == assert->node_idx) {
1613 mlog(0, "owner %u re-asserting on "
1614 "lock %.*s\n", assert->node_idx,
1615 namelen, name);
1616 goto ok;
1617 }
1618 mlog(ML_ERROR, "got assert_master from "
1619 "node %u, but %u is the owner! "
1620 "(%.*s)\n", assert->node_idx,
1621 res->owner, namelen, name);
1622 goto kill;
1623 }
1624 if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1625 mlog(ML_ERROR, "got assert from %u, but lock "
1626 "with no owner should be "
1627 "in-progress! (%.*s)\n",
1628 assert->node_idx,
1629 namelen, name);
1630 goto kill;
1631 }
1632 } else /* mle->type == DLM_MLE_MIGRATION */ {
1633 /* should only be getting an assert from new master */
1634 if (assert->node_idx != mle->new_master) {
1635 mlog(ML_ERROR, "got assert from %u, but "
1636 "new master is %u, and old master "
1637 "was %u (%.*s)\n",
1638 assert->node_idx, mle->new_master,
1639 mle->master, namelen, name);
1640 goto kill;
1641 }
1642
1643 }
1644ok:
1645 spin_unlock(&res->spinlock);
1646 }
1647 spin_unlock(&dlm->spinlock);
1648
1649 // mlog(0, "woo! got an assert_master from node %u!\n",
1650 // assert->node_idx);
1651 if (mle) {
1652 int extra_ref;
1653
1654 spin_lock(&mle->spinlock);
1655 extra_ref = !!(mle->type == DLM_MLE_BLOCK
1656 || mle->type == DLM_MLE_MIGRATION);
1657 mle->master = assert->node_idx;
1658 atomic_set(&mle->woken, 1);
1659 wake_up(&mle->wq);
1660 spin_unlock(&mle->spinlock);
1661
1662 if (mle->type == DLM_MLE_MIGRATION && res) {
1663 mlog(0, "finishing off migration of lockres %.*s, "
1664 "from %u to %u\n",
1665 res->lockname.len, res->lockname.name,
1666 dlm->node_num, mle->new_master);
1667 spin_lock(&res->spinlock);
1668 res->state &= ~DLM_LOCK_RES_MIGRATING;
1669 dlm_change_lockres_owner(dlm, res, mle->new_master);
1670 BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1671 spin_unlock(&res->spinlock);
1672 }
1673 /* master is known, detach if not already detached */
1674 dlm_mle_detach_hb_events(dlm, mle);
1675 dlm_put_mle(mle);
1676
1677 if (extra_ref) {
1678 /* the assert master message now balances the extra
1679 * ref given by the master / migration request message.
1680 * if this is the last put, it will be removed
1681 * from the list. */
1682 dlm_put_mle(mle);
1683 }
1684 }
1685
1686done:
1687 if (res)
1688 dlm_lockres_put(res);
1689 dlm_put(dlm);
1690 return 0;
1691
1692kill:
1693 /* kill the caller! */
1694 spin_unlock(&res->spinlock);
1695 spin_unlock(&dlm->spinlock);
1696 dlm_lockres_put(res);
1697 mlog(ML_ERROR, "Bad message received from another node. Dumping state "
1698 "and killing the other node now! This node is OK and can continue.\n");
1699 dlm_dump_lock_resources(dlm);
1700 dlm_put(dlm);
1701 return -EINVAL;
1702}
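/*
 * Editorial sketch, not part of the original source: the lock nesting
 * that the handler above relies on (per its "locks that can be taken
 * here" comment). dlm->spinlock is outermost, dlm->master_lock and
 * res->spinlock nest inside it, and mle->spinlock is only taken after
 * the others have been dropped. The helper name is hypothetical.
 */
static void assert_master_lock_order_sketch(struct dlm_ctxt *dlm,
					    struct dlm_lock_resource *res)
{
	spin_lock(&dlm->spinlock);		/* outermost: domain state */
	spin_lock(&dlm->master_lock);		/* nested: MLE list lookup */
	spin_unlock(&dlm->master_lock);
	spin_lock(&res->spinlock);		/* nested: lockres state */
	spin_unlock(&res->spinlock);
	spin_unlock(&dlm->spinlock);
}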
1703
1704int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
1705 struct dlm_lock_resource *res,
1706 int ignore_higher, u8 request_from, u32 flags)
1707{
1708 struct dlm_work_item *item;
1709 item = kcalloc(1, sizeof(*item), GFP_KERNEL);
1710 if (!item)
1711 return -ENOMEM;
1712
1713
1714 /* queue up work for dlm_assert_master_worker */
1715 dlm_grab(dlm); /* get an extra ref for the work item */
1716 dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
1717 item->u.am.lockres = res; /* already have a ref */
1718 /* can optionally ignore node numbers higher than this node */
1719 item->u.am.ignore_higher = ignore_higher;
1720 item->u.am.request_from = request_from;
1721 item->u.am.flags = flags;
1722
1723 spin_lock(&dlm->work_lock);
1724 list_add_tail(&item->list, &dlm->work_list);
1725 spin_unlock(&dlm->work_lock);
1726
1727 schedule_work(&dlm->dispatched_work);
1728 return 0;
1729}
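/*
 * Editorial sketch of a hypothetical caller, not from the original
 * source: dlm_dispatch_assert_master() takes over a reference on res
 * ("already have a ref" above), so a caller that wants to keep its own
 * reference must take an extra one first. dlm_lockres_get() is assumed
 * here as the ref-taking counterpart of dlm_lockres_put().
 */
static int dispatch_assert_sketch(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res, u8 from)
{
	dlm_lockres_get(res);	/* this ref is handed to the work item */
	return dlm_dispatch_assert_master(dlm, res, 0, from,
					  DLM_ASSERT_MASTER_MLE_CLEANUP);
}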
1730
1731static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
1732{
1733 struct dlm_ctxt *dlm = data;
1734 int ret = 0;
1735 struct dlm_lock_resource *res;
1736 unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
1737 int ignore_higher;
1738 int bit;
1739 u8 request_from;
1740 u32 flags;
1741
1742 dlm = item->dlm;
1743 res = item->u.am.lockres;
1744 ignore_higher = item->u.am.ignore_higher;
1745 request_from = item->u.am.request_from;
1746 flags = item->u.am.flags;
1747
1748 spin_lock(&dlm->spinlock);
1749 memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
1750 spin_unlock(&dlm->spinlock);
1751
1752 clear_bit(dlm->node_num, nodemap);
1753 if (ignore_higher) {
1754		/* if this is just to clear up mles for nodes below
1755 * this node, do not send the message to the original
1756 * caller or any node number higher than this */
1757 clear_bit(request_from, nodemap);
1758 bit = dlm->node_num;
1759 while (1) {
1760 bit = find_next_bit(nodemap, O2NM_MAX_NODES,
1761 bit+1);
1762 if (bit >= O2NM_MAX_NODES)
1763 break;
1764 clear_bit(bit, nodemap);
1765 }
1766 }
1767
1768 /* this call now finishes out the nodemap
1769 * even if one or more nodes die */
1770 mlog(0, "worker about to master %.*s here, this=%u\n",
1771 res->lockname.len, res->lockname.name, dlm->node_num);
1772 ret = dlm_do_assert_master(dlm, res->lockname.name,
1773 res->lockname.len,
1774 nodemap, flags);
1775 if (ret < 0) {
1776 /* no need to restart, we are done */
1777 mlog_errno(ret);
1778 }
1779
1780 dlm_lockres_put(res);
1781
1782 mlog(0, "finished with dlm_assert_master_worker\n");
1783}
1784
1785
1786/*
1787 * DLM_MIGRATE_LOCKRES
1788 */
1789
1790
1791int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1792 u8 target)
1793{
1794 struct dlm_master_list_entry *mle = NULL;
1795 struct dlm_master_list_entry *oldmle = NULL;
1796 struct dlm_migratable_lockres *mres = NULL;
1797 int ret = -EINVAL;
1798 const char *name;
1799 unsigned int namelen;
1800 int mle_added = 0;
1801 struct list_head *queue, *iter;
1802 int i;
1803 struct dlm_lock *lock;
1804 int empty = 1;
1805
1806 if (!dlm_grab(dlm))
1807 return -EINVAL;
1808
1809 name = res->lockname.name;
1810 namelen = res->lockname.len;
1811
1812 mlog(0, "migrating %.*s to %u\n", namelen, name, target);
1813
1814 /*
1815 * ensure this lockres is a proper candidate for migration
1816 */
1817 spin_lock(&res->spinlock);
1818 if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
1819 mlog(0, "cannot migrate lockres with unknown owner!\n");
1820 spin_unlock(&res->spinlock);
1821 goto leave;
1822 }
1823 if (res->owner != dlm->node_num) {
1824 mlog(0, "cannot migrate lockres this node doesn't own!\n");
1825 spin_unlock(&res->spinlock);
1826 goto leave;
1827 }
1828 mlog(0, "checking queues...\n");
1829 queue = &res->granted;
1830 for (i=0; i<3; i++) {
1831 list_for_each(iter, queue) {
1832 lock = list_entry (iter, struct dlm_lock, list);
1833 empty = 0;
1834 if (lock->ml.node == dlm->node_num) {
1835 mlog(0, "found a lock owned by this node "
1836 "still on the %s queue! will not "
1837 "migrate this lockres\n",
1838 i==0 ? "granted" :
1839 (i==1 ? "converting" : "blocked"));
1840 spin_unlock(&res->spinlock);
1841 ret = -ENOTEMPTY;
1842 goto leave;
1843 }
1844 }
1845 queue++;
1846 }
1847 mlog(0, "all locks on this lockres are nonlocal. continuing\n");
1848 spin_unlock(&res->spinlock);
1849
1850 /* no work to do */
1851 if (empty) {
1852 mlog(0, "no locks were found on this lockres! done!\n");
1853 ret = 0;
1854 goto leave;
1855 }
1856
1857 /*
1858 * preallocate up front
1859 * if this fails, abort
1860 */
1861
1862 ret = -ENOMEM;
1863 mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
1864 if (!mres) {
1865 mlog_errno(ret);
1866 goto leave;
1867 }
1868
1869 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
1870 GFP_KERNEL);
1871 if (!mle) {
1872 mlog_errno(ret);
1873 goto leave;
1874 }
1875 ret = 0;
1876
1877 /*
1878 * find a node to migrate the lockres to
1879 */
1880
1881 mlog(0, "picking a migration node\n");
1882 spin_lock(&dlm->spinlock);
1883 /* pick a new node */
1884 if (!test_bit(target, dlm->domain_map) ||
1885 target >= O2NM_MAX_NODES) {
1886 target = dlm_pick_migration_target(dlm, res);
1887 }
1888 mlog(0, "node %u chosen for migration\n", target);
1889
1890 if (target >= O2NM_MAX_NODES ||
1891 !test_bit(target, dlm->domain_map)) {
1892 /* target chosen is not alive */
1893 ret = -EINVAL;
1894 }
1895
1896 if (ret) {
1897 spin_unlock(&dlm->spinlock);
1898 goto fail;
1899 }
1900
1901 mlog(0, "continuing with target = %u\n", target);
1902
1903 /*
1904 * clear any existing master requests and
1905 * add the migration mle to the list
1906 */
1907 spin_lock(&dlm->master_lock);
1908 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
1909 namelen, target, dlm->node_num);
1910 spin_unlock(&dlm->master_lock);
1911 spin_unlock(&dlm->spinlock);
1912
1913 if (ret == -EEXIST) {
1914 mlog(0, "another process is already migrating it\n");
1915 goto fail;
1916 }
1917 mle_added = 1;
1918
1919 /*
1920 * set the MIGRATING flag and flush asts
1921 * if we fail after this we need to re-dirty the lockres
1922 */
1923 if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
1924 mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
1925 "the target went down.\n", res->lockname.len,
1926 res->lockname.name, target);
1927 spin_lock(&res->spinlock);
1928 res->state &= ~DLM_LOCK_RES_MIGRATING;
1929 spin_unlock(&res->spinlock);
1930 ret = -EINVAL;
1931 }
1932
1933fail:
1934 if (oldmle) {
1935 /* master is known, detach if not already detached */
1936 dlm_mle_detach_hb_events(dlm, oldmle);
1937 dlm_put_mle(oldmle);
1938 }
1939
1940 if (ret < 0) {
1941 if (mle_added) {
1942 dlm_mle_detach_hb_events(dlm, mle);
1943 dlm_put_mle(mle);
1944 } else if (mle) {
1945 kmem_cache_free(dlm_mle_cache, mle);
1946 }
1947 goto leave;
1948 }
1949
1950 /*
1951 * at this point, we have a migration target, an mle
1952 * in the master list, and the MIGRATING flag set on
1953 * the lockres
1954 */
1955
1956
1957 /* get an extra reference on the mle.
1958 * otherwise the assert_master from the new
1959 * master will destroy this.
1960 * also, make sure that all callers of dlm_get_mle
1961 * take both dlm->spinlock and dlm->master_lock */
1962 spin_lock(&dlm->spinlock);
1963 spin_lock(&dlm->master_lock);
1964 dlm_get_mle(mle);
1965 spin_unlock(&dlm->master_lock);
1966 spin_unlock(&dlm->spinlock);
1967
1968 /* notify new node and send all lock state */
1969 /* call send_one_lockres with migration flag.
1970 * this serves as notice to the target node that a
1971 * migration is starting. */
1972 ret = dlm_send_one_lockres(dlm, res, mres, target,
1973 DLM_MRES_MIGRATION);
1974
1975 if (ret < 0) {
1976 mlog(0, "migration to node %u failed with %d\n",
1977 target, ret);
1978 /* migration failed, detach and clean up mle */
1979 dlm_mle_detach_hb_events(dlm, mle);
1980 dlm_put_mle(mle);
1981 dlm_put_mle(mle);
1982 goto leave;
1983 }
1984
1985 /* at this point, the target sends a message to all nodes,
1986 * (using dlm_do_migrate_request). this node is skipped since
1987 * we had to put an mle in the list to begin the process. this
1988 * node now waits for target to do an assert master. this node
1989 * will be the last one notified, ensuring that the migration
1990 * is complete everywhere. if the target dies while this is
1991 * going on, some nodes could potentially see the target as the
1992 * master, so it is important that my recovery finds the migration
1993	 * mle and sets the master to UNKNOWN. */
1994
1995
1996 /* wait for new node to assert master */
1997 while (1) {
1998 ret = wait_event_interruptible_timeout(mle->wq,
1999 (atomic_read(&mle->woken) == 1),
2000 msecs_to_jiffies(5000));
2001
2002 if (ret >= 0) {
2003 if (atomic_read(&mle->woken) == 1 ||
2004 res->owner == target)
2005 break;
2006
2007 mlog(0, "timed out during migration\n");
2008 }
2009 if (ret == -ERESTARTSYS) {
2010 /* migration failed, detach and clean up mle */
2011 dlm_mle_detach_hb_events(dlm, mle);
2012 dlm_put_mle(mle);
2013 dlm_put_mle(mle);
2014 goto leave;
2015 }
2016 /* TODO: if node died: stop, clean up, return error */
2017 }
2018
2019 /* all done, set the owner, clear the flag */
2020 spin_lock(&res->spinlock);
2021 dlm_set_lockres_owner(dlm, res, target);
2022 res->state &= ~DLM_LOCK_RES_MIGRATING;
2023 dlm_remove_nonlocal_locks(dlm, res);
2024 spin_unlock(&res->spinlock);
2025 wake_up(&res->wq);
2026
2027 /* master is known, detach if not already detached */
2028 dlm_mle_detach_hb_events(dlm, mle);
2029 dlm_put_mle(mle);
2030 ret = 0;
2031
2032 dlm_lockres_calc_usage(dlm, res);
2033
2034leave:
2035 /* re-dirty the lockres if we failed */
2036 if (ret < 0)
2037 dlm_kick_thread(dlm, res);
2038
2039 /* TODO: cleanup */
2040 if (mres)
2041 free_page((unsigned long)mres);
2042
2043 dlm_put(dlm);
2044
2045 mlog(0, "returning %d\n", ret);
2046 return ret;
2047}
2048EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
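/*
 * Editorial usage sketch (hypothetical, not from the original source):
 * passing an out-of-range target makes dlm_migrate_lockres() choose a
 * live node by itself via dlm_pick_migration_target(), per the
 * target/domain_map check inside the function above.
 */
static int migrate_anywhere_sketch(struct dlm_ctxt *dlm,
				   struct dlm_lock_resource *res)
{
	/* O2NM_MAX_NODES is never a valid node number */
	return dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES);
}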
2049
2050int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2051{
2052 int ret;
2053 spin_lock(&dlm->ast_lock);
2054 spin_lock(&lock->spinlock);
2055 ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
2056 spin_unlock(&lock->spinlock);
2057 spin_unlock(&dlm->ast_lock);
2058 return ret;
2059}
2060
2061static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2062 struct dlm_lock_resource *res,
2063 u8 mig_target)
2064{
2065 int can_proceed;
2066 spin_lock(&res->spinlock);
2067 can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2068 spin_unlock(&res->spinlock);
2069
2070 /* target has died, so make the caller break out of the
2071 * wait_event, but caller must recheck the domain_map */
2072 spin_lock(&dlm->spinlock);
2073 if (!test_bit(mig_target, dlm->domain_map))
2074 can_proceed = 1;
2075 spin_unlock(&dlm->spinlock);
2076 return can_proceed;
2077}
2078
2079int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2080{
2081 int ret;
2082 spin_lock(&res->spinlock);
2083 ret = !!(res->state & DLM_LOCK_RES_DIRTY);
2084 spin_unlock(&res->spinlock);
2085 return ret;
2086}
2087
2088
2089static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2090 struct dlm_lock_resource *res,
2091 u8 target)
2092{
2093 int ret = 0;
2094
2095 mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
2096 res->lockname.len, res->lockname.name, dlm->node_num,
2097 target);
2098 /* need to set MIGRATING flag on lockres. this is done by
2099 * ensuring that all asts have been flushed for this lockres. */
2100 spin_lock(&res->spinlock);
2101 BUG_ON(res->migration_pending);
2102 res->migration_pending = 1;
2103 /* strategy is to reserve an extra ast then release
2104 * it below, letting the release do all of the work */
2105 __dlm_lockres_reserve_ast(res);
2106 spin_unlock(&res->spinlock);
2107
2108 /* now flush all the pending asts.. hang out for a bit */
2109 dlm_kick_thread(dlm, res);
2110 wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2111 dlm_lockres_release_ast(dlm, res);
2112
2113 mlog(0, "about to wait on migration_wq, dirty=%s\n",
2114 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
2115 /* if the extra ref we just put was the final one, this
2116 * will pass thru immediately. otherwise, we need to wait
2117 * for the last ast to finish. */
2118again:
2119 ret = wait_event_interruptible_timeout(dlm->migration_wq,
2120 dlm_migration_can_proceed(dlm, res, target),
2121 msecs_to_jiffies(1000));
2122 if (ret < 0) {
2123 mlog(0, "woken again: migrating? %s, dead? %s\n",
2124 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2125 test_bit(target, dlm->domain_map) ? "no":"yes");
2126 } else {
2127 mlog(0, "all is well: migrating? %s, dead? %s\n",
2128 res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2129 test_bit(target, dlm->domain_map) ? "no":"yes");
2130 }
2131 if (!dlm_migration_can_proceed(dlm, res, target)) {
2132 mlog(0, "trying again...\n");
2133 goto again;
2134 }
2135
2136 /* did the target go down or die? */
2137 spin_lock(&dlm->spinlock);
2138 if (!test_bit(target, dlm->domain_map)) {
2139 mlog(ML_ERROR, "aha. migration target %u just went down\n",
2140 target);
2141 ret = -EHOSTDOWN;
2142 }
2143 spin_unlock(&dlm->spinlock);
2144
2145 /*
2146 * at this point:
2147 *
2148 * o the DLM_LOCK_RES_MIGRATING flag is set
2149 * o there are no pending asts on this lockres
2150 * o all processes trying to reserve an ast on this
2151 * lockres must wait for the MIGRATING flag to clear
2152 */
2153 return ret;
2154}
2155
2156/* last step in the migration process.
2157 * original master calls this to free all of the dlm_lock
2158 * structures that used to be for other nodes. */
2159static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2160 struct dlm_lock_resource *res)
2161{
2162 struct list_head *iter, *iter2;
2163 struct list_head *queue = &res->granted;
2164 int i;
2165 struct dlm_lock *lock;
2166
2167 assert_spin_locked(&res->spinlock);
2168
2169 BUG_ON(res->owner == dlm->node_num);
2170
2171 for (i=0; i<3; i++) {
2172 list_for_each_safe(iter, iter2, queue) {
2173 lock = list_entry (iter, struct dlm_lock, list);
2174 if (lock->ml.node != dlm->node_num) {
2175 mlog(0, "putting lock for node %u\n",
2176 lock->ml.node);
2177 /* be extra careful */
2178 BUG_ON(!list_empty(&lock->ast_list));
2179 BUG_ON(!list_empty(&lock->bast_list));
2180 BUG_ON(lock->ast_pending);
2181 BUG_ON(lock->bast_pending);
2182 list_del_init(&lock->list);
2183 dlm_lock_put(lock);
2184 }
2185 }
2186 queue++;
2187 }
2188}
2189
2190/* for now this is not too intelligent. we will
2191 * need stats to make this do the right thing.
2192 * this just finds the first lock on one of the
2193 * queues and uses that node as the target. */
2194static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2195 struct dlm_lock_resource *res)
2196{
2197 int i;
2198 struct list_head *queue = &res->granted;
2199 struct list_head *iter;
2200 struct dlm_lock *lock;
2201 int nodenum;
2202
2203 assert_spin_locked(&dlm->spinlock);
2204
2205 spin_lock(&res->spinlock);
2206 for (i=0; i<3; i++) {
2207 list_for_each(iter, queue) {
2208 /* up to the caller to make sure this node
2209 * is alive */
2210 lock = list_entry (iter, struct dlm_lock, list);
2211 if (lock->ml.node != dlm->node_num) {
2212 spin_unlock(&res->spinlock);
2213 return lock->ml.node;
2214 }
2215 }
2216 queue++;
2217 }
2218 spin_unlock(&res->spinlock);
2219 mlog(0, "have not found a suitable target yet! checking domain map\n");
2220
2221 /* ok now we're getting desperate. pick anyone alive. */
2222 nodenum = -1;
2223 while (1) {
2224 nodenum = find_next_bit(dlm->domain_map,
2225 O2NM_MAX_NODES, nodenum+1);
2226 mlog(0, "found %d in domain map\n", nodenum);
2227 if (nodenum >= O2NM_MAX_NODES)
2228 break;
2229 if (nodenum != dlm->node_num) {
2230 mlog(0, "picking %d\n", nodenum);
2231 return nodenum;
2232 }
2233 }
2234
2235 mlog(0, "giving up. no master to migrate to\n");
2236 return DLM_LOCK_RES_OWNER_UNKNOWN;
2237}
2238
2239
2240
2241/* this is called by the new master once all lockres
2242 * data has been received */
2243static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2244 struct dlm_lock_resource *res,
2245 u8 master, u8 new_master,
2246 struct dlm_node_iter *iter)
2247{
2248 struct dlm_migrate_request migrate;
2249 int ret, status = 0;
2250 int nodenum;
2251
2252 memset(&migrate, 0, sizeof(migrate));
2253 migrate.namelen = res->lockname.len;
2254 memcpy(migrate.name, res->lockname.name, migrate.namelen);
2255 migrate.new_master = new_master;
2256 migrate.master = master;
2257
2258 ret = 0;
2259
2260 /* send message to all nodes, except the master and myself */
2261 while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
2262 if (nodenum == master ||
2263 nodenum == new_master)
2264 continue;
2265
2266 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2267 &migrate, sizeof(migrate), nodenum,
2268 &status);
2269 if (ret < 0)
2270 mlog_errno(ret);
2271 else if (status < 0) {
2272 mlog(0, "migrate request (node %u) returned %d!\n",
2273 nodenum, status);
2274 ret = status;
2275 }
2276 }
2277
2278 if (ret < 0)
2279 mlog_errno(ret);
2280
2281 mlog(0, "returning ret=%d\n", ret);
2282 return ret;
2283}
2284
2285
2286/* if there is an existing mle for this lockres, we now know who the master is.
2287 * (the one who sent us *this* message) we can clear it up right away.
2288 * since the process that put the mle on the list still has a reference to it,
2289 * we can unhash it now, set the master and wake the process. as a result,
2290 * we will have no mle in the list to start with. now we can add an mle for
2291 * the migration and this should be the only one found for those scanning the
2292 * list. */
2293int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
2294{
2295 struct dlm_ctxt *dlm = data;
2296 struct dlm_lock_resource *res = NULL;
2297 struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
2298 struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
2299 const char *name;
2300 unsigned int namelen;
2301 int ret = 0;
2302
2303 if (!dlm_grab(dlm))
2304 return -EINVAL;
2305
2306 name = migrate->name;
2307 namelen = migrate->namelen;
2308
2309 /* preallocate.. if this fails, abort */
2310 mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
2311 GFP_KERNEL);
2312
2313 if (!mle) {
2314 ret = -ENOMEM;
2315 goto leave;
2316 }
2317
2318 /* check for pre-existing lock */
2319 spin_lock(&dlm->spinlock);
2320 res = __dlm_lookup_lockres(dlm, name, namelen);
2321 spin_lock(&dlm->master_lock);
2322
2323 if (res) {
2324 spin_lock(&res->spinlock);
2325 if (res->state & DLM_LOCK_RES_RECOVERING) {
2326 /* if all is working ok, this can only mean that we got
2327 * a migrate request from a node that we now see as
2328 * dead. what can we do here? drop it to the floor? */
2329 spin_unlock(&res->spinlock);
2330 mlog(ML_ERROR, "Got a migrate request, but the "
2331			"lockres is marked as recovering!\n");
2332 kmem_cache_free(dlm_mle_cache, mle);
2333 ret = -EINVAL; /* need a better solution */
2334 goto unlock;
2335 }
2336 res->state |= DLM_LOCK_RES_MIGRATING;
2337 spin_unlock(&res->spinlock);
2338 }
2339
2340 /* ignore status. only nonzero status would BUG. */
2341 ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
2342 name, namelen,
2343 migrate->new_master,
2344 migrate->master);
2345
2346unlock:
2347 spin_unlock(&dlm->master_lock);
2348 spin_unlock(&dlm->spinlock);
2349
2350 if (oldmle) {
2351 /* master is known, detach if not already detached */
2352 dlm_mle_detach_hb_events(dlm, oldmle);
2353 dlm_put_mle(oldmle);
2354 }
2355
2356 if (res)
2357 dlm_lockres_put(res);
2358leave:
2359 dlm_put(dlm);
2360 return ret;
2361}
2362
2363/* must be holding dlm->spinlock and dlm->master_lock
2364 * when adding a migration mle, we can clear any other mles
2365 * in the master list because we know with certainty that
2366 * the master is "master". so we remove any old mle from
2367 * the list after setting its master field, and then add
2368 * the new migration mle. this way we can hold to the rule
2369 * of having only one mle for a given lock name at all times. */
2370static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
2371 struct dlm_lock_resource *res,
2372 struct dlm_master_list_entry *mle,
2373 struct dlm_master_list_entry **oldmle,
2374 const char *name, unsigned int namelen,
2375 u8 new_master, u8 master)
2376{
2377 int found;
2378 int ret = 0;
2379
2380 *oldmle = NULL;
2381
2382 mlog_entry_void();
2383
2384 assert_spin_locked(&dlm->spinlock);
2385 assert_spin_locked(&dlm->master_lock);
2386
2387 /* caller is responsible for any ref taken here on oldmle */
2388 found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
2389 if (found) {
2390 struct dlm_master_list_entry *tmp = *oldmle;
2391 spin_lock(&tmp->spinlock);
2392 if (tmp->type == DLM_MLE_MIGRATION) {
2393 if (master == dlm->node_num) {
2394 /* ah another process raced me to it */
2395 mlog(0, "tried to migrate %.*s, but some "
2396 "process beat me to it\n",
2397 namelen, name);
2398 ret = -EEXIST;
2399 } else {
2400 /* bad. 2 NODES are trying to migrate! */
2401 mlog(ML_ERROR, "migration error mle: "
2402 "master=%u new_master=%u // request: "
2403 "master=%u new_master=%u // "
2404 "lockres=%.*s\n",
2405 tmp->master, tmp->new_master,
2406 master, new_master,
2407 namelen, name);
2408 BUG();
2409 }
2410 } else {
2411 /* this is essentially what assert_master does */
2412 tmp->master = master;
2413 atomic_set(&tmp->woken, 1);
2414 wake_up(&tmp->wq);
2415 /* remove it from the list so that only one
2416 * mle will be found */
2417 list_del_init(&tmp->list);
2418 }
2419 spin_unlock(&tmp->spinlock);
2420 }
2421
2422 /* now add a migration mle to the tail of the list */
2423 dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
2424 mle->new_master = new_master;
2425 mle->master = master;
2426 /* do this for consistency with other mle types */
2427 set_bit(new_master, mle->maybe_map);
2428 list_add(&mle->list, &dlm->master_list);
2429
2430 return ret;
2431}
2432
2433
2434void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
2435{
2436 struct list_head *iter, *iter2;
2437 struct dlm_master_list_entry *mle;
2438 struct dlm_lock_resource *res;
2439
2440 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
2441top:
2442 assert_spin_locked(&dlm->spinlock);
2443
2444 /* clean the master list */
2445 spin_lock(&dlm->master_lock);
2446 list_for_each_safe(iter, iter2, &dlm->master_list) {
2447 mle = list_entry(iter, struct dlm_master_list_entry, list);
2448
2449 BUG_ON(mle->type != DLM_MLE_BLOCK &&
2450 mle->type != DLM_MLE_MASTER &&
2451 mle->type != DLM_MLE_MIGRATION);
2452
2453 /* MASTER mles are initiated locally. the waiting
2454 * process will notice the node map change
2455 * shortly. let that happen as normal. */
2456 if (mle->type == DLM_MLE_MASTER)
2457 continue;
2458
2459
2460 /* BLOCK mles are initiated by other nodes.
2461 * need to clean up if the dead node would have
2462 * been the master. */
2463 if (mle->type == DLM_MLE_BLOCK) {
2464 int bit;
2465
2466 spin_lock(&mle->spinlock);
2467 bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
2468 if (bit != dead_node) {
2469 mlog(0, "mle found, but dead node %u would "
2470 "not have been master\n", dead_node);
2471 spin_unlock(&mle->spinlock);
2472 } else {
2473 /* must drop the refcount by one since the
2474 * assert_master will never arrive. this
2475 * may result in the mle being unlinked and
2476 * freed, but there may still be a process
2477 * waiting in the dlmlock path which is fine. */
2478 mlog(ML_ERROR, "node %u was expected master\n",
2479 dead_node);
2480 atomic_set(&mle->woken, 1);
2481 spin_unlock(&mle->spinlock);
2482 wake_up(&mle->wq);
2483 /* final put will take care of list removal */
2484 __dlm_put_mle(mle);
2485 }
2486 continue;
2487 }
2488
2489 /* everything else is a MIGRATION mle */
2490
2491 /* the rule for MIGRATION mles is that the master
2492 * becomes UNKNOWN if *either* the original or
2493 * the new master dies. all UNKNOWN lockreses
2494 * are sent to whichever node becomes the recovery
2495 * master. the new master is responsible for
2496 * determining if there is still a master for
2497 * this lockres, or if he needs to take over
2498 * mastery. either way, this node should expect
2499 * another message to resolve this. */
2500 if (mle->master != dead_node &&
2501 mle->new_master != dead_node)
2502 continue;
2503
2504 /* if we have reached this point, this mle needs to
2505 * be removed from the list and freed. */
2506
2507 /* remove from the list early. NOTE: unlinking
2508 * list_head while in list_for_each_safe */
2509 spin_lock(&mle->spinlock);
2510 list_del_init(&mle->list);
2511 atomic_set(&mle->woken, 1);
2512 spin_unlock(&mle->spinlock);
2513 wake_up(&mle->wq);
2514
2515 mlog(0, "node %u died during migration from "
2516 "%u to %u!\n", dead_node,
2517 mle->master, mle->new_master);
2518 /* if there is a lockres associated with this
2519 * mle, find it and set its owner to UNKNOWN */
2520 res = __dlm_lookup_lockres(dlm, mle->u.name.name,
2521 mle->u.name.len);
2522 if (res) {
2523 /* unfortunately if we hit this rare case, our
2524 * lock ordering is messed. we need to drop
2525 * the master lock so that we can take the
2526 * lockres lock, meaning that we will have to
2527 * restart from the head of list. */
2528 spin_unlock(&dlm->master_lock);
2529
2530 /* move lockres onto recovery list */
2531 spin_lock(&res->spinlock);
2532 dlm_set_lockres_owner(dlm, res,
2533 DLM_LOCK_RES_OWNER_UNKNOWN);
2534 dlm_move_lockres_to_recovery_list(dlm, res);
2535 spin_unlock(&res->spinlock);
2536 dlm_lockres_put(res);
2537
2538 /* dump the mle */
2539 spin_lock(&dlm->master_lock);
2540 __dlm_put_mle(mle);
2541 spin_unlock(&dlm->master_lock);
2542
2543 /* restart */
2544 goto top;
2545 }
2546
2547 /* this may be the last reference */
2548 __dlm_put_mle(mle);
2549 }
2550 spin_unlock(&dlm->master_lock);
2551}
2552
2553
2554int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
2555 u8 old_master)
2556{
2557 struct dlm_node_iter iter;
2558 int ret = 0;
2559
2560 spin_lock(&dlm->spinlock);
2561 dlm_node_iter_init(dlm->domain_map, &iter);
2562 clear_bit(old_master, iter.node_map);
2563 clear_bit(dlm->node_num, iter.node_map);
2564 spin_unlock(&dlm->spinlock);
2565
2566 mlog(0, "now time to do a migrate request to other nodes\n");
2567 ret = dlm_do_migrate_request(dlm, res, old_master,
2568 dlm->node_num, &iter);
2569 if (ret < 0) {
2570 mlog_errno(ret);
2571 goto leave;
2572 }
2573
2574 mlog(0, "doing assert master of %.*s to all except the original node\n",
2575 res->lockname.len, res->lockname.name);
2576 /* this call now finishes out the nodemap
2577 * even if one or more nodes die */
2578 ret = dlm_do_assert_master(dlm, res->lockname.name,
2579 res->lockname.len, iter.node_map,
2580 DLM_ASSERT_MASTER_FINISH_MIGRATION);
2581 if (ret < 0) {
2582 /* no longer need to retry. all living nodes contacted. */
2583 mlog_errno(ret);
2584 ret = 0;
2585 }
2586
2587 memset(iter.node_map, 0, sizeof(iter.node_map));
2588 set_bit(old_master, iter.node_map);
2589 mlog(0, "doing assert master of %.*s back to %u\n",
2590 res->lockname.len, res->lockname.name, old_master);
2591 ret = dlm_do_assert_master(dlm, res->lockname.name,
2592 res->lockname.len, iter.node_map,
2593 DLM_ASSERT_MASTER_FINISH_MIGRATION);
2594 if (ret < 0) {
2595 mlog(0, "assert master to original master failed "
2596 "with %d.\n", ret);
2597 /* the only nonzero status here would be because of
2598 * a dead original node. we're done. */
2599 ret = 0;
2600 }
2601
2602 /* all done, set the owner, clear the flag */
2603 spin_lock(&res->spinlock);
2604 dlm_set_lockres_owner(dlm, res, dlm->node_num);
2605 res->state &= ~DLM_LOCK_RES_MIGRATING;
2606 spin_unlock(&res->spinlock);
2607 /* re-dirty it on the new master */
2608 dlm_kick_thread(dlm, res);
2609 wake_up(&res->wq);
2610leave:
2611 return ret;
2612}
2613
2614/*
2615 * LOCKRES AST REFCOUNT
2616 * this is integral to migration
2617 */
2618
2619/* for future intent to call an ast, reserve one ahead of time.
2620 * this should be called only after waiting on the lockres
2621 * with dlm_wait_on_lockres, and while still holding the
2622 * spinlock after the call. */
2623void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
2624{
2625 assert_spin_locked(&res->spinlock);
2626 if (res->state & DLM_LOCK_RES_MIGRATING) {
2627 __dlm_print_one_lock_resource(res);
2628 }
2629 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
2630
2631 atomic_inc(&res->asts_reserved);
2632}
2633
2634/*
2635 * used to drop the reserved ast, either because it went unused,
2636 * or because the ast/bast was actually called.
2637 *
2638 * also, if there is a pending migration on this lockres,
2639 * and this was the last pending ast on the lockres,
2640 * atomically set the MIGRATING flag before we drop the lock.
2641 * this is how we ensure that migration can proceed with no
2642 * asts in progress. note that it is ok if the state of the
2643 * queues is such that a lock should be granted in the future
2644 * or that a bast should be fired, because the new master will
2645 * shuffle the lists on this lockres as soon as it is migrated.
2646 */
2647void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
2648 struct dlm_lock_resource *res)
2649{
2650 if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
2651 return;
2652
2653 if (!res->migration_pending) {
2654 spin_unlock(&res->spinlock);
2655 return;
2656 }
2657
2658 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
2659 res->migration_pending = 0;
2660 res->state |= DLM_LOCK_RES_MIGRATING;
2661 spin_unlock(&res->spinlock);
2662 wake_up(&res->wq);
2663 wake_up(&dlm->migration_wq);
2664}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
new file mode 100644
index 000000000000..0c8eb1093f00
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -0,0 +1,2132 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmrecovery.c
5 *
6 * recovery stuff
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/timer.h>
41#include <linux/kthread.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdomain.h"
51
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
53#include "cluster/masklog.h"
54
55static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
56
57static int dlm_recovery_thread(void *data);
58void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
59int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
60static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
61static int dlm_do_recovery(struct dlm_ctxt *dlm);
62
63static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
64static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
65static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
66static int dlm_request_all_locks(struct dlm_ctxt *dlm,
67 u8 request_from, u8 dead_node);
68static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
69
70static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
71static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
72 const char *lockname, int namelen,
73 int total_locks, u64 cookie,
74 u8 flags, u8 master);
75static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
76 struct dlm_migratable_lockres *mres,
77 u8 send_to,
78 struct dlm_lock_resource *res,
79 int total_locks);
80static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
81 struct dlm_lock_resource *res,
82 u8 *real_master);
83static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
84 struct dlm_lock_resource *res,
85 struct dlm_migratable_lockres *mres);
86static int dlm_do_master_requery(struct dlm_ctxt *dlm,
87 struct dlm_lock_resource *res,
88 u8 nodenum, u8 *real_master);
89static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
90static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
91 u8 dead_node, u8 send_to);
92static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
93static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
94 struct list_head *list, u8 dead_node);
95static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
96 u8 dead_node, u8 new_master);
97static void dlm_reco_ast(void *astdata);
98static void dlm_reco_bast(void *astdata, int blocked_type);
99static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
100static void dlm_request_all_locks_worker(struct dlm_work_item *item,
101 void *data);
102static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
103
104static u64 dlm_get_next_mig_cookie(void);
105
106static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
107static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
108static u64 dlm_mig_cookie = 1;
109
110static u64 dlm_get_next_mig_cookie(void)
111{
112 u64 c;
113 spin_lock(&dlm_mig_cookie_lock);
114 c = dlm_mig_cookie;
115 if (dlm_mig_cookie == (~0ULL))
116 dlm_mig_cookie = 1;
117 else
118 dlm_mig_cookie++;
119 spin_unlock(&dlm_mig_cookie_lock);
120 return c;
121}
122
123static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
124{
125 spin_lock(&dlm->spinlock);
126 clear_bit(dlm->reco.dead_node, dlm->recovery_map);
127 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
128 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
129 spin_unlock(&dlm->spinlock);
130}
131
132/* Worker function used during recovery. */
133void dlm_dispatch_work(void *data)
134{
135 struct dlm_ctxt *dlm = (struct dlm_ctxt *)data;
136 LIST_HEAD(tmp_list);
137 struct list_head *iter, *iter2;
138 struct dlm_work_item *item;
139 dlm_workfunc_t *workfunc;
140
141 spin_lock(&dlm->work_lock);
142 list_splice_init(&dlm->work_list, &tmp_list);
143 spin_unlock(&dlm->work_lock);
144
145 list_for_each_safe(iter, iter2, &tmp_list) {
146 item = list_entry(iter, struct dlm_work_item, list);
147 workfunc = item->func;
148 list_del_init(&item->list);
149
150 /* already have ref on dlm to avoid having
151 * it disappear. just double-check. */
152 BUG_ON(item->dlm != dlm);
153
154 /* this is allowed to sleep and
155 * call network stuff */
156 workfunc(item, item->data);
157
158 dlm_put(dlm);
159 kfree(item);
160 }
161}
162
163/*
164 * RECOVERY THREAD
165 */
166
167static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
168{
169 /* wake the recovery thread
170 * this will wake the reco thread in one of three places
171 * 1) sleeping with no recovery happening
172 * 2) sleeping with recovery mastered elsewhere
173 * 3) recovery mastered here, waiting on reco data */
174
175 wake_up(&dlm->dlm_reco_thread_wq);
176}
177
178/* Launch the recovery thread */
179int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
180{
181 mlog(0, "starting dlm recovery thread...\n");
182
183 dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
184 "dlm_reco_thread");
185 if (IS_ERR(dlm->dlm_reco_thread_task)) {
186 mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
187 dlm->dlm_reco_thread_task = NULL;
188 return -EINVAL;
189 }
190
191 return 0;
192}
193
194void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
195{
196 if (dlm->dlm_reco_thread_task) {
197 mlog(0, "waiting for dlm recovery thread to exit\n");
198 kthread_stop(dlm->dlm_reco_thread_task);
199 dlm->dlm_reco_thread_task = NULL;
200 }
201}
202
203
204
205/*
206 * this is lame, but here's how recovery works...
207 * 1) all recovery threads cluster wide will work on recovering
208 * ONE node at a time
209 * 2) negotiate who will take over all the locks for the dead node.
210 *    that's right... ALL the locks.
211 * 3) once a new master is chosen, everyone scans all locks
212 * and moves aside those mastered by the dead guy
213 * 4) each of these locks should be locked until recovery is done
214 * 5) the new master collects up all of the secondary lock queue info
215 * one lock at a time, forcing each node to communicate back
216 * before continuing
217 * 6) each secondary lock queue responds with the full known lock info
218 * 7) once the new master has run all its locks, it sends an ALLDONE!
219 * message to everyone
220 * 8) upon receiving this message, the secondary queue node unlocks
221 * and responds to the ALLDONE
222 * 9) once the new master gets responses from everyone, he unlocks
223 * everything and recovery for this dead node is done
224 *10) go back to 2) while there are still dead nodes
225 *
226 */
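/*
 * Editorial summary, not part of the original source: the per-node
 * bookkeeping behind steps 5-8 above uses the DLM_RECO_NODE_DATA_*
 * states seen in dlm_remaster_locks() and dlm_reco_data_done_handler()
 * below. The usual progression on the recovery master is:
 *
 *   INIT -> REQUESTING -> REQUESTED -> RECEIVING -> DONE
 *
 * with DONE -> FINALIZE_SENT once the finalize message goes out
 * (handled outside this excerpt), and DEAD reachable from any
 * intermediate state, which restarts recovery with -EAGAIN.
 */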
227
228
229#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
230
231static int dlm_recovery_thread(void *data)
232{
233 int status;
234 struct dlm_ctxt *dlm = data;
235 unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);
236
237 mlog(0, "dlm thread running for %s...\n", dlm->name);
238
239 while (!kthread_should_stop()) {
240 if (dlm_joined(dlm)) {
241 status = dlm_do_recovery(dlm);
242 if (status == -EAGAIN) {
243 /* do not sleep, recheck immediately. */
244 continue;
245 }
246 if (status < 0)
247 mlog_errno(status);
248 }
249
250 wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
251 kthread_should_stop(),
252 timeout);
253 }
254
255 mlog(0, "quitting DLM recovery thread\n");
256 return 0;
257}
258
259/* callers of the top-level api calls (dlmlock/dlmunlock) should
260 * block on the dlm->reco.event when recovery is in progress.
261 * the dlm recovery thread will set this state when it begins
262 * recovering a dead node (as the new master or not) and clear
263 * the state and wake as soon as all affected lock resources have
264 * been marked with the RECOVERY flag */
265static int dlm_in_recovery(struct dlm_ctxt *dlm)
266{
267 int in_recovery;
268 spin_lock(&dlm->spinlock);
269 in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
270 spin_unlock(&dlm->spinlock);
271 return in_recovery;
272}
273
274
275void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
276{
277 wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
278}
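/*
 * Editorial sketch of the caller contract described above
 * (hypothetical, not from the original source): top-level lock paths
 * block here until the affected lock resources have been marked with
 * the RECOVERY flag and the reco thread clears DLM_RECO_STATE_ACTIVE.
 */
static void lock_entry_sketch(struct dlm_ctxt *dlm)
{
	dlm_wait_for_recovery(dlm);
	/* ... now safe to proceed with dlmlock()/dlmunlock() ... */
}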
279
280static void dlm_begin_recovery(struct dlm_ctxt *dlm)
281{
282 spin_lock(&dlm->spinlock);
283 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
284 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
285 spin_unlock(&dlm->spinlock);
286}
287
288static void dlm_end_recovery(struct dlm_ctxt *dlm)
289{
290 spin_lock(&dlm->spinlock);
291 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
292 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
293 spin_unlock(&dlm->spinlock);
294 wake_up(&dlm->reco.event);
295}
296
297static int dlm_do_recovery(struct dlm_ctxt *dlm)
298{
299 int status = 0;
300
301 spin_lock(&dlm->spinlock);
302
303 /* check to see if the new master has died */
304 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
305 test_bit(dlm->reco.new_master, dlm->recovery_map)) {
306 mlog(0, "new master %u died while recovering %u!\n",
307 dlm->reco.new_master, dlm->reco.dead_node);
308 /* unset the new_master, leave dead_node */
309 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
310 }
311
312 /* select a target to recover */
313 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
314 int bit;
315
316 bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
317 if (bit >= O2NM_MAX_NODES || bit < 0)
318 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
319 else
320 dlm->reco.dead_node = bit;
321 } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
322 /* BUG? */
323 mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
324 dlm->reco.dead_node);
325 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
326 }
327
328 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
329 // mlog(0, "nothing to recover! sleeping now!\n");
330 spin_unlock(&dlm->spinlock);
331 /* return to main thread loop and sleep. */
332 return 0;
333 }
334 mlog(0, "recovery thread found node %u in the recovery map!\n",
335 dlm->reco.dead_node);
336 spin_unlock(&dlm->spinlock);
337
338 /* take write barrier */
339 /* (stops the list reshuffling thread, proxy ast handling) */
340 dlm_begin_recovery(dlm);
341
342 if (dlm->reco.new_master == dlm->node_num)
343 goto master_here;
344
345 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
346 /* choose a new master */
347 if (!dlm_pick_recovery_master(dlm)) {
348 /* already notified everyone. go. */
349 dlm->reco.new_master = dlm->node_num;
350 goto master_here;
351 }
352 mlog(0, "another node will master this recovery session.\n");
353 }
354 mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
355 dlm->name, dlm->reco.new_master,
356 dlm->node_num, dlm->reco.dead_node);
357
358 /* it is safe to start everything back up here
359 * because all of the dead node's lock resources
360 * have been marked as in-recovery */
361 dlm_end_recovery(dlm);
362
363 /* sleep out in main dlm_recovery_thread loop. */
364 return 0;
365
366master_here:
367 mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
368 dlm->name, dlm->reco.dead_node, dlm->node_num);
369
370 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
371 if (status < 0) {
372 mlog(ML_ERROR, "error %d remastering locks for node %u, "
373 "retrying.\n", status, dlm->reco.dead_node);
374 } else {
375 /* success! see if any other nodes need recovery */
376 dlm_reset_recovery(dlm);
377 }
378 dlm_end_recovery(dlm);
379
380 /* continue and look for another dead node */
381 return -EAGAIN;
382}
383
384static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
385{
386 int status = 0;
387 struct dlm_reco_node_data *ndata;
388 struct list_head *iter;
389 int all_nodes_done;
390 int destroy = 0;
391 int pass = 0;
392
393 status = dlm_init_recovery_area(dlm, dead_node);
394 if (status < 0)
395 goto leave;
396
397 /* safe to access the node data list without a lock, since this
398 * process is the only one to change the list */
399 list_for_each(iter, &dlm->reco.node_data) {
400 ndata = list_entry (iter, struct dlm_reco_node_data, list);
401 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
402 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
403
404 mlog(0, "requesting lock info from node %u\n",
405 ndata->node_num);
406
407 if (ndata->node_num == dlm->node_num) {
408 ndata->state = DLM_RECO_NODE_DATA_DONE;
409 continue;
410 }
411
412 status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
413 if (status < 0) {
414 mlog_errno(status);
415 if (dlm_is_host_down(status))
416 ndata->state = DLM_RECO_NODE_DATA_DEAD;
417 else {
418 destroy = 1;
419 goto leave;
420 }
421 }
422
423 switch (ndata->state) {
424 case DLM_RECO_NODE_DATA_INIT:
425 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
426 case DLM_RECO_NODE_DATA_REQUESTED:
427 BUG();
428 break;
429 case DLM_RECO_NODE_DATA_DEAD:
430 mlog(0, "node %u died after requesting "
431 "recovery info for node %u\n",
432 ndata->node_num, dead_node);
433 // start all over
434 destroy = 1;
435 status = -EAGAIN;
436 goto leave;
437 case DLM_RECO_NODE_DATA_REQUESTING:
438 ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
439 mlog(0, "now receiving recovery data from "
440 "node %u for dead node %u\n",
441 ndata->node_num, dead_node);
442 break;
443 case DLM_RECO_NODE_DATA_RECEIVING:
444 mlog(0, "already receiving recovery data from "
445 "node %u for dead node %u\n",
446 ndata->node_num, dead_node);
447 break;
448 case DLM_RECO_NODE_DATA_DONE:
449 mlog(0, "already DONE receiving recovery data "
450 "from node %u for dead node %u\n",
451 ndata->node_num, dead_node);
452 break;
453 }
454 }
455
456 mlog(0, "done requesting all lock info\n");
457
458 /* nodes should be sending reco data now
459 * just need to wait */
460
461 while (1) {
462 /* check all the nodes now to see if we are
463 * done, or if anyone died */
464 all_nodes_done = 1;
465 spin_lock(&dlm_reco_state_lock);
466 list_for_each(iter, &dlm->reco.node_data) {
467 ndata = list_entry (iter, struct dlm_reco_node_data, list);
468
469 mlog(0, "checking recovery state of node %u\n",
470 ndata->node_num);
471 switch (ndata->state) {
472 case DLM_RECO_NODE_DATA_INIT:
473 case DLM_RECO_NODE_DATA_REQUESTING:
474 mlog(ML_ERROR, "bad ndata state for "
475 "node %u: state=%d\n",
476 ndata->node_num, ndata->state);
477 BUG();
478 break;
479 case DLM_RECO_NODE_DATA_DEAD:
480 mlog(0, "node %u died after "
481 "requesting recovery info for "
482 "node %u\n", ndata->node_num,
483 dead_node);
484 spin_unlock(&dlm_reco_state_lock);
485 // start all over
486 destroy = 1;
487 status = -EAGAIN;
488 goto leave;
489 case DLM_RECO_NODE_DATA_RECEIVING:
490 case DLM_RECO_NODE_DATA_REQUESTED:
491 all_nodes_done = 0;
492 break;
493 case DLM_RECO_NODE_DATA_DONE:
494 break;
495 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
496 break;
497 }
498 }
499 spin_unlock(&dlm_reco_state_lock);
500
501 mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
502 all_nodes_done?"yes":"no");
503 if (all_nodes_done) {
504 int ret;
505
506 /* all nodes are now in DLM_RECO_NODE_DATA_DONE state
507 * just send a finalize message to everyone and
508 * clean up */
509 mlog(0, "all nodes are done! send finalize\n");
510 ret = dlm_send_finalize_reco_message(dlm);
511 if (ret < 0)
512 mlog_errno(ret);
513
514 spin_lock(&dlm->spinlock);
515 dlm_finish_local_lockres_recovery(dlm, dead_node,
516 dlm->node_num);
517 spin_unlock(&dlm->spinlock);
518 mlog(0, "should be done with recovery!\n");
519
520 mlog(0, "finishing recovery of %s at %lu, "
521 "dead=%u, this=%u, new=%u\n", dlm->name,
522 jiffies, dlm->reco.dead_node,
523 dlm->node_num, dlm->reco.new_master);
524 destroy = 1;
525 status = ret;
526 /* rescan everything marked dirty along the way */
527 dlm_kick_thread(dlm, NULL);
528 break;
529 }
530 /* wait to be signalled, with periodic timeout
531 * to check for node death */
532 wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
533 kthread_should_stop(),
534 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
535
536 }
537
538leave:
539 if (destroy)
540 dlm_destroy_recovery_area(dlm, dead_node);
541
542 mlog_exit(status);
543 return status;
544}
545
546static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
547{
548 int num=0;
549 struct dlm_reco_node_data *ndata;
550
551 spin_lock(&dlm->spinlock);
552 memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
553 /* nodes can only be removed (by dying) after dropping
554 * this lock, and death will be trapped later, so this should do */
555 spin_unlock(&dlm->spinlock);
556
557 while (1) {
558 num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
559 if (num >= O2NM_MAX_NODES) {
560 break;
561 }
562 BUG_ON(num == dead_node);
563
564 ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
565 if (!ndata) {
566 dlm_destroy_recovery_area(dlm, dead_node);
567 return -ENOMEM;
568 }
569 ndata->node_num = num;
570 ndata->state = DLM_RECO_NODE_DATA_INIT;
571 spin_lock(&dlm_reco_state_lock);
572 list_add_tail(&ndata->list, &dlm->reco.node_data);
573 spin_unlock(&dlm_reco_state_lock);
574 num++;
575 }
576
577 return 0;
578}
579
580static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
581{
582 struct list_head *iter, *iter2;
583 struct dlm_reco_node_data *ndata;
584 LIST_HEAD(tmplist);
585
586 spin_lock(&dlm_reco_state_lock);
587 list_splice_init(&dlm->reco.node_data, &tmplist);
588 spin_unlock(&dlm_reco_state_lock);
589
590 list_for_each_safe(iter, iter2, &tmplist) {
591 ndata = list_entry (iter, struct dlm_reco_node_data, list);
592 list_del_init(&ndata->list);
593 kfree(ndata);
594 }
595}
596
597static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
598 u8 dead_node)
599{
600 struct dlm_lock_request lr;
601 enum dlm_status ret;
602
603 mlog(0, "\n");
604
605
606 mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
607 "to %u\n", dead_node, request_from);
608
609 memset(&lr, 0, sizeof(lr));
610 lr.node_idx = dlm->node_num;
611 lr.dead_node = dead_node;
612
613 // send message
614 ret = DLM_NOLOCKMGR;
615 ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
616 &lr, sizeof(lr), request_from, NULL);
617
618 /* negative status is handled by caller */
619 if (ret < 0)
620 mlog_errno(ret);
621
622 // return from here, then
623 // sleep until all received or error
624 return ret;
625
626}
627
628int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
629{
630 struct dlm_ctxt *dlm = data;
631 struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
632 char *buf = NULL;
633 struct dlm_work_item *item = NULL;
634
635 if (!dlm_grab(dlm))
636 return -EINVAL;
637
638 BUG_ON(lr->dead_node != dlm->reco.dead_node);
639
640 item = kcalloc(1, sizeof(*item), GFP_KERNEL);
641 if (!item) {
642 dlm_put(dlm);
643 return -ENOMEM;
644 }
645
646 /* this will get freed by dlm_request_all_locks_worker */
647 buf = (char *) __get_free_page(GFP_KERNEL);
648 if (!buf) {
649 kfree(item);
650 dlm_put(dlm);
651 return -ENOMEM;
652 }
653
654 /* queue up work for dlm_request_all_locks_worker */
655 dlm_grab(dlm); /* get an extra ref for the work item */
656 dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
657 item->u.ral.reco_master = lr->node_idx;
658 item->u.ral.dead_node = lr->dead_node;
659 spin_lock(&dlm->work_lock);
660 list_add_tail(&item->list, &dlm->work_list);
661 spin_unlock(&dlm->work_lock);
662 schedule_work(&dlm->dispatched_work);
663
664 dlm_put(dlm);
665 return 0;
666}
667
668static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
669{
670 struct dlm_migratable_lockres *mres;
671 struct dlm_lock_resource *res;
672 struct dlm_ctxt *dlm;
673 LIST_HEAD(resources);
674 struct list_head *iter;
675 int ret;
676 u8 dead_node, reco_master;
677
678 dlm = item->dlm;
679 dead_node = item->u.ral.dead_node;
680 reco_master = item->u.ral.reco_master;
681 BUG_ON(dead_node != dlm->reco.dead_node);
682 BUG_ON(reco_master != dlm->reco.new_master);
683
684 mres = (struct dlm_migratable_lockres *)data;
685
686 /* lock resources should have already been moved to the
687 * dlm->reco.resources list. now move items from that list
688 * to a temp list if the dead owner matches. note that the
689 * whole cluster recovers only one node at a time, so we
690 * can safely move UNKNOWN lock resources for each recovery
691 * session. */
692 dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
693
694 /* now we can begin blasting lockreses without the dlm lock */
695 list_for_each(iter, &resources) {
696 res = list_entry (iter, struct dlm_lock_resource, recovering);
697 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
698 DLM_MRES_RECOVERY);
699 if (ret < 0)
700 mlog_errno(ret);
701 }
702
703 /* move the resources back to the list */
704 spin_lock(&dlm->spinlock);
705 list_splice_init(&resources, &dlm->reco.resources);
706 spin_unlock(&dlm->spinlock);
707
708 ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
709 if (ret < 0)
710 mlog_errno(ret);
711
712 free_page((unsigned long)data);
713}
714
715
716static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
717{
718 int ret, tmpret;
719 struct dlm_reco_data_done done_msg;
720
721 memset(&done_msg, 0, sizeof(done_msg));
722 done_msg.node_idx = dlm->node_num;
723 done_msg.dead_node = dead_node;
724 mlog(0, "sending DATA DONE message to %u, "
725 "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
726 done_msg.dead_node);
727
728 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
729 sizeof(done_msg), send_to, &tmpret);
730 /* negative status is ignored by the caller */
731 if (ret >= 0)
732 ret = tmpret;
733 return ret;
734}
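/*
 * Editorial sketch, not from the original source, of the two-level
 * o2net status convention used throughout this file: the return value
 * of o2net_send_message() reports transport-level errors, while the
 * remote handler's own return code comes back through the final
 * status pointer.
 */
static int send_status_sketch(struct dlm_ctxt *dlm, u32 msgtype,
			      void *msg, u32 len, u8 to)
{
	int status = 0;
	int ret = o2net_send_message(msgtype, dlm->key, msg, len, to,
				     &status);
	if (ret < 0)
		return ret;	/* message never made it */
	return status;		/* remote handler's verdict */
}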
735
736
737int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
738{
739 struct dlm_ctxt *dlm = data;
740 struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
741 struct list_head *iter;
742 struct dlm_reco_node_data *ndata = NULL;
743 int ret = -EINVAL;
744
745 if (!dlm_grab(dlm))
746 return -EINVAL;
747
748 mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
749 "node_idx=%u, this node=%u\n", done->dead_node,
750 dlm->reco.dead_node, done->node_idx, dlm->node_num);
751 BUG_ON(done->dead_node != dlm->reco.dead_node);
752
753 spin_lock(&dlm_reco_state_lock);
754 list_for_each(iter, &dlm->reco.node_data) {
755 ndata = list_entry (iter, struct dlm_reco_node_data, list);
756 if (ndata->node_num != done->node_idx)
757 continue;
758
759 switch (ndata->state) {
760 case DLM_RECO_NODE_DATA_INIT:
761 case DLM_RECO_NODE_DATA_DEAD:
762 case DLM_RECO_NODE_DATA_DONE:
763 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
764 mlog(ML_ERROR, "bad ndata state for node %u:"
765 " state=%d\n", ndata->node_num,
766 ndata->state);
767 BUG();
768 break;
769 case DLM_RECO_NODE_DATA_RECEIVING:
770 case DLM_RECO_NODE_DATA_REQUESTED:
771 case DLM_RECO_NODE_DATA_REQUESTING:
772 mlog(0, "node %u is DONE sending "
773 "recovery data!\n",
774 ndata->node_num);
775
776 ndata->state = DLM_RECO_NODE_DATA_DONE;
777 ret = 0;
778 break;
779 }
780 }
781 spin_unlock(&dlm_reco_state_lock);
782
783 /* wake the recovery thread, some node is done */
784 if (!ret)
785 dlm_kick_recovery_thread(dlm);
786
787 if (ret < 0)
788 mlog(ML_ERROR, "failed to find recovery node data for node "
789 "%u\n", done->node_idx);
790 dlm_put(dlm);
791
792 mlog(0, "leaving reco data done handler, ret=%d\n", ret);
793 return ret;
794}
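
The switch above encodes a small state machine: a DATA DONE message is legal only while this node is still waiting on the sender's data. A hedged stand-alone sketch of that rule follows; the enum values are illustrative stand-ins for the DLM_RECO_NODE_DATA_* constants.

#include <stdio.h>

enum reco_state { INIT, REQUESTING, REQUESTED, RECEIVING,
		  DONE, FINALIZE_SENT, DEAD };

static const char *name[] = { "INIT", "REQUESTING", "REQUESTED",
			      "RECEIVING", "DONE", "FINALIZE_SENT", "DEAD" };

/* DATA DONE is only acceptable while we still await this node's data */
static int data_done_legal(int s)
{
	return s == REQUESTING || s == REQUESTED || s == RECEIVING;
}

int main(void)
{
	int s;

	for (s = INIT; s <= DEAD; s++)
		printf("%-13s -> DONE: %s\n", name[s],
		       data_done_legal(s) ? "ok" : "would BUG()");
	return 0;
}
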
795
796static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
797 struct list_head *list,
798 u8 dead_node)
799{
800 struct dlm_lock_resource *res;
801 struct list_head *iter, *iter2;
802
803 spin_lock(&dlm->spinlock);
804 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
805 res = list_entry(iter, struct dlm_lock_resource, recovering);
806 if (dlm_is_recovery_lock(res->lockname.name,
807 res->lockname.len))
808 continue;
809 if (res->owner == dead_node) {
810 mlog(0, "found lockres owned by dead node while "
811 "doing recovery for node %u. sending it.\n",
812 dead_node);
813 list_del_init(&res->recovering);
814 list_add_tail(&res->recovering, list);
815 } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
816 mlog(0, "found UNKNOWN owner while doing recovery "
817 "for node %u. sending it.\n", dead_node);
818 list_del_init(&res->recovering);
819 list_add_tail(&res->recovering, list);
820 }
821 }
822 spin_unlock(&dlm->spinlock);
823}
824
825static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
826{
827 int total_locks = 0;
828 struct list_head *iter, *queue = &res->granted;
829 int i;
830
831 for (i = 0; i < 3; i++) {
832 list_for_each(iter, queue)
833 total_locks++;
834 queue++;
835 }
836 return total_locks;
837}
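
A note on the queue++ walk above: it relies on struct layout rather than an explicit array, and works only if the granted, converting and blocked list heads are declared back-to-back in struct dlm_lock_resource. A small user-space sketch of the idea, with a made-up structure standing in for the real one:

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

struct fake_lockres {
	struct list_head granted;	/* queue index 0 */
	struct list_head converting;	/* queue index 1 */
	struct list_head blocked;	/* queue index 2 */
};

int main(void)
{
	struct fake_lockres res;
	struct list_head *queue = &res.granted;
	int i;

	/* queue++ steps across the adjacent members, one queue at a time */
	for (i = 0; i < 3; i++, queue++)
		printf("queue %d sits at byte offset %ld\n", i,
		       (long)((char *)queue - (char *)&res));
	return 0;
}
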
838
839
840static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
841 struct dlm_migratable_lockres *mres,
842 u8 send_to,
843 struct dlm_lock_resource *res,
844 int total_locks)
845{
846 u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
847 int mres_total_locks = be32_to_cpu(mres->total_locks);
848 int sz, ret = 0, status = 0;
849 u8 orig_flags = mres->flags,
850 orig_master = mres->master;
851
852 BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
853 if (!mres->num_locks)
854 return 0;
855
856 sz = sizeof(struct dlm_migratable_lockres) +
857 (mres->num_locks * sizeof(struct dlm_migratable_lock));
858
859 /* add an all-done flag if we reached the last lock
860 * (orig_flags already holds the pre-send flags) */
861 BUG_ON(total_locks > mres_total_locks);
862 if (total_locks == mres_total_locks)
863 mres->flags |= DLM_MRES_ALL_DONE;
864
865 /* send it */
866 ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
867 sz, send_to, &status);
868 if (ret < 0) {
869 /* XXX: negative status is not handled.
870 * this will end up killing this node. */
871 mlog_errno(ret);
872 } else {
873 /* might get an -ENOMEM back here */
874 ret = status;
875 if (ret < 0) {
876 mlog_errno(ret);
877
878 if (ret == -EFAULT) {
879 mlog(ML_ERROR, "node %u told me to kill "
880 "myself!\n", send_to);
881 BUG();
882 }
883 }
884 }
885
886 /* zero and reinit the message buffer */
887 dlm_init_migratable_lockres(mres, res->lockname.name,
888 res->lockname.len, mres_total_locks,
889 mig_cookie, orig_flags, orig_master);
890 return ret;
891}
892
893static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
894 const char *lockname, int namelen,
895 int total_locks, u64 cookie,
896 u8 flags, u8 master)
897{
898 /* mres here is one full page */
899 memset(mres, 0, PAGE_SIZE);
900 mres->lockname_len = namelen;
901 memcpy(mres->lockname, lockname, namelen);
902 mres->num_locks = 0;
903 mres->total_locks = cpu_to_be32(total_locks);
904 mres->mig_cookie = cpu_to_be64(cookie);
905 mres->flags = flags;
906 mres->master = master;
907}
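
The cpu_to_be32()/cpu_to_be64() calls above fix the wire format as big-endian so that nodes of different endianness agree on the header fields. A hedged user-space sketch using glibc's htobe32()/htobe64() as the nearest analogues (this assumes a glibc <endian.h>):

#define _DEFAULT_SOURCE	/* for htobe32()/htobe64() on glibc */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t total_locks = 5;
	uint64_t cookie = 0x0102030405060708ULL;

	/* on a little-endian host both values print byte-swapped,
	 * which is exactly what lands on the wire */
	printf("wire total_locks: %08lx\n",
	       (unsigned long)htobe32(total_locks));
	printf("wire cookie:      %016llx\n",
	       (unsigned long long)htobe64(cookie));
	return 0;
}
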
908
909
910/* returns 1 if this lock fills the network structure,
911 * 0 otherwise */
912static int dlm_add_lock_to_array(struct dlm_lock *lock,
913 struct dlm_migratable_lockres *mres, int queue)
914{
915 struct dlm_migratable_lock *ml;
916 int lock_num = mres->num_locks;
917
918 ml = &(mres->ml[lock_num]);
919 ml->cookie = lock->ml.cookie;
920 ml->type = lock->ml.type;
921 ml->convert_type = lock->ml.convert_type;
922 ml->highest_blocked = lock->ml.highest_blocked;
923 ml->list = queue;
924 if (lock->lksb) {
925 ml->flags = lock->lksb->flags;
926 /* send our current lvb */
927 if (ml->type == LKM_EXMODE ||
928 ml->type == LKM_PRMODE) {
929 /* if it is already set, this had better be a PR
930 * and it has to match */
931 if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
932 memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
933 mlog(ML_ERROR, "mismatched lvbs!\n");
934 __dlm_print_one_lock_resource(lock->lockres);
935 BUG();
936 }
937 memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
938 }
939 }
940 ml->node = lock->ml.node;
941 mres->num_locks++;
942 /* we reached the max, send this network message */
943 if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
944 return 1;
945 return 0;
946}
947
948
949int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
950 struct dlm_migratable_lockres *mres,
951 u8 send_to, u8 flags)
952{
953 struct list_head *queue, *iter;
954 int total_locks, i;
955 u64 mig_cookie = 0;
956 struct dlm_lock *lock;
957 int ret = 0;
958
959 BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
960
961 mlog(0, "sending to %u\n", send_to);
962
963 total_locks = dlm_num_locks_in_lockres(res);
964 if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
965 /* rare, but possible */
966 mlog(0, "argh. lockres has %d locks. this will "
967 "require more than one network packet to "
968 "migrate\n", total_locks);
969 mig_cookie = dlm_get_next_mig_cookie();
970 }
971
972 dlm_init_migratable_lockres(mres, res->lockname.name,
973 res->lockname.len, total_locks,
974 mig_cookie, flags, res->owner);
975
976 total_locks = 0;
977 for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {
978 queue = dlm_list_idx_to_ptr(res, i);
979 list_for_each(iter, queue) {
980 lock = list_entry(iter, struct dlm_lock, list);
981
982 /* add another lock. */
983 total_locks++;
984 if (!dlm_add_lock_to_array(lock, mres, i))
985 continue;
986
987 /* this filled the lock message,
988 * we must send it immediately. */
989 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
990 res, total_locks);
991 if (ret < 0) {
992 /* TODO */
993 mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
994 "returned %d, TODO\n", ret);
995 BUG();
996 }
997 }
998 }
999 /* flush any remaining locks */
1000 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
1001 if (ret < 0) {
1002 /* TODO */
1003 mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
1004 "TODO\n", ret);
1005 BUG();
1006 }
1007 return ret;
1008}
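
The send loop above is a fill-and-flush batcher: locks accumulate in the page-sized message, the message goes out whenever dlm_add_lock_to_array() reports it full, and one final send flushes the remainder (the real dlm_send_mig_lockres_msg() returns early when num_locks is zero, so an empty tail flush is harmless). A minimal user-space sketch of the same shape; the names and batch size are invented:

#include <stdio.h>

#define BATCH_MAX 3	/* stand-in for DLM_MAX_MIGRATABLE_LOCKS */

static int batch[BATCH_MAX];
static int batch_len;

/* stand-in for dlm_send_mig_lockres_msg() */
static void flush_batch(void)
{
	int i;

	if (!batch_len)		/* empty tail flush is a no-op */
		return;
	printf("sending %d entries:", batch_len);
	for (i = 0; i < batch_len; i++)
		printf(" %d", batch[i]);
	printf("\n");
	batch_len = 0;
}

/* stand-in for dlm_add_lock_to_array(): returns 1 when the buffer fills */
static int add_to_batch(int v)
{
	batch[batch_len++] = v;
	return batch_len == BATCH_MAX;
}

int main(void)
{
	int i;

	for (i = 1; i <= 7; i++)
		if (add_to_batch(i))
			flush_batch();
	flush_batch();		/* flush any remaining entries */
	return 0;
}
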
1009
1010
1011
1012/*
1013 * this message will contain no more than one page worth of
1014 * recovery data, and it will work on only one lockres.
1015 * there may be many locks in this page, and we may need to wait
1016 * for additional packets to complete all the locks (rare, but
1017 * possible).
1018 */
1019/*
1020 * NOTE: the allocation error cases here are scary.
1021 * we really cannot afford to fail an alloc in recovery;
1022 * do we spin? returning an error only delays the problem.
1023 */
1024
1025int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
1026{
1027 struct dlm_ctxt *dlm = data;
1028 struct dlm_migratable_lockres *mres =
1029 (struct dlm_migratable_lockres *)msg->buf;
1030 int ret = 0;
1031 u8 real_master;
1032 char *buf = NULL;
1033 struct dlm_work_item *item = NULL;
1034 struct dlm_lock_resource *res = NULL;
1035
1036 if (!dlm_grab(dlm))
1037 return -EINVAL;
1038
1039 BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
1040
1041 real_master = mres->master;
1042 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1043 /* cannot migrate a lockres with no master */
1044 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
1045 }
1046
1047 mlog(0, "%s message received from node %u\n",
1048 (mres->flags & DLM_MRES_RECOVERY) ?
1049 "recovery" : "migration", mres->master);
1050 if (mres->flags & DLM_MRES_ALL_DONE)
1051 mlog(0, "all done flag. all lockres data received!\n");
1052
1053 ret = -ENOMEM;
1054 buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
1055 item = kcalloc(1, sizeof(*item), GFP_KERNEL);
1056 if (!buf || !item)
1057 goto leave;
1058
1059 /* lookup the lock to see if we have a secondary queue for this
1060 * already... just add the locks in and this will have its owner
1061 * and RECOVERY flag changed when it completes. */
1062 res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
1063 if (res) {
1064 /* this will get a ref on res */
1065 /* mark it as recovering/migrating and hash it */
1066 spin_lock(&res->spinlock);
1067 if (mres->flags & DLM_MRES_RECOVERY) {
1068 res->state |= DLM_LOCK_RES_RECOVERING;
1069 } else {
1070 if (res->state & DLM_LOCK_RES_MIGRATING) {
1071 /* this is at least the second
1072 * lockres message */
1073 mlog(0, "lock %.*s is already migrating\n",
1074 mres->lockname_len,
1075 mres->lockname);
1076 } else if (res->state & DLM_LOCK_RES_RECOVERING) {
1077 /* caller should BUG */
1078 mlog(ML_ERROR, "node is attempting to migrate "
1079 "lock %.*s, but marked as recovering!\n",
1080 mres->lockname_len, mres->lockname);
1081 ret = -EFAULT;
1082 spin_unlock(&res->spinlock);
1083 goto leave;
1084 }
1085 res->state |= DLM_LOCK_RES_MIGRATING;
1086 }
1087 spin_unlock(&res->spinlock);
1088 } else {
1089 /* need to allocate, just like if it was
1090 * mastered here normally */
1091 res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
1092 if (!res)
1093 goto leave;
1094
1095 /* to match the ref that we would have gotten if
1096 * dlm_lookup_lockres had succeeded */
1097 dlm_lockres_get(res);
1098
1099 /* mark it as recovering/migrating and hash it */
1100 if (mres->flags & DLM_MRES_RECOVERY)
1101 res->state |= DLM_LOCK_RES_RECOVERING;
1102 else
1103 res->state |= DLM_LOCK_RES_MIGRATING;
1104
1105 spin_lock(&dlm->spinlock);
1106 __dlm_insert_lockres(dlm, res);
1107 spin_unlock(&dlm->spinlock);
1108
1109 /* now that the new lockres is inserted,
1110 * make it usable by other processes */
1111 spin_lock(&res->spinlock);
1112 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
1113 spin_unlock(&res->spinlock);
1114
1115 /* add an extra ref for just-allocated lockres
1116 * otherwise the lockres will be purged immediately */
1117 dlm_lockres_get(res);
1118
1119 }
1120
1121 /* at this point we have allocated everything we need,
1122 * and we have a hashed lockres with an extra ref and
1123 * the proper res->state flags. */
1124 ret = 0;
1125 if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1126 /* migration cannot have an unknown master */
1127 BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
1128 mlog(0, "recovery has passed me a lockres with an "
1129 "unknown owner.. will need to requery: "
1130 "%.*s\n", mres->lockname_len, mres->lockname);
1131 } else {
1132 spin_lock(&res->spinlock);
1133 dlm_change_lockres_owner(dlm, res, dlm->node_num);
1134 spin_unlock(&res->spinlock);
1135 }
1136
1137 /* queue up work for dlm_mig_lockres_worker */
1138 dlm_grab(dlm); /* get an extra ref for the work item */
1139 memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */
1140 dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
1141 item->u.ml.lockres = res; /* already have a ref */
1142 item->u.ml.real_master = real_master;
1143 spin_lock(&dlm->work_lock);
1144 list_add_tail(&item->list, &dlm->work_list);
1145 spin_unlock(&dlm->work_lock);
1146 schedule_work(&dlm->dispatched_work);
1147
1148leave:
1149 dlm_put(dlm);
1150 if (ret < 0) {
1151 /* kfree() handles NULL pointers, so there is
1152 * no need to check buf or item first */
1153 kfree(buf);
1154 kfree(item);
1155 }
1156
1157 mlog_exit(ret);
1158 return ret;
1159}
1160
1161
1162static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
1163{
1164 struct dlm_ctxt *dlm;
1165 struct dlm_migratable_lockres *mres;
1166 int ret = 0;
1167 struct dlm_lock_resource *res;
1168 u8 real_master;
1169
1170 dlm = item->dlm;
1171 mres = (struct dlm_migratable_lockres *)data;
1172
1173 res = item->u.ml.lockres;
1174 real_master = item->u.ml.real_master;
1175
1176 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1177 /* this case is super-rare. only occurs if
1178 * node death happens during migration. */
1179again:
1180 ret = dlm_lockres_master_requery(dlm, res, &real_master);
1181 if (ret < 0) {
1182 mlog(0, "dlm_lockres_master_requery failure: %d\n",
1183 ret);
1184 goto again;
1185 }
1186 if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1187 mlog(0, "lockres %.*s not claimed. "
1188 "this node will take it.\n",
1189 res->lockname.len, res->lockname.name);
1190 } else {
1191 mlog(0, "master needs to respond to sender "
1192 "that node %u still owns %.*s\n",
1193 real_master, res->lockname.len,
1194 res->lockname.name);
1195 /* cannot touch this lockres */
1196 goto leave;
1197 }
1198 }
1199
1200 ret = dlm_process_recovery_data(dlm, res, mres);
1201 if (ret < 0)
1202 mlog(0, "dlm_process_recovery_data returned %d\n", ret);
1203 else
1204 mlog(0, "dlm_process_recovery_data succeeded\n");
1205
1206 if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
1207 (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
1208 ret = dlm_finish_migration(dlm, res, mres->master);
1209 if (ret < 0)
1210 mlog_errno(ret);
1211 }
1212
1213leave:
1214 kfree(data);
1215 mlog_exit(ret);
1216}
1217
1218
1219
1220static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
1221 struct dlm_lock_resource *res,
1222 u8 *real_master)
1223{
1224 struct dlm_node_iter iter;
1225 int nodenum;
1226 int ret = 0;
1227
1228 *real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
1229
1230 /* we only reach here if one of the two nodes in a
1231 * migration died while the migration was in progress.
1232 * at this point we need to requery the master. we
1233 * know that the new_master got as far as creating
1234 * an mle on at least one node, but we do not know
1235 * if any nodes had actually cleared the mle and set
1236 * the master to the new_master. the old master
1237 * is supposed to set the owner to UNKNOWN in the
1238 * event of a new_master death, so the only possible
1239 * responses that we can get from nodes here are
1240 * that the master is new_master, or that the master
1241 * is UNKNOWN.
1242 * if all nodes come back with UNKNOWN then we know
1243 * the lock needs remastering here.
1244 * if any node comes back with a valid master, check
1245 * to see if that master is the one that we are
1246 * recovering. if so, then the new_master died and
1247 * we need to remaster this lock. if not, then the
1248 * new_master survived and that node will respond to
1249 * other nodes about the owner.
1250 * if there is an owner, this node needs to dump this
1251 * lockres and alert the sender that this lockres
1252 * was rejected. */
1253 spin_lock(&dlm->spinlock);
1254 dlm_node_iter_init(dlm->domain_map, &iter);
1255 spin_unlock(&dlm->spinlock);
1256
1257 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
1258 /* do not send to self */
1259 if (nodenum == dlm->node_num)
1260 continue;
1261 ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
1262 if (ret < 0) {
1263 mlog_errno(ret);
1264 BUG();
1265 /* TODO: need to figure a way to restart this */
1266 }
1267 if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1268 mlog(0, "lock master is %u\n", *real_master);
1269 break;
1270 }
1271 }
1272 return ret;
1273}
1274
1275
1276static int dlm_do_master_requery(struct dlm_ctxt *dlm,
1277 struct dlm_lock_resource *res,
1278 u8 nodenum, u8 *real_master)
1279{
1280 int ret;
1281 struct dlm_master_requery req;
1282 int status = DLM_LOCK_RES_OWNER_UNKNOWN;
1283
1284 memset(&req, 0, sizeof(req));
1285 req.node_idx = dlm->node_num;
1286 req.namelen = res->lockname.len;
1287 memcpy(req.name, res->lockname.name, res->lockname.len);
1288
1289 ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
1290 &req, sizeof(req), nodenum, &status);
1291 /* XXX: negative status not handled properly here. */
1292 if (ret < 0)
1293 mlog_errno(ret);
1294 else {
1295 BUG_ON(status < 0);
1296 BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
1297 *real_master = (u8) (status & 0xff);
1298 mlog(0, "node %u responded to master requery with %u\n",
1299 nodenum, *real_master);
1300 ret = 0;
1301 }
1302 return ret;
1303}
1304
1305
1306/* this function cannot error, so unless the sending
1307 * or receiving of the message failed, the owner can
1308 * be trusted */
1309int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
1310{
1311 struct dlm_ctxt *dlm = data;
1312 struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
1313 struct dlm_lock_resource *res = NULL;
1314 int master = DLM_LOCK_RES_OWNER_UNKNOWN;
1315 u32 flags = DLM_ASSERT_MASTER_REQUERY;
1316
1317 if (!dlm_grab(dlm)) {
1318 /* since the domain has gone away on this
1319 * node, the proper response is UNKNOWN */
1320 return master;
1321 }
1322
1323 spin_lock(&dlm->spinlock);
1324 res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
1325 if (res) {
1326 spin_lock(&res->spinlock);
1327 master = res->owner;
1328 if (master == dlm->node_num) {
1329 int ret = dlm_dispatch_assert_master(dlm, res,
1330 0, 0, flags);
1331 if (ret < 0) {
1332 mlog_errno(-ENOMEM);
1333 /* retry!? */
1334 BUG();
1335 }
1336 }
1337 spin_unlock(&res->spinlock);
1338 }
1339 spin_unlock(&dlm->spinlock);
1340
1341 dlm_put(dlm);
1342 return master;
1343}
1344
1345static inline struct list_head *
1346dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
1347{
1348 struct list_head *ret;
1349 BUG_ON(list_num < 0);
1350 BUG_ON(list_num > 2);
1351 ret = &(res->granted);
1352 ret += list_num;
1353 return ret;
1354}
1355/* TODO: do ast flush business
1356 * TODO: do MIGRATING and RECOVERING spinning
1357 */
1358
1359/*
1360 * NOTE about in-flight requests during migration:
1361 *
1362 * Before attempting the migrate, the master has marked the lockres as
1363 * MIGRATING and then flushed all of its pending ASTs. So any in-flight
1364 * requests either got queued before the MIGRATING flag got set, in which
1365 * case the lock data will reflect the change and a return message is on
1366 * the way, or the request failed to get in before MIGRATING got set. In
1367 * this case, the caller will be told to spin and wait for the MIGRATING
1368 * flag to be dropped, then recheck the master.
1369 * This holds true for the convert, cancel and unlock cases, and since lvb
1370 * updates are tied to these same messages, it applies to lvb updates as
1371 * well. For the lock case, there is no way a lock can be on the master
1372 * queue and not be on the secondary queue since the lock is always added
1373 * locally first. This means that the new target node will never be sent
1374 * a lock that he doesn't already have on the list.
1375 * In total, this means that the local lock is correct and should not be
1376 * updated to match the one sent by the master. Any messages sent back
1377 * from the master before the MIGRATING flag will bring the lock properly
1378 * up-to-date, and the change will be ordered properly for the waiter.
1379 * We will *not* attempt to modify the lock underneath the waiter.
1380 */
1381
1382static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1383 struct dlm_lock_resource *res,
1384 struct dlm_migratable_lockres *mres)
1385{
1386 struct dlm_migratable_lock *ml;
1387 struct list_head *queue;
1388 struct dlm_lock *newlock = NULL;
1389 struct dlm_lockstatus *lksb = NULL;
1390 int ret = 0;
1391 int i;
1392 struct list_head *iter;
1393 struct dlm_lock *lock = NULL;
1394
1395 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1396 for (i = 0; i < mres->num_locks; i++) {
1397 ml = &(mres->ml[i]);
1398 BUG_ON(ml->highest_blocked != LKM_IVMODE);
1399 newlock = NULL;
1400 lksb = NULL;
1401
1402 queue = dlm_list_num_to_pointer(res, ml->list);
1403
1404 /* if the lock is for the local node it needs to
1405 * be moved to the proper location within the queue.
1406 * do not allocate a new lock structure. */
1407 if (ml->node == dlm->node_num) {
1408 /* MIGRATION ONLY! */
1409 BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
1410
1411 spin_lock(&res->spinlock);
1412 list_for_each(iter, queue) {
1413 lock = list_entry(iter, struct dlm_lock, list);
1414 if (lock->ml.cookie != ml->cookie)
1415 lock = NULL;
1416 else
1417 break;
1418 }
1419
1420 /* lock is always created locally first, and
1421 * destroyed locally last. it must be on the list */
1422 if (!lock) {
1423 mlog(ML_ERROR, "could not find local lock "
1424 "with cookie %"MLFu64"!\n",
1425 ml->cookie);
1426 BUG();
1427 }
1428 BUG_ON(lock->ml.node != ml->node);
1429
1430 /* see NOTE above about why we do not update
1431 * to match the master here */
1432
1433 /* move the lock to its proper place */
1434 /* do not alter lock refcount. switching lists. */
1435 list_del_init(&lock->list);
1436 list_add_tail(&lock->list, queue);
1437 spin_unlock(&res->spinlock);
1438
1439 mlog(0, "just reordered a local lock!\n");
1440 continue;
1441 }
1442
1443 /* lock is for another node. */
1444 newlock = dlm_new_lock(ml->type, ml->node,
1445 be64_to_cpu(ml->cookie), NULL);
1446 if (!newlock) {
1447 ret = -ENOMEM;
1448 goto leave;
1449 }
1450 lksb = newlock->lksb;
1451 dlm_lock_attach_lockres(newlock, res);
1452
1453 if (ml->convert_type != LKM_IVMODE) {
1454 BUG_ON(queue != &res->converting);
1455 newlock->ml.convert_type = ml->convert_type;
1456 }
1457 lksb->flags |= (ml->flags &
1458 (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
1459
1460 if (mres->lvb[0]) {
1461 if (lksb->flags & DLM_LKSB_PUT_LVB) {
1462 /* other node was trying to update
1463 * lvb when node died. recreate the
1464 * lksb with the updated lvb. */
1465 memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
1466 } else {
1467 /* otherwise, the node is sending its
1468 * most recent valid lvb info */
1469 BUG_ON(ml->type != LKM_EXMODE &&
1470 ml->type != LKM_PRMODE);
1471 if (res->lvb[0] && (ml->type == LKM_EXMODE ||
1472 memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
1473 mlog(ML_ERROR, "received bad lvb!\n");
1474 __dlm_print_one_lock_resource(res);
1475 BUG();
1476 }
1477 memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1478 }
1479 }
1480
1481
1482 /* NOTE:
1483 * wrt lock queue ordering and recovery:
1484 * 1. order of locks on granted queue is
1485 * meaningless.
1486 * 2. order of locks on converting queue is
1487 * LOST with the node death. sorry charlie.
1488 * 3. order of locks on the blocked queue is
1489 * also LOST.
1490 * order of locks does not affect integrity, it
1491 * just means that a lock request may get pushed
1492 * back in line as a result of the node death.
1493 * also note that for a given node the lock order
1494 * for its secondary queue locks is preserved
1495 * relative to each other, but clearly *not*
1496 * preserved relative to locks from other nodes.
1497 */
1498 spin_lock(&res->spinlock);
1499 dlm_lock_get(newlock);
1500 list_add_tail(&newlock->list, queue);
1501 spin_unlock(&res->spinlock);
1502 }
1503 mlog(0, "done running all the locks\n");
1504
1505leave:
1506 if (ret < 0) {
1507 mlog_errno(ret);
1508 if (newlock)
1509 dlm_lock_put(newlock);
1510 }
1511
1512 mlog_exit(ret);
1513 return ret;
1514}
1515
1516void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
1517 struct dlm_lock_resource *res)
1518{
1519 int i;
1520 struct list_head *queue, *iter, *iter2;
1521 struct dlm_lock *lock;
1522
1523 res->state |= DLM_LOCK_RES_RECOVERING;
1524 if (!list_empty(&res->recovering))
1525 list_del_init(&res->recovering);
1526 list_add_tail(&res->recovering, &dlm->reco.resources);
1527
1528 /* find any pending locks and put them back on proper list */
1529 for (i = DLM_BLOCKED_LIST; i >= DLM_GRANTED_LIST; i--) {
1530 queue = dlm_list_idx_to_ptr(res, i);
1531 list_for_each_safe(iter, iter2, queue) {
1532 lock = list_entry(iter, struct dlm_lock, list);
1533 dlm_lock_get(lock);
1534 if (lock->convert_pending) {
1535 /* move converting lock back to granted */
1536 BUG_ON(i != DLM_CONVERTING_LIST);
1537 mlog(0, "node died with convert pending "
1538 "on %.*s. move back to granted list.\n",
1539 res->lockname.len, res->lockname.name);
1540 dlm_revert_pending_convert(res, lock);
1541 lock->convert_pending = 0;
1542 } else if (lock->lock_pending) {
1543 /* remove pending lock requests completely */
1544 BUG_ON(i != DLM_BLOCKED_LIST);
1545 mlog(0, "node died with lock pending "
1546 "on %.*s. remove from blocked list and skip.\n",
1547 res->lockname.len, res->lockname.name);
1548 /* lock will be floating until ref in
1549 * dlmlock_remote is freed after the network
1550 * call returns. ok for it to not be on any
1551 * list since no ast can be called
1552 * (the master is dead). */
1553 dlm_revert_pending_lock(res, lock);
1554 lock->lock_pending = 0;
1555 } else if (lock->unlock_pending) {
1556 /* if an unlock was in progress, treat as
1557 * if this had completed successfully
1558 * before sending this lock state to the
1559 * new master. note that the dlm_unlock
1560 * call is still responsible for calling
1561 * the unlockast. that will happen after
1562 * the network call times out. for now,
1563 * just move lists to prepare the new
1564 * recovery master. */
1565 BUG_ON(i != DLM_GRANTED_LIST);
1566 mlog(0, "node died with unlock pending "
1567 "on %.*s. remove from granted list and skip.\n",
1568 res->lockname.len, res->lockname.name);
1569 dlm_commit_pending_unlock(res, lock);
1570 lock->unlock_pending = 0;
1571 } else if (lock->cancel_pending) {
1572 /* if a cancel was in progress, treat as
1573 * if this had completed successfully
1574 * before sending this lock state to the
1575 * new master */
1576 BUG_ON(i != DLM_CONVERTING_LIST);
1577 mlog(0, "node died with cancel pending "
1578 "on %.*s. move back to granted list.\n",
1579 res->lockname.len, res->lockname.name);
1580 dlm_commit_pending_cancel(res, lock);
1581 lock->cancel_pending = 0;
1582 }
1583 dlm_lock_put(lock);
1584 }
1585 }
1586}
1587
1588
1589
1590/* removes all recovered locks from the recovery list.
1591 * sets the res->owner to the new master.
1592 * unsets the RECOVERY flag and wakes waiters. */
1593static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1594 u8 dead_node, u8 new_master)
1595{
1596 int i;
1597 struct list_head *iter, *iter2, *bucket;
1598 struct dlm_lock_resource *res;
1599
1600 mlog_entry_void();
1601
1602 assert_spin_locked(&dlm->spinlock);
1603
1604 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
1605 res = list_entry(iter, struct dlm_lock_resource, recovering);
1606 if (res->owner == dead_node) {
1607 list_del_init(&res->recovering);
1608 spin_lock(&res->spinlock);
1609 dlm_change_lockres_owner(dlm, res, new_master);
1610 res->state &= ~DLM_LOCK_RES_RECOVERING;
1611 __dlm_dirty_lockres(dlm, res);
1612 spin_unlock(&res->spinlock);
1613 wake_up(&res->wq);
1614 }
1615 }
1616
1617 /* this will become unnecessary eventually, but
1618 * for now we need to run the whole hash, clear
1619 * the RECOVERING state and set the owner
1620 * if necessary */
1621 for (i = 0; i < DLM_HASH_SIZE; i++) {
1622 bucket = &(dlm->resources[i]);
1623 list_for_each(iter, bucket) {
1624 res = list_entry(iter, struct dlm_lock_resource, list);
1625 if (res->state & DLM_LOCK_RES_RECOVERING) {
1626 if (res->owner == dead_node) {
1627 mlog(0, "(this=%u) res %.*s owner=%u "
1628 "was not on recovering list, but "
1629 "clearing state anyway\n",
1630 dlm->node_num, res->lockname.len,
1631 res->lockname.name, new_master);
1632 } else if (res->owner == dlm->node_num) {
1633 mlog(0, "(this=%u) res %.*s owner=%u "
1634 "was not on recovering list, "
1635 "owner is THIS node, clearing\n",
1636 dlm->node_num, res->lockname.len,
1637 res->lockname.name, new_master);
1638 } else
1639 continue;
1640
1641 spin_lock(&res->spinlock);
1642 dlm_change_lockres_owner(dlm, res, new_master);
1643 res->state &= ~DLM_LOCK_RES_RECOVERING;
1644 __dlm_dirty_lockres(dlm, res);
1645 spin_unlock(&res->spinlock);
1646 wake_up(&res->wq);
1647 }
1648 }
1649 }
1650}
1651
1652static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
1653{
1654 if (local) {
1655 if (lock->ml.type != LKM_EXMODE &&
1656 lock->ml.type != LKM_PRMODE)
1657 return 1;
1658 } else if (lock->ml.type == LKM_EXMODE)
1659 return 1;
1660 return 0;
1661}
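
The helper above reduces to a small truth table: for our own locks on a non-master copy, anything weaker than PR means the cached lvb can no longer be trusted; for the dead node's locks on the master, only an EX taints it. A hedged stand-alone sketch that prints the table, with mode values standing in for the LKM_* constants:

#include <stdio.h>

enum mode { NL, CR, CW, PR, PW, EX };

static const char *name[] = { "NL", "CR", "CW", "PR", "PW", "EX" };

/* mirrors the shape of dlm_lvb_needs_invalidation() */
static int needs_invalidation(enum mode type, int local)
{
	if (local)
		return type != EX && type != PR;
	return type == EX;
}

int main(void)
{
	int m;

	printf("mode  local  dead-node-on-master\n");
	for (m = NL; m <= EX; m++)
		printf("%-4s  %d      %d\n", name[m],
		       needs_invalidation(m, 1), needs_invalidation(m, 0));
	return 0;
}
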
1662
1663static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
1664 struct dlm_lock_resource *res, u8 dead_node)
1665{
1666 struct list_head *iter, *queue;
1667 struct dlm_lock *lock;
1668 int blank_lvb = 0, local = 0;
1669 int i;
1670 u8 search_node;
1671
1672 assert_spin_locked(&dlm->spinlock);
1673 assert_spin_locked(&res->spinlock);
1674
1675 if (res->owner == dlm->node_num) {
1676 /* if this node owned the lockres, and if the dead node
1677 * had an EX when he died, blank out the lvb */
1678 search_node = dead_node;
1679 } else {
1680 /* if this is a secondary lockres, and we had no EX or PR
1681 * locks granted, we can no longer trust the lvb */
1682 search_node = dlm->node_num;
1683 local = 1; /* check local state for valid lvb */
1684 }
1685
1686 for (i = DLM_GRANTED_LIST; i <= DLM_CONVERTING_LIST; i++) {
1687 queue = dlm_list_idx_to_ptr(res, i);
1688 list_for_each(iter, queue) {
1689 lock = list_entry(iter, struct dlm_lock, list);
1690 if (lock->ml.node == search_node) {
1691 if (dlm_lvb_needs_invalidation(lock, local)) {
1692 /* zero the lksb lvb and lockres lvb */
1693 blank_lvb = 1;
1694 memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
1695 }
1696 }
1697 }
1698 }
1699
1700 if (blank_lvb) {
1701 mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
1702 res->lockname.len, res->lockname.name, dead_node);
1703 memset(res->lvb, 0, DLM_LVB_LEN);
1704 }
1705}
1706
1707static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
1708 struct dlm_lock_resource *res, u8 dead_node)
1709{
1710 struct list_head *iter, *tmpiter;
1711 struct dlm_lock *lock;
1712
1713 /* this node is the lockres master:
1714 * 1) remove any stale locks for the dead node
1715 * 2) if the dead node had an EX when he died, blank out the lvb
1716 */
1717 assert_spin_locked(&dlm->spinlock);
1718 assert_spin_locked(&res->spinlock);
1719
1720 /* TODO: check pending_asts, pending_basts here */
1721 list_for_each_safe(iter, tmpiter, &res->granted) {
1722 lock = list_entry(iter, struct dlm_lock, list);
1723 if (lock->ml.node == dead_node) {
1724 list_del_init(&lock->list);
1725 dlm_lock_put(lock);
1726 }
1727 }
1728 list_for_each_safe(iter, tmpiter, &res->converting) {
1729 lock = list_entry (iter, struct dlm_lock, list);
1730 if (lock->ml.node == dead_node) {
1731 list_del_init(&lock->list);
1732 dlm_lock_put(lock);
1733 }
1734 }
1735 list_for_each_safe(iter, tmpiter, &res->blocked) {
1736 lock = list_entry(iter, struct dlm_lock, list);
1737 if (lock->ml.node == dead_node) {
1738 list_del_init(&lock->list);
1739 dlm_lock_put(lock);
1740 }
1741 }
1742
1743 /* do not kick thread yet */
1744 __dlm_dirty_lockres(dlm, res);
1745}
1746
1747/* if this node is the recovery master, and there are no
1748 * locks for a given lockres owned by this node that are in
1749 * either PR or EX mode, zero out the lvb before
1750 * requesting it.
1751 */
1752
1753
1754static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1755{
1756 struct list_head *iter;
1757 struct dlm_lock_resource *res;
1758 int i;
1759 struct list_head *bucket;
1760
1761
1762 /* purge any stale mles */
1763 dlm_clean_master_list(dlm, dead_node);
1764
1765 /*
1766 * now clean up all lock resources. there are two rules:
1767 *
1768 * 1) if the dead node was the master, move the lockres
1769 * to the recovering list. set the RECOVERING flag.
1770 * this lockres needs to be cleaned up before it can
1771 * be used further.
1772 *
1773 * 2) if this node was the master, remove all locks from
1774 * each of the lockres queues that were owned by the
1775 * dead node. once recovery finishes, the dlm thread
1776 * can be kicked again to see if any ASTs or BASTs
1777 * need to be fired as a result.
1778 */
1779 for (i = 0; i < DLM_HASH_SIZE; i++) {
1780 bucket = &(dlm->resources[i]);
1781 list_for_each(iter, bucket) {
1782 res = list_entry(iter, struct dlm_lock_resource, list);
1783 if (dlm_is_recovery_lock(res->lockname.name,
1784 res->lockname.len))
1785 continue;
1786
1787 spin_lock(&res->spinlock);
1788 /* zero the lvb if necessary */
1789 dlm_revalidate_lvb(dlm, res, dead_node);
1790 if (res->owner == dead_node)
1791 dlm_move_lockres_to_recovery_list(dlm, res);
1792 else if (res->owner == dlm->node_num) {
1793 dlm_free_dead_locks(dlm, res, dead_node);
1794 __dlm_lockres_calc_usage(dlm, res);
1795 }
1796 spin_unlock(&res->spinlock);
1797 }
1798 }
1799
1800}
1801
1802static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
1803{
1804 assert_spin_locked(&dlm->spinlock);
1805
1806 /* check to see if the node is already considered dead */
1807 if (!test_bit(idx, dlm->live_nodes_map)) {
1808 mlog(0, "for domain %s, node %d is already dead. "
1809 "another node likely did recovery already.\n",
1810 dlm->name, idx);
1811 return;
1812 }
1813
1814 /* check to see if we do not care about this node */
1815 if (!test_bit(idx, dlm->domain_map)) {
1816 /* This also catches the case that we get a node down
1817 * but haven't joined the domain yet. */
1818 mlog(0, "node %u already removed from domain!\n", idx);
1819 return;
1820 }
1821
1822 clear_bit(idx, dlm->live_nodes_map);
1823
1824 /* Clean up join state on node death. */
1825 if (dlm->joining_node == idx) {
1826 mlog(0, "Clearing join state for node %u\n", idx);
1827 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
1828 }
1829
1830 /* make sure local cleanup occurs before the heartbeat events */
1831 if (!test_bit(idx, dlm->recovery_map))
1832 dlm_do_local_recovery_cleanup(dlm, idx);
1833
1834 /* notify anything attached to the heartbeat events */
1835 dlm_hb_event_notify_attached(dlm, idx, 0);
1836
1837 mlog(0, "node %u being removed from domain map!\n", idx);
1838 clear_bit(idx, dlm->domain_map);
1839 /* wake up migration waiters if a node goes down.
1840 * perhaps later we can genericize this for other waiters. */
1841 wake_up(&dlm->migration_wq);
1842
1843 if (test_bit(idx, dlm->recovery_map))
1844 mlog(0, "domain %s, node %u already added "
1845 "to recovery map!\n", dlm->name, idx);
1846 else
1847 set_bit(idx, dlm->recovery_map);
1848}
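
The node-down path above is largely bitmap bookkeeping: the dead node leaves live_nodes_map and domain_map and enters recovery_map. A hedged user-space sketch with plain bit operations standing in for the kernel's test_bit()/set_bit()/clear_bit():

#include <stdio.h>

#define BIT(n)		(1UL << (n))
#define TEST(map, n)	(!!((map) & BIT(n)))

int main(void)
{
	unsigned long live = BIT(0) | BIT(1) | BIT(2);	/* nodes 0-2 alive */
	unsigned long domain = live;			/* all have joined */
	unsigned long recovery = 0;
	int dead = 1;

	if (TEST(live, dead) && TEST(domain, dead)) {
		live &= ~BIT(dead);	/* clear_bit(idx, live_nodes_map) */
		domain &= ~BIT(dead);	/* clear_bit(idx, domain_map) */
		recovery |= BIT(dead);	/* set_bit(idx, recovery_map) */
	}

	printf("live=%#lx domain=%#lx recovery=%#lx\n",
	       live, domain, recovery);
	return 0;
}
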
1849
1850void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
1851{
1852 struct dlm_ctxt *dlm = data;
1853
1854 if (!dlm_grab(dlm))
1855 return;
1856
1857 spin_lock(&dlm->spinlock);
1858 __dlm_hb_node_down(dlm, idx);
1859 spin_unlock(&dlm->spinlock);
1860
1861 dlm_put(dlm);
1862}
1863
1864void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
1865{
1866 struct dlm_ctxt *dlm = data;
1867
1868 if (!dlm_grab(dlm))
1869 return;
1870
1871 spin_lock(&dlm->spinlock);
1872
1873 set_bit(idx, dlm->live_nodes_map);
1874
1875 /* notify any mles attached to the heartbeat events */
1876 dlm_hb_event_notify_attached(dlm, idx, 1);
1877
1878 spin_unlock(&dlm->spinlock);
1879
1880 dlm_put(dlm);
1881}
1882
1883static void dlm_reco_ast(void *astdata)
1884{
1885 struct dlm_ctxt *dlm = astdata;
1886 mlog(0, "ast for recovery lock fired, this=%u, dlm=%s\n",
1887 dlm->node_num, dlm->name);
1888}
1889static void dlm_reco_bast(void *astdata, int blocked_type)
1890{
1891 struct dlm_ctxt *dlm = astdata;
1892 mlog(0, "bast for recovery lock fired, this=%u, dlm=%s\n",
1893 dlm->node_num, dlm->name);
1894}
1895static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
1896{
1897 mlog(0, "unlockast for recovery lock fired!\n");
1898}
1899
1900
1901static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1902{
1903 enum dlm_status ret;
1904 struct dlm_lockstatus lksb;
1905 int status = -EINVAL;
1906
1907 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
1908 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
1909retry:
1910 memset(&lksb, 0, sizeof(lksb));
1911
1912 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
1913 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
1914
1915 if (ret == DLM_NORMAL) {
1916 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
1917 dlm->name, dlm->node_num);
1918 /* I am master, send message to all nodes saying
1919 * that I am beginning a recovery session */
1920 status = dlm_send_begin_reco_message(dlm,
1921 dlm->reco.dead_node);
1922
1923 /* recovery lock is a special case. ast will not get fired,
1924 * so just go ahead and unlock it. */
1925 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
1926 if (ret != DLM_NORMAL) {
1927 /* this would really suck. this could only happen
1928 * if there was a network error during the unlock
1929 * because of node death. this means the unlock
1930 * is actually "done" and the lock structure is
1931 * even freed. we can continue, but only
1932 * because this specific lock name is special. */
1933 mlog(0, "dlmunlock returned %d\n", ret);
1934 }
1935
1936 if (status < 0) {
1937 mlog(0, "failed to send recovery message. "
1938 "must retry with new node map.\n");
1939 goto retry;
1940 }
1941 } else if (ret == DLM_NOTQUEUED) {
1942 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
1943 dlm->name, dlm->node_num);
1944 /* another node is master. wait on
1945 * reco.new_master != O2NM_INVALID_NODE_NUM */
1946 status = -EEXIST;
1947 }
1948
1949 return status;
1950}
1951
1952static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1953{
1954 struct dlm_begin_reco br;
1955 int ret = 0;
1956 struct dlm_node_iter iter;
1957 int nodenum;
1958 int status;
1959
1960 mlog_entry("%u\n", dead_node);
1961
1962 mlog(0, "dead node is %u\n", dead_node);
1963
1964 spin_lock(&dlm->spinlock);
1965 dlm_node_iter_init(dlm->domain_map, &iter);
1966 spin_unlock(&dlm->spinlock);
1967
1968 clear_bit(dead_node, iter.node_map);
1969
1970 memset(&br, 0, sizeof(br));
1971 br.node_idx = dlm->node_num;
1972 br.dead_node = dead_node;
1973
1974 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
1975 ret = 0;
1976 if (nodenum == dead_node) {
1977 mlog(0, "not sending begin reco to dead node "
1978 "%u\n", dead_node);
1979 continue;
1980 }
1981 if (nodenum == dlm->node_num) {
1982 mlog(0, "not sending begin reco to self\n");
1983 continue;
1984 }
1985
1986 /* ret is set by o2net_send_message() below */
1987 mlog(0, "attempting to send begin reco msg to %d\n",
1988 nodenum);
1989 ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
1990 &br, sizeof(br), nodenum, &status);
1991 /* negative status is handled ok by caller here */
1992 if (ret >= 0)
1993 ret = status;
1994 if (ret < 0) {
1995 struct dlm_lock_resource *res;
1996 mlog_errno(ret);
1997 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
1998 " returned %d\n", dlm->name, nodenum, ret);
1999 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2000 DLM_RECOVERY_LOCK_NAME_LEN);
2001 if (res) {
2002 dlm_print_one_lock_resource(res);
2003 dlm_lockres_put(res);
2004 } else {
2005 mlog(ML_ERROR, "recovery lock not found\n");
2006 }
2007 break;
2008 }
2009 }
2010
2011 return ret;
2012}
2013
2014int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2015{
2016 struct dlm_ctxt *dlm = data;
2017 struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
2018
2019 /* ok to return 0, domain has gone away */
2020 if (!dlm_grab(dlm))
2021 return 0;
2022
2023 mlog(0, "node %u wants to recover node %u\n",
2024 br->node_idx, br->dead_node);
2025
2026 dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
2027
2028 spin_lock(&dlm->spinlock);
2029 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2030 mlog(0, "new_master already set to %u!\n",
2031 dlm->reco.new_master);
2032 }
2033 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2034 mlog(0, "dead_node already set to %u!\n",
2035 dlm->reco.dead_node);
2036 }
2037 dlm->reco.new_master = br->node_idx;
2038 dlm->reco.dead_node = br->dead_node;
2039 if (!test_bit(br->dead_node, dlm->recovery_map)) {
2040 mlog(ML_ERROR, "recovery master %u sees %u as dead, but this "
2041 "node has not yet. marking %u as dead\n",
2042 br->node_idx, br->dead_node, br->dead_node);
2043 __dlm_hb_node_down(dlm, br->dead_node);
2044 }
2045 spin_unlock(&dlm->spinlock);
2046
2047 dlm_kick_recovery_thread(dlm);
2048 dlm_put(dlm);
2049 return 0;
2050}
2051
2052static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
2053{
2054 int ret = 0;
2055 struct dlm_finalize_reco fr;
2056 struct dlm_node_iter iter;
2057 int nodenum;
2058 int status;
2059
2060 mlog(0, "finishing recovery for node %s:%u\n",
2061 dlm->name, dlm->reco.dead_node);
2062
2063 spin_lock(&dlm->spinlock);
2064 dlm_node_iter_init(dlm->domain_map, &iter);
2065 spin_unlock(&dlm->spinlock);
2066
2067 memset(&fr, 0, sizeof(fr));
2068 fr.node_idx = dlm->node_num;
2069 fr.dead_node = dlm->reco.dead_node;
2070
2071 while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2072 if (nodenum == dlm->node_num)
2073 continue;
2074 ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
2075 &fr, sizeof(fr), nodenum, &status);
2076 if (ret >= 0) {
2077 ret = status;
2078 if (dlm_is_host_down(ret)) {
2079 /* this has no effect on this recovery
2080 * session, so set the status to zero to
2081 * finish out the last recovery */
2082 mlog(ML_ERROR, "node %u went down after this "
2083 "node finished recovery.\n", nodenum);
2084 ret = 0;
2085 }
2086 }
2087 if (ret < 0) {
2088 mlog_errno(ret);
2089 break;
2090 }
2091 }
2092
2093 return ret;
2094}
2095
2096int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2097{
2098 struct dlm_ctxt *dlm = data;
2099 struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
2100
2101 /* ok to return 0, domain has gone away */
2102 if (!dlm_grab(dlm))
2103 return 0;
2104
2105 mlog(0, "node %u finalizing recovery of node %u\n",
2106 fr->node_idx, fr->dead_node);
2107
2108 spin_lock(&dlm->spinlock);
2109
2110 if (dlm->reco.new_master != fr->node_idx) {
2111 mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
2112 "%u is supposed to be the new master, dead=%u\n",
2113 fr->node_idx, dlm->reco.new_master, fr->dead_node);
2114 BUG();
2115 }
2116 if (dlm->reco.dead_node != fr->dead_node) {
2117 mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
2118 "node %u, but node %u is supposed to be dead\n",
2119 fr->node_idx, fr->dead_node, dlm->reco.dead_node);
2120 BUG();
2121 }
2122
2123 dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
2124
2125 spin_unlock(&dlm->spinlock);
2126
2127 dlm_reset_recovery(dlm);
2128
2129 dlm_kick_recovery_thread(dlm);
2130 dlm_put(dlm);
2131 return 0;
2132}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
new file mode 100644
index 000000000000..5be9d14f12cb
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -0,0 +1,692 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmthread.c
5 *
6 * standalone DLM module
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/timer.h>
41#include <linux/kthread.h>
42
43
44#include "cluster/heartbeat.h"
45#include "cluster/nodemanager.h"
46#include "cluster/tcp.h"
47
48#include "dlmapi.h"
49#include "dlmcommon.h"
50#include "dlmdomain.h"
51
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
53#include "cluster/masklog.h"
54
55static int dlm_thread(void *data);
56
57static void dlm_flush_asts(struct dlm_ctxt *dlm);
58
59#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
60
61/* will exit holding res->spinlock, but may drop in function */
62/* waits until flags are cleared on res->state */
63void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
64{
65 DECLARE_WAITQUEUE(wait, current);
66
67 assert_spin_locked(&res->spinlock);
68
69 add_wait_queue(&res->wq, &wait);
70repeat:
71 set_current_state(TASK_UNINTERRUPTIBLE);
72 if (res->state & flags) {
73 spin_unlock(&res->spinlock);
74 schedule();
75 spin_lock(&res->spinlock);
76 goto repeat;
77 }
78 remove_wait_queue(&res->wq, &wait);
79 __set_current_state(TASK_RUNNING);
80}
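
The wait loop above is the kernel's open-coded condition wait: register on the waitqueue, mark the task sleeping, re-test the condition, and drop the spinlock around schedule(). A hedged user-space analogue built on a pthread condition variable, where wake_up() maps to pthread_cond_broadcast() (build with cc -pthread):

#include <pthread.h>
#include <stdio.h>

#define FLAG_BUSY	0x1

struct fake_res {
	pthread_mutex_t lock;
	pthread_cond_t wq;
	int state;
};

/* returns with res->lock held once (state & flags) is clear,
 * mirroring the shape of __dlm_wait_on_lockres_flags() */
static void wait_on_flags(struct fake_res *res, int flags)
{
	pthread_mutex_lock(&res->lock);
	while (res->state & flags)
		pthread_cond_wait(&res->wq, &res->lock);
}

/* a peer clears the flag and wakes every waiter, cf. wake_up(&res->wq) */
static void *clearer(void *arg)
{
	struct fake_res *res = arg;

	pthread_mutex_lock(&res->lock);
	res->state &= ~FLAG_BUSY;
	pthread_cond_broadcast(&res->wq);
	pthread_mutex_unlock(&res->lock);
	return NULL;
}

int main(void)
{
	struct fake_res res = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, FLAG_BUSY
	};
	pthread_t t;

	pthread_create(&t, NULL, clearer, &res);
	wait_on_flags(&res, FLAG_BUSY);	/* blocks until the flag drops */
	printf("flag cleared, lock held again\n");
	pthread_mutex_unlock(&res.lock);
	pthread_join(t, NULL);
	return 0;
}
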
81
82
83static int __dlm_lockres_unused(struct dlm_lock_resource *res)
84{
85 if (list_empty(&res->granted) &&
86 list_empty(&res->converting) &&
87 list_empty(&res->blocked) &&
88 list_empty(&res->dirty))
89 return 1;
90 return 0;
91}
92
93
94/* Call whenever you may have added or deleted something from one of
95 * the lockres queue's. This will figure out whether it belongs on the
96 * unused list or not and does the appropriate thing. */
97void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
98 struct dlm_lock_resource *res)
99{
100 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
101
102 assert_spin_locked(&dlm->spinlock);
103 assert_spin_locked(&res->spinlock);
104
105 if (__dlm_lockres_unused(res)) {
106 if (list_empty(&res->purge)) {
107 mlog(0, "putting lockres %.*s on purge list\n",
108 res->lockname.len, res->lockname.name);
109
110 res->last_used = jiffies;
111 list_add_tail(&res->purge, &dlm->purge_list);
112 dlm->purge_count++;
113 }
114 } else if (!list_empty(&res->purge)) {
115 mlog(0, "removing lockres %.*s from purge list\n",
116 res->lockname.len, res->lockname.name);
117
118 list_del_init(&res->purge);
119 dlm->purge_count--;
120 }
121}
122
123void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
124 struct dlm_lock_resource *res)
125{
126 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
127 spin_lock(&dlm->spinlock);
128 spin_lock(&res->spinlock);
129
130 __dlm_lockres_calc_usage(dlm, res);
131
132 spin_unlock(&res->spinlock);
133 spin_unlock(&dlm->spinlock);
134}
135
136/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
137 * to do migration, but will re-acquire before exit. */
138void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
139{
140 int master;
141 int ret;
142
143 spin_lock(&lockres->spinlock);
144 master = lockres->owner == dlm->node_num;
145 spin_unlock(&lockres->spinlock);
146
147 mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
148 lockres->lockname.name, master);
149
150 /* Non master is the easy case -- no migration required, just
151 * quit. */
152 if (!master)
153 goto finish;
154
155 /* Wheee! Migrate lockres here! */
156 spin_unlock(&dlm->spinlock);
157again:
158
159 ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
160 if (ret == -ENOTEMPTY) {
161 mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
162 lockres->lockname.len, lockres->lockname.name);
163
164 BUG();
165 } else if (ret < 0) {
166 mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
167 lockres->lockname.len, lockres->lockname.name);
168 goto again;
169 }
170
171 spin_lock(&dlm->spinlock);
172
173finish:
174 if (!list_empty(&lockres->purge)) {
175 list_del_init(&lockres->purge);
176 dlm->purge_count--;
177 }
178 __dlm_unhash_lockres(lockres);
179}
180
181static void dlm_run_purge_list(struct dlm_ctxt *dlm,
182 int purge_now)
183{
184 unsigned int run_max, unused;
185 unsigned long purge_jiffies;
186 struct dlm_lock_resource *lockres;
187
188 spin_lock(&dlm->spinlock);
189 run_max = dlm->purge_count;
190
191 while (run_max && !list_empty(&dlm->purge_list)) {
192 run_max--;
193
194 lockres = list_entry(dlm->purge_list.next,
195 struct dlm_lock_resource, purge);
196
197 /* Status of the lockres *might* change so double
198 * check. If the lockres is unused, holding the dlm
199 * spinlock will prevent people from getting any more
200 * refs on it -- there's no need to keep the lockres
201 * spinlock. */
202 spin_lock(&lockres->spinlock);
203 unused = __dlm_lockres_unused(lockres);
204 spin_unlock(&lockres->spinlock);
205
206 if (!unused)
207 continue;
208
209 purge_jiffies = lockres->last_used +
210 msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
211
212 /* Make sure that we want to be processing this guy at
213 * this time. */
214 if (!purge_now && time_after(purge_jiffies, jiffies)) {
215 /* Since resources are added to the purge list
216 * in tail order, we can stop at the first
217 * unpurgeable resource -- anyone added after
218 * him will have a greater last_used value */
219 break;
220 }
221
222 list_del_init(&lockres->purge);
223 dlm->purge_count--;
224
225 /* This may drop and reacquire the dlm spinlock if it
226 * has to do migration. */
227 mlog(0, "calling dlm_purge_lockres!\n");
228 dlm_purge_lockres(dlm, lockres);
229 mlog(0, "DONE calling dlm_purge_lockres!\n");
230
231 /* Avoid adding any scheduling latencies */
232 cond_resched_lock(&dlm->spinlock);
233 }
234
235 spin_unlock(&dlm->spinlock);
236}
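
The purge-delay test above leans on time_after(), which compares jiffies by signed subtraction so the answer stays correct across counter wraparound. A small sketch of why the naive comparison fails; the macro here is a simplified copy, not the kernel header:

#include <limits.h>
#include <stdio.h>

#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long purge_at = ULONG_MAX - 0xf;	/* deadline set just before wrap */
	unsigned long now = 0x10;			/* jiffies wrapped past it */

	/* naive "now > purge_at" wrongly says the deadline is far away */
	printf("naive now > purge_at:       %d\n", now > purge_at);
	/* time_after() correctly reports that the deadline has passed */
	printf("time_after(now, purge_at):  %d\n", time_after(now, purge_at));
	return 0;
}
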
237
238static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
239 struct dlm_lock_resource *res)
240{
241 struct dlm_lock *lock, *target;
242 struct list_head *iter;
243 struct list_head *head;
244 int can_grant = 1;
245
246 //mlog(0, "res->lockname.len=%d\n", res->lockname.len);
247 //mlog(0, "res->lockname.name=%p\n", res->lockname.name);
248 //mlog(0, "shuffle res %.*s\n", res->lockname.len,
249 // res->lockname.name);
250
251 /* because this function is called with the lockres
252 * spinlock, and because we know that it is not migrating/
253 * recovering/in-progress, it is fine to reserve asts and
254 * basts right before queueing them all throughout */
255 assert_spin_locked(&res->spinlock);
256 BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
257 DLM_LOCK_RES_RECOVERING|
258 DLM_LOCK_RES_IN_PROGRESS)));
259
260converting:
261 if (list_empty(&res->converting))
262 goto blocked;
263 mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
264 res->lockname.name);
265
266 target = list_entry(res->converting.next, struct dlm_lock, list);
267 if (target->ml.convert_type == LKM_IVMODE) {
268 mlog(ML_ERROR, "%.*s: converting a lock with no "
269 "convert_type!\n", res->lockname.len, res->lockname.name);
270 BUG();
271 }
272 head = &res->granted;
273 list_for_each(iter, head) {
274 lock = list_entry(iter, struct dlm_lock, list);
275 if (lock == target)
276 continue;
277 if (!dlm_lock_compatible(lock->ml.type,
278 target->ml.convert_type)) {
279 can_grant = 0;
280 /* queue the BAST if not already */
281 if (lock->ml.highest_blocked == LKM_IVMODE) {
282 __dlm_lockres_reserve_ast(res);
283 dlm_queue_bast(dlm, lock);
284 }
285 /* update the highest_blocked if needed */
286 if (lock->ml.highest_blocked < target->ml.convert_type)
287 lock->ml.highest_blocked =
288 target->ml.convert_type;
289 }
290 }
291 head = &res->converting;
292 list_for_each(iter, head) {
293 lock = list_entry(iter, struct dlm_lock, list);
294 if (lock == target)
295 continue;
296 if (!dlm_lock_compatible(lock->ml.type,
297 target->ml.convert_type)) {
298 can_grant = 0;
299 if (lock->ml.highest_blocked == LKM_IVMODE) {
300 __dlm_lockres_reserve_ast(res);
301 dlm_queue_bast(dlm, lock);
302 }
303 if (lock->ml.highest_blocked < target->ml.convert_type)
304 lock->ml.highest_blocked =
305 target->ml.convert_type;
306 }
307 }
308
309 /* we can convert the lock */
310 if (can_grant) {
311 spin_lock(&target->spinlock);
312 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
313
314 mlog(0, "calling ast for converting lock: %.*s, have: %d, "
315 "granting: %d, node: %u\n", res->lockname.len,
316 res->lockname.name, target->ml.type,
317 target->ml.convert_type, target->ml.node);
318
319 target->ml.type = target->ml.convert_type;
320 target->ml.convert_type = LKM_IVMODE;
321 list_del_init(&target->list);
322 list_add_tail(&target->list, &res->granted);
323
324 BUG_ON(!target->lksb);
325 target->lksb->status = DLM_NORMAL;
326
327 spin_unlock(&target->spinlock);
328
329 __dlm_lockres_reserve_ast(res);
330 dlm_queue_ast(dlm, target);
331 /* go back and check for more */
332 goto converting;
333 }
334
335blocked:
336 if (list_empty(&res->blocked))
337 goto leave;
338 target = list_entry(res->blocked.next, struct dlm_lock, list);
339
340 head = &res->granted;
341 list_for_each(iter, head) {
342 lock = list_entry(iter, struct dlm_lock, list);
343 if (lock == target)
344 continue;
345 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
346 can_grant = 0;
347 if (lock->ml.highest_blocked == LKM_IVMODE) {
348 __dlm_lockres_reserve_ast(res);
349 dlm_queue_bast(dlm, lock);
350 }
351 if (lock->ml.highest_blocked < target->ml.type)
352 lock->ml.highest_blocked = target->ml.type;
353 }
354 }
355
356 head = &res->converting;
357 list_for_each(iter, head) {
358 lock = list_entry(iter, struct dlm_lock, list);
359 if (lock == target)
360 continue;
361 if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
362 can_grant = 0;
363 if (lock->ml.highest_blocked == LKM_IVMODE) {
364 __dlm_lockres_reserve_ast(res);
365 dlm_queue_bast(dlm, lock);
366 }
367 if (lock->ml.highest_blocked < target->ml.type)
368 lock->ml.highest_blocked = target->ml.type;
369 }
370 }
371
372 /* we can grant the blocked lock (only
373 * possible if converting list empty) */
374 if (can_grant) {
375 spin_lock(&target->spinlock);
376 BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
377
378 mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
379 "node: %u\n", res->lockname.len, res->lockname.name,
380 target->ml.type, target->ml.node);
381
382 /* target->ml.type is already correct */
383 list_del_init(&target->list);
384 list_add_tail(&target->list, &res->granted);
385
386 BUG_ON(!target->lksb);
387 target->lksb->status = DLM_NORMAL;
388
389 spin_unlock(&target->spinlock);
390
391 __dlm_lockres_reserve_ast(res);
392 dlm_queue_ast(dlm, target);
393 /* go back and check for more */
394 goto converting;
395 }
396
397leave:
398 return;
399}
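
dlm_shuffle_lists() grants a waiting lock only when its mode is compatible with every lock already granted or converting. The sketch below prints the textbook VMS-style compatibility matrix for the six lock modes; the patch does not show whether dlm_lock_compatible() is literally table-driven, so treat this purely as illustration:

#include <stdio.h>

enum { NL, CR, CW, PR, PW, EX, NMODES };

static const char *name[NMODES] = { "NL", "CR", "CW", "PR", "PW", "EX" };

/* compat[held][requested]: 1 if the two modes can coexist */
static const int compat[NMODES][NMODES] = {
	/*        NL CR CW PR PW EX */
	/* NL */ { 1, 1, 1, 1, 1, 1 },
	/* CR */ { 1, 1, 1, 1, 1, 0 },
	/* CW */ { 1, 1, 1, 0, 0, 0 },
	/* PR */ { 1, 1, 0, 1, 0, 0 },
	/* PW */ { 1, 1, 0, 0, 0, 0 },
	/* EX */ { 1, 0, 0, 0, 0, 0 },
};

int main(void)
{
	int held, want;

	printf("    NL CR CW PR PW EX\n");
	for (held = NL; held < NMODES; held++) {
		printf("%s ", name[held]);
		for (want = NL; want < NMODES; want++)
			printf("  %d", compat[held][want]);
		printf("\n");
	}
	return 0;
}
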
400
401/* must have NO locks when calling this with res != NULL */
402void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
403{
404 mlog_entry("dlm=%p, res=%p\n", dlm, res);
405 if (res) {
406 spin_lock(&dlm->spinlock);
407 spin_lock(&res->spinlock);
408 __dlm_dirty_lockres(dlm, res);
409 spin_unlock(&res->spinlock);
410 spin_unlock(&dlm->spinlock);
411 }
412 wake_up(&dlm->dlm_thread_wq);
413}
414
415void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
416{
417 mlog_entry("dlm=%p, res=%p\n", dlm, res);
418
419 assert_spin_locked(&dlm->spinlock);
420 assert_spin_locked(&res->spinlock);
421
422 /* don't shuffle secondary queues */
423 if ((res->owner == dlm->node_num) &&
424 !(res->state & DLM_LOCK_RES_DIRTY)) {
425 list_add_tail(&res->dirty, &dlm->dirty_list);
426 res->state |= DLM_LOCK_RES_DIRTY;
427 }
428}
429
430
431/* Launch the NM thread for the mounted volume */
432int dlm_launch_thread(struct dlm_ctxt *dlm)
433{
434 mlog(0, "starting dlm thread...\n");
435
436 dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
437 if (IS_ERR(dlm->dlm_thread_task)) {
438 mlog_errno(PTR_ERR(dlm->dlm_thread_task));
439 dlm->dlm_thread_task = NULL;
440 return -EINVAL;
441 }
442
443 return 0;
444}
445
446void dlm_complete_thread(struct dlm_ctxt *dlm)
447{
448 if (dlm->dlm_thread_task) {
449 mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
450 kthread_stop(dlm->dlm_thread_task);
451 dlm->dlm_thread_task = NULL;
452 }
453}
454
455static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
456{
457 int empty;
458
459 spin_lock(&dlm->spinlock);
460 empty = list_empty(&dlm->dirty_list);
461 spin_unlock(&dlm->spinlock);
462
463 return empty;
464}
465
466static void dlm_flush_asts(struct dlm_ctxt *dlm)
467{
468 int ret;
469 struct dlm_lock *lock;
470 struct dlm_lock_resource *res;
471 u8 hi;
472
473 spin_lock(&dlm->ast_lock);
474 while (!list_empty(&dlm->pending_asts)) {
475 lock = list_entry(dlm->pending_asts.next,
476 struct dlm_lock, ast_list);
477 /* get an extra ref on lock */
478 dlm_lock_get(lock);
479 res = lock->lockres;
480 mlog(0, "delivering an ast for this lockres\n");
481
482 BUG_ON(!lock->ast_pending);
483
484 /* remove from list (including ref) */
485 list_del_init(&lock->ast_list);
486 dlm_lock_put(lock);
487 spin_unlock(&dlm->ast_lock);
488
489 if (lock->ml.node != dlm->node_num) {
490 ret = dlm_do_remote_ast(dlm, res, lock);
491 if (ret < 0)
492 mlog_errno(ret);
493 } else
494 dlm_do_local_ast(dlm, res, lock);
495
496 spin_lock(&dlm->ast_lock);
497
498 /* possible that another ast was queued while
499 * we were delivering the last one */
500 if (!list_empty(&lock->ast_list)) {
501 mlog(0, "aha another ast got queued while "
502 "we were finishing the last one. will "
503 "keep the ast_pending flag set.\n");
504 } else
505 lock->ast_pending = 0;
506
507 /* drop the extra ref.
508 * this may drop it completely. */
509 dlm_lock_put(lock);
510 dlm_lockres_release_ast(dlm, res);
511 }
512
513 while (!list_empty(&dlm->pending_basts)) {
514 lock = list_entry(dlm->pending_basts.next,
515 struct dlm_lock, bast_list);
516 /* get an extra ref on lock */
517 dlm_lock_get(lock);
518 res = lock->lockres;
519
520 BUG_ON(!lock->bast_pending);
521
522		/* read off the highest blocked level, and reset it */
523 spin_lock(&lock->spinlock);
524 BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
525 hi = lock->ml.highest_blocked;
526 lock->ml.highest_blocked = LKM_IVMODE;
527 spin_unlock(&lock->spinlock);
528
529 /* remove from list (including ref) */
530 list_del_init(&lock->bast_list);
531 dlm_lock_put(lock);
532 spin_unlock(&dlm->ast_lock);
533
534 mlog(0, "delivering a bast for this lockres "
535		     "(blocked = %d)\n", hi);
536
537 if (lock->ml.node != dlm->node_num) {
538 ret = dlm_send_proxy_bast(dlm, res, lock, hi);
539 if (ret < 0)
540 mlog_errno(ret);
541 } else
542 dlm_do_local_bast(dlm, res, lock, hi);
543
544 spin_lock(&dlm->ast_lock);
545
546 /* possible that another bast was queued while
547 * we were delivering the last one */
548 if (!list_empty(&lock->bast_list)) {
549 mlog(0, "aha another bast got queued while "
550 "we were finishing the last one. will "
551 "keep the bast_pending flag set.\n");
552 } else
553 lock->bast_pending = 0;
554
555 /* drop the extra ref.
556 * this may drop it completely. */
557 dlm_lock_put(lock);
558 dlm_lockres_release_ast(dlm, res);
559 }
560 wake_up(&dlm->ast_wq);
561 spin_unlock(&dlm->ast_lock);
562}
563
564
565#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
566#define DLM_THREAD_MAX_DIRTY 100
567#define DLM_THREAD_MAX_ASTS 10
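/* Pacing notes for the thread below: it sleeps for at most
 * DLM_THREAD_TIMEOUT_MS between passes and processes at most
 * DLM_THREAD_MAX_DIRTY lockres entries per pass before throttling.
 * DLM_THREAD_MAX_ASTS is presumably a similar budget for the ast
 * flush path; it is not referenced in the code shown in this file. */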
568
569static int dlm_thread(void *data)
570{
571 struct dlm_lock_resource *res;
572 struct dlm_ctxt *dlm = data;
573 unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
574
575 mlog(0, "dlm thread running for %s...\n", dlm->name);
576
577 while (!kthread_should_stop()) {
578 int n = DLM_THREAD_MAX_DIRTY;
579
580		/* dlm_shutting_down() is a point-in-time check, but
581		 * that doesn't matter: if we catch the leading edge
582		 * of a state transition and read a stale answer, we
583		 * just loop back around and check again. */
584 dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
585
586 /* We really don't want to hold dlm->spinlock while
587 * calling dlm_shuffle_lists on each lockres that
588 * needs to have its queues adjusted and AST/BASTs
589 * run. So let's pull each entry off the dirty_list
590 * and drop dlm->spinlock ASAP. Once off the list,
591 * res->spinlock needs to be taken again to protect
592 * the queues while calling dlm_shuffle_lists. */
593 spin_lock(&dlm->spinlock);
594 while (!list_empty(&dlm->dirty_list)) {
595 int delay = 0;
596 res = list_entry(dlm->dirty_list.next,
597 struct dlm_lock_resource, dirty);
598
599 /* peel a lockres off, remove it from the list,
600 * unset the dirty flag and drop the dlm lock */
601 BUG_ON(!res);
602 dlm_lockres_get(res);
603
604 spin_lock(&res->spinlock);
605 res->state &= ~DLM_LOCK_RES_DIRTY;
606 list_del_init(&res->dirty);
607 spin_unlock(&res->spinlock);
608 spin_unlock(&dlm->spinlock);
609
610 /* lockres can be re-dirtied/re-added to the
611 * dirty_list in this gap, but that is ok */
612
613 spin_lock(&res->spinlock);
614 if (res->owner != dlm->node_num) {
615 __dlm_print_one_lock_resource(res);
616 mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
617 res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
618 res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
619 res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
620 res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
621 }
622 BUG_ON(res->owner != dlm->node_num);
623
624 /* it is now ok to move lockreses in these states
625 * to the dirty list, assuming that they will only be
626 * dirty for a short while. */
627 if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
628 DLM_LOCK_RES_MIGRATING |
629 DLM_LOCK_RES_RECOVERING)) {
630 /* move it to the tail and keep going */
631 spin_unlock(&res->spinlock);
632 mlog(0, "delaying list shuffling for in-"
633 "progress lockres %.*s, state=%d\n",
634 res->lockname.len, res->lockname.name,
635 res->state);
636 delay = 1;
637 goto in_progress;
638 }
639
640 /* at this point the lockres is not migrating/
641 * recovering/in-progress. we have the lockres
642 * spinlock and do NOT have the dlm lock.
643 * safe to reserve/queue asts and run the lists. */
644
645 mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
646 "res=%p\n", dlm, res);
647
648 /* called while holding lockres lock */
649 dlm_shuffle_lists(dlm, res);
650 spin_unlock(&res->spinlock);
651
652 dlm_lockres_calc_usage(dlm, res);
653
654in_progress:
655
656 spin_lock(&dlm->spinlock);
657 /* if the lock was in-progress, stick
658 * it on the back of the list */
659 if (delay) {
660 spin_lock(&res->spinlock);
661 list_add_tail(&res->dirty, &dlm->dirty_list);
662 res->state |= DLM_LOCK_RES_DIRTY;
663 spin_unlock(&res->spinlock);
664 }
665 dlm_lockres_put(res);
666
667 /* unlikely, but we may need to give time to
668 * other tasks */
669 if (!--n) {
670 mlog(0, "throttling dlm_thread\n");
671 break;
672 }
673 }
674
675 spin_unlock(&dlm->spinlock);
676 dlm_flush_asts(dlm);
677
678 /* yield and continue right away if there is more work to do */
679 if (!n) {
680 yield();
681 continue;
682 }
683
684 wait_event_interruptible_timeout(dlm->dlm_thread_wq,
685 !dlm_dirty_list_empty(dlm) ||
686 kthread_should_stop(),
687 timeout);
688 }
689
690 mlog(0, "quitting DLM thread\n");
691 return 0;
692}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
new file mode 100644
index 000000000000..cec2ce1cd318
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -0,0 +1,672 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmunlock.c
5 *
6 * underlying calls for unlocking locks
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27
28#include <linux/module.h>
29#include <linux/fs.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h>
35#include <linux/sysctl.h>
36#include <linux/random.h>
37#include <linux/blkdev.h>
38#include <linux/socket.h>
39#include <linux/inet.h>
40#include <linux/spinlock.h>
41#include <linux/delay.h>
42
43#include "cluster/heartbeat.h"
44#include "cluster/nodemanager.h"
45#include "cluster/tcp.h"
46
47#include "dlmapi.h"
48#include "dlmcommon.h"
49
50#define MLOG_MASK_PREFIX ML_DLM
51#include "cluster/masklog.h"
52
53#define DLM_UNLOCK_FREE_LOCK 0x00000001
54#define DLM_UNLOCK_CALL_AST 0x00000002
55#define DLM_UNLOCK_REMOVE_LOCK 0x00000004
56#define DLM_UNLOCK_REGRANT_LOCK 0x00000008
57#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010
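/* How dlmunlock_common() below consumes these action bits:
 * DLM_UNLOCK_REMOVE_LOCK deletes the lock from its current queue,
 * DLM_UNLOCK_REGRANT_LOCK puts a cancelled convert back on the
 * granted queue, DLM_UNLOCK_CLEAR_CONVERT_TYPE resets ml.convert_type,
 * DLM_UNLOCK_CALL_AST tells the caller to fire the unlockast, and
 * DLM_UNLOCK_FREE_LOCK drops an extra reference so the caller's final
 * dlm_lock_put can free the lock (always paired with REMOVE_LOCK). */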
58
59
60static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
61 struct dlm_lock_resource *res,
62 struct dlm_lock *lock,
63 struct dlm_lockstatus *lksb,
64 int *actions);
65static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
66 struct dlm_lock_resource *res,
67 struct dlm_lock *lock,
68 struct dlm_lockstatus *lksb,
69 int *actions);
70
71static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
72 struct dlm_lock_resource *res,
73 struct dlm_lock *lock,
74 struct dlm_lockstatus *lksb,
75 int flags,
76 u8 owner);
77
78
79/*
80 * according to the spec:
81 * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
82 *
83 * flags & LKM_CANCEL != 0: must be converting or blocked
84 * flags & LKM_CANCEL == 0: must be granted
85 *
86 * So to unlock a converting lock, you must first cancel the
87 * convert (passing LKM_CANCEL in flags), then call the unlock
88 * again (with no LKM_CANCEL in flags).
89 */
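/* A sketch of that two-step sequence from a (hypothetical) caller's
 * point of view, using the dlmunlock() signature defined at the bottom
 * of this file; error handling and the wait for the cancel ast are
 * elided:
 *
 *	status = dlmunlock(dlm, lksb, LKM_CANCEL, unlockast, data);
 *	... wait for the cancel to complete; lock is regranted ...
 *	status = dlmunlock(dlm, lksb, 0, unlockast, data);
 */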
90
91
92/*
93 * locking:
94 * caller needs: none
95 * taken: res->spinlock and lock->spinlock taken and dropped
96 * held on exit: none
97 * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
98 * all callers should have taken an extra ref on lock coming in
99 */
100static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
101 struct dlm_lock_resource *res,
102 struct dlm_lock *lock,
103 struct dlm_lockstatus *lksb,
104 int flags, int *call_ast,
105 int master_node)
106{
107 enum dlm_status status;
108 int actions = 0;
109 int in_use;
110 u8 owner;
111
112 mlog(0, "master_node = %d, valblk = %d\n", master_node,
113 flags & LKM_VALBLK);
114
115 if (master_node)
116 BUG_ON(res->owner != dlm->node_num);
117 else
118 BUG_ON(res->owner == dlm->node_num);
119
120 spin_lock(&dlm->spinlock);
121 /* We want to be sure that we're not freeing a lock
122	 * that still has ASTs pending... */
123 in_use = !list_empty(&lock->ast_list);
124 spin_unlock(&dlm->spinlock);
125 if (in_use) {
126 mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
127		     "while waiting for an ast!\n", res->lockname.len,
128 res->lockname.name);
129 return DLM_BADPARAM;
130 }
131
132 spin_lock(&res->spinlock);
133 if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
134 if (master_node) {
135 mlog(ML_ERROR, "lockres in progress!\n");
136 spin_unlock(&res->spinlock);
137 return DLM_FORWARD;
138 }
139 /* ok for this to sleep if not in a network handler */
140 __dlm_wait_on_lockres(res);
141 res->state |= DLM_LOCK_RES_IN_PROGRESS;
142 }
143 spin_lock(&lock->spinlock);
144
145 if (res->state & DLM_LOCK_RES_RECOVERING) {
146 status = DLM_RECOVERING;
147 goto leave;
148 }
149
150
151 /* see above for what the spec says about
152 * LKM_CANCEL and the lock queue state */
153 if (flags & LKM_CANCEL)
154 status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
155 else
156 status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
157
158 if (status != DLM_NORMAL)
159 goto leave;
160
161 /* By now this has been masked out of cancel requests. */
162 if (flags & LKM_VALBLK) {
163 /* make the final update to the lvb */
164 if (master_node)
165 memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
166 else
167 flags |= LKM_PUT_LVB; /* let the send function
168 * handle it. */
169 }
170
171 if (!master_node) {
172 owner = res->owner;
173 /* drop locks and send message */
174 if (flags & LKM_CANCEL)
175 lock->cancel_pending = 1;
176 else
177 lock->unlock_pending = 1;
178 spin_unlock(&lock->spinlock);
179 spin_unlock(&res->spinlock);
180 status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
181 flags, owner);
182 spin_lock(&res->spinlock);
183 spin_lock(&lock->spinlock);
184 /* if the master told us the lock was already granted,
185 * let the ast handle all of these actions */
186 if (status == DLM_NORMAL &&
187 lksb->status == DLM_CANCELGRANT) {
188 actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
189 DLM_UNLOCK_REGRANT_LOCK|
190 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
191 }
192 if (flags & LKM_CANCEL)
193 lock->cancel_pending = 0;
194 else
195 lock->unlock_pending = 0;
196
197 }
198
199 /* get an extra ref on lock. if we are just switching
200	 * lists here, we don't want the lock to go away. */
201 dlm_lock_get(lock);
202
203 if (actions & DLM_UNLOCK_REMOVE_LOCK) {
204 list_del_init(&lock->list);
205 dlm_lock_put(lock);
206 }
207 if (actions & DLM_UNLOCK_REGRANT_LOCK) {
208 dlm_lock_get(lock);
209 list_add_tail(&lock->list, &res->granted);
210 }
211 if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
212 mlog(0, "clearing convert_type at %smaster node\n",
213 master_node ? "" : "non-");
214 lock->ml.convert_type = LKM_IVMODE;
215 }
216
217 /* remove the extra ref on lock */
218 dlm_lock_put(lock);
219
220leave:
221 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
222 if (!dlm_lock_on_list(&res->converting, lock))
223 BUG_ON(lock->ml.convert_type != LKM_IVMODE);
224 else
225 BUG_ON(lock->ml.convert_type == LKM_IVMODE);
226 spin_unlock(&lock->spinlock);
227 spin_unlock(&res->spinlock);
228 wake_up(&res->wq);
229
230 /* let the caller's final dlm_lock_put handle the actual kfree */
231 if (actions & DLM_UNLOCK_FREE_LOCK) {
232 /* this should always be coupled with list removal */
233 BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
234 mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n",
235 lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1);
236 dlm_lock_put(lock);
237 }
238 if (actions & DLM_UNLOCK_CALL_AST)
239 *call_ast = 1;
240
241 /* if cancel or unlock succeeded, lvb work is done */
242 if (status == DLM_NORMAL)
243 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
244
245 return status;
246}
247
248void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
249 struct dlm_lock *lock)
250{
251 /* leave DLM_LKSB_PUT_LVB on the lksb so any final
252 * update of the lvb will be sent to the new master */
253 list_del_init(&lock->list);
254}
255
256void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
257 struct dlm_lock *lock)
258{
259 list_del_init(&lock->list);
260 list_add_tail(&lock->list, &res->granted);
261 lock->ml.convert_type = LKM_IVMODE;
262}
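/* Note that dlm_commit_pending_cancel() above mirrors the
 * DLM_UNLOCK_REGRANT_LOCK and DLM_UNLOCK_CLEAR_CONVERT_TYPE actions
 * performed in dlmunlock_common(): the cancelled convert goes back on
 * the granted queue with its convert type cleared. */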
263
264
265static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
266 struct dlm_lock_resource *res,
267 struct dlm_lock *lock,
268 struct dlm_lockstatus *lksb,
269 int flags,
270 int *call_ast)
271{
272 return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
273}
274
275static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
276 struct dlm_lock_resource *res,
277 struct dlm_lock *lock,
278 struct dlm_lockstatus *lksb,
279 int flags, int *call_ast)
280{
281 return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
282}
283
284/*
285 * locking:
286 * caller needs: none
287 * taken: none
288 * held on exit: none
289 * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
290 */
291static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
292 struct dlm_lock_resource *res,
293 struct dlm_lock *lock,
294 struct dlm_lockstatus *lksb,
295 int flags,
296 u8 owner)
297{
298 struct dlm_unlock_lock unlock;
299 int tmpret;
300 enum dlm_status ret;
301 int status = 0;
302 struct kvec vec[2];
303 size_t veclen = 1;
304
305 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
306
307 memset(&unlock, 0, sizeof(unlock));
308 unlock.node_idx = dlm->node_num;
309 unlock.flags = cpu_to_be32(flags);
310 unlock.cookie = lock->ml.cookie;
311 unlock.namelen = res->lockname.len;
312 memcpy(unlock.name, res->lockname.name, unlock.namelen);
313
314 vec[0].iov_len = sizeof(struct dlm_unlock_lock);
315 vec[0].iov_base = &unlock;
316
317 if (flags & LKM_PUT_LVB) {
318 /* extra data to send if we are updating lvb */
319 vec[1].iov_len = DLM_LVB_LEN;
320 vec[1].iov_base = lock->lksb->lvb;
321 veclen++;
322 }
323
324 tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
325 vec, veclen, owner, &status);
326 if (tmpret >= 0) {
327 // successfully sent and received
328 if (status == DLM_CANCELGRANT)
329 ret = DLM_NORMAL;
330 else if (status == DLM_FORWARD) {
331 mlog(0, "master was in-progress. retry\n");
332 ret = DLM_FORWARD;
333 } else
334 ret = status;
335 lksb->status = status;
336 } else {
337 mlog_errno(tmpret);
338 if (dlm_is_host_down(tmpret)) {
339 /* NOTE: this seems strange, but it is what we want.
340 * when the master goes down during a cancel or
341 * unlock, the recovery code completes the operation
342 * as if the master had not died, then passes the
343 * updated state to the recovery master. this thread
344 * just needs to finish out the operation and call
345 * the unlockast. */
346 ret = DLM_NORMAL;
347 } else {
348 /* something bad. this will BUG in ocfs2 */
349 ret = dlm_err_to_dlm_status(tmpret);
350 }
351 lksb->status = ret;
352 }
353
354 return ret;
355}
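/* Wire format note: vec[0] always carries the fixed-size
 * struct dlm_unlock_lock header; when LKM_PUT_LVB is set, vec[1]
 * appends the DLM_LVB_LEN lvb payload, which the receiving side
 * copies back into the local lksb in dlm_unlock_lock_handler()
 * below. */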
356
357/*
358 * locking:
359 * caller needs: none
360 * taken: takes and drops res->spinlock
361 * held on exit: none
362 * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
363 * return value from dlmunlock_master
364 */
365int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
366{
367 struct dlm_ctxt *dlm = data;
368 struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
369 struct dlm_lock_resource *res = NULL;
370 struct list_head *iter;
371 struct dlm_lock *lock = NULL;
372 enum dlm_status status = DLM_NORMAL;
373 int found = 0, i;
374 struct dlm_lockstatus *lksb = NULL;
375 int ignore;
376 u32 flags;
377 struct list_head *queue;
378
379 flags = be32_to_cpu(unlock->flags);
380
381 if (flags & LKM_GET_LVB) {
382 mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n");
383 return DLM_BADARGS;
384 }
385
386 if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
387 mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL "
388 "request!\n");
389 return DLM_BADARGS;
390 }
391
392 if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
393 mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
394 return DLM_IVBUFLEN;
395 }
396
397 if (!dlm_grab(dlm))
398 return DLM_REJECTED;
399
400 mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
401 "Domain %s not fully joined!\n", dlm->name);
402
403 mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
404
405 res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
406 if (!res) {
407		/* We assume here that a missing lock resource simply means
408 * it was migrated away and destroyed before the other
409 * node could detect it. */
410 mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
411 status = DLM_FORWARD;
412 goto not_found;
413 }
414
415	queue = &res->granted;
416 found = 0;
417 spin_lock(&res->spinlock);
418 if (res->state & DLM_LOCK_RES_RECOVERING) {
419 spin_unlock(&res->spinlock);
420 mlog(0, "returning DLM_RECOVERING\n");
421 status = DLM_RECOVERING;
422 goto leave;
423 }
424
425 if (res->state & DLM_LOCK_RES_MIGRATING) {
426 spin_unlock(&res->spinlock);
427 mlog(0, "returning DLM_MIGRATING\n");
428 status = DLM_MIGRATING;
429 goto leave;
430 }
431
432 if (res->owner != dlm->node_num) {
433 spin_unlock(&res->spinlock);
434 mlog(0, "returning DLM_FORWARD -- not master\n");
435 status = DLM_FORWARD;
436 goto leave;
437 }
438
439	for (i = 0; i < 3; i++) {
440 list_for_each(iter, queue) {
441 lock = list_entry(iter, struct dlm_lock, list);
442 if (lock->ml.cookie == unlock->cookie &&
443 lock->ml.node == unlock->node_idx) {
444 dlm_lock_get(lock);
445 found = 1;
446 break;
447 }
448 }
449 if (found)
450 break;
451 /* scan granted -> converting -> blocked queues */
452 queue++;
453 }
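	/* note: the queue++ above steps from res->granted to
	 * res->converting to res->blocked, which assumes those three
	 * list_heads are declared consecutively in struct
	 * dlm_lock_resource (defined in dlmcommon.h, outside this
	 * diff). */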
454 spin_unlock(&res->spinlock);
455 if (!found) {
456 status = DLM_IVLOCKID;
457 goto not_found;
458 }
459
460 /* lock was found on queue */
461 lksb = lock->lksb;
462 /* unlockast only called on originating node */
463 if (flags & LKM_PUT_LVB) {
464 lksb->flags |= DLM_LKSB_PUT_LVB;
465 memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
466 }
467
468 /* if this is in-progress, propagate the DLM_FORWARD
469 * all the way back out */
470 status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
471 if (status == DLM_FORWARD)
472 mlog(0, "lockres is in progress\n");
473
474 if (flags & LKM_PUT_LVB)
475 lksb->flags &= ~DLM_LKSB_PUT_LVB;
476
477 dlm_lockres_calc_usage(dlm, res);
478 dlm_kick_thread(dlm, res);
479
480not_found:
481 if (!found)
482 mlog(ML_ERROR, "failed to find lock to unlock! "
483 "cookie=%"MLFu64"\n",
484 unlock->cookie);
485 else {
486 /* send the lksb->status back to the other node */
487 status = lksb->status;
488 dlm_lock_put(lock);
489 }
490
491leave:
492 if (res)
493 dlm_lockres_put(res);
494
495 dlm_put(dlm);
496
497 return status;
498}
499
500
501static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
502 struct dlm_lock_resource *res,
503 struct dlm_lock *lock,
504 struct dlm_lockstatus *lksb,
505 int *actions)
506{
507 enum dlm_status status;
508
509 if (dlm_lock_on_list(&res->blocked, lock)) {
510 /* cancel this outright */
511 lksb->status = DLM_NORMAL;
512 status = DLM_NORMAL;
513 *actions = (DLM_UNLOCK_CALL_AST |
514 DLM_UNLOCK_REMOVE_LOCK);
515 } else if (dlm_lock_on_list(&res->converting, lock)) {
516 /* cancel the request, put back on granted */
517 lksb->status = DLM_NORMAL;
518 status = DLM_NORMAL;
519 *actions = (DLM_UNLOCK_CALL_AST |
520 DLM_UNLOCK_REMOVE_LOCK |
521 DLM_UNLOCK_REGRANT_LOCK |
522 DLM_UNLOCK_CLEAR_CONVERT_TYPE);
523 } else if (dlm_lock_on_list(&res->granted, lock)) {
524 /* too late, already granted. DLM_CANCELGRANT */
525 lksb->status = DLM_CANCELGRANT;
526 status = DLM_NORMAL;
527 *actions = DLM_UNLOCK_CALL_AST;
528 } else {
529 mlog(ML_ERROR, "lock to cancel is not on any list!\n");
530 lksb->status = DLM_IVLOCKID;
531 status = DLM_IVLOCKID;
532 *actions = 0;
533 }
534 return status;
535}
536
537static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
538 struct dlm_lock_resource *res,
539 struct dlm_lock *lock,
540 struct dlm_lockstatus *lksb,
541 int *actions)
542{
543 enum dlm_status status;
544
545 /* unlock request */
546 if (!dlm_lock_on_list(&res->granted, lock)) {
547 lksb->status = DLM_DENIED;
548 status = DLM_DENIED;
549 dlm_error(status);
550 *actions = 0;
551 } else {
552 /* unlock granted lock */
553 lksb->status = DLM_NORMAL;
554 status = DLM_NORMAL;
555 *actions = (DLM_UNLOCK_FREE_LOCK |
556 DLM_UNLOCK_CALL_AST |
557 DLM_UNLOCK_REMOVE_LOCK);
558 }
559 return status;
560}
561
562/* there seems to be no point in doing this async
563 * since (even for the remote case) there is really
564 * no work to queue up... so just do it and fire the
565 * unlockast by hand when done... */
566enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
567 int flags, dlm_astunlockfunc_t *unlockast, void *data)
568{
569 enum dlm_status status;
570 struct dlm_lock_resource *res;
571 struct dlm_lock *lock = NULL;
572 int call_ast, is_master;
573
574 mlog_entry_void();
575
576 if (!lksb) {
577 dlm_error(DLM_BADARGS);
578 return DLM_BADARGS;
579 }
580
581 if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
582 dlm_error(DLM_BADPARAM);
583 return DLM_BADPARAM;
584 }
585
586 if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
587 mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
588 flags &= ~LKM_VALBLK;
589 }
590
591 if (!lksb->lockid || !lksb->lockid->lockres) {
592 dlm_error(DLM_BADPARAM);
593 return DLM_BADPARAM;
594 }
595
596 lock = lksb->lockid;
597 BUG_ON(!lock);
598 dlm_lock_get(lock);
599
600 res = lock->lockres;
601 BUG_ON(!res);
602 dlm_lockres_get(res);
603retry:
604 call_ast = 0;
605 /* need to retry up here because owner may have changed */
606 mlog(0, "lock=%p res=%p\n", lock, res);
607
608 spin_lock(&res->spinlock);
609 is_master = (res->owner == dlm->node_num);
610 spin_unlock(&res->spinlock);
611
612 if (is_master) {
613 status = dlmunlock_master(dlm, res, lock, lksb, flags,
614 &call_ast);
615 mlog(0, "done calling dlmunlock_master: returned %d, "
616 "call_ast is %d\n", status, call_ast);
617 } else {
618 status = dlmunlock_remote(dlm, res, lock, lksb, flags,
619 &call_ast);
620 mlog(0, "done calling dlmunlock_remote: returned %d, "
621 "call_ast is %d\n", status, call_ast);
622 }
623
624 if (status == DLM_RECOVERING ||
625 status == DLM_MIGRATING ||
626 status == DLM_FORWARD) {
627 /* We want to go away for a tiny bit to allow recovery
628 * / migration to complete on this resource. I don't
629 * know of any wait queue we could sleep on as this
630 * may be happening on another node. Perhaps the
631 * proper solution is to queue up requests on the
632 * other end? */
633
634 /* do we want to yield(); ?? */
635 msleep(50);
636
637 mlog(0, "retrying unlock due to pending recovery/"
638 "migration/in-progress\n");
639 goto retry;
640 }
641
642 if (call_ast) {
643 mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status);
644 if (is_master) {
645 /* it is possible that there is one last bast
646 * pending. make sure it is flushed, then
647 * call the unlockast.
648			 * not an issue if this lock is mastered remotely,
649			 * since it has been removed from the
650 * lockres queues and cannot be found. */
651 dlm_kick_thread(dlm, NULL);
652 wait_event(dlm->ast_wq,
653 dlm_lock_basts_flushed(dlm, lock));
654 }
655 (*unlockast)(data, lksb->status);
656 }
657
658 if (status == DLM_NORMAL) {
659 mlog(0, "kicking the thread\n");
660 dlm_kick_thread(dlm, res);
661 } else
662 dlm_error(status);
663
664 dlm_lockres_calc_usage(dlm, res);
665 dlm_lockres_put(res);
666 dlm_lock_put(lock);
667
668 mlog(0, "returning status=%d!\n", status);
669 return status;
670}
671EXPORT_SYMBOL_GPL(dlmunlock);
672
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
new file mode 100644
index 000000000000..7ef2653f8f41
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmver.c
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/kernel.h>
28
29#include "dlmver.h"
30
31#define DLM_BUILD_VERSION "1.3.3"
32
33#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
34
35void dlm_print_version(void)
36{
37 printk(KERN_INFO "%s\n", VERSION_STR);
38}
39
40MODULE_DESCRIPTION(VERSION_STR);
41
42MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
new file mode 100644
index 000000000000..f674aee77a16
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef DLM_VER_H
27#define DLM_VER_H
28
29void dlm_print_version(void);
30
31#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
new file mode 100644
index 000000000000..e1fdd288796e
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -0,0 +1,658 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * userdlm.c
5 *
6 * Code which implements the kernel side of a minimal userspace
7 * interface to our DLM.
8 *
9 * Many of the functions here are pared down versions of dlmglue.c
10 * functions.
11 *
12 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public
16 * License as published by the Free Software Foundation; either
17 * version 2 of the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public
25 * License along with this program; if not, write to the
26 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 * Boston, MA 02111-1307, USA.
28 */
29
30#include <asm/signal.h>
31
32#include <linux/module.h>
33#include <linux/fs.h>
34#include <linux/types.h>
35#include <linux/crc32.h>
36
37
38#include "cluster/nodemanager.h"
39#include "cluster/heartbeat.h"
40#include "cluster/tcp.h"
41
42#include "dlmapi.h"
43
44#include "userdlm.h"
45
46#define MLOG_MASK_PREFIX ML_DLMFS
47#include "cluster/masklog.h"
48
49static inline int user_check_wait_flag(struct user_lock_res *lockres,
50 int flag)
51{
52 int ret;
53
54 spin_lock(&lockres->l_lock);
55 ret = lockres->l_flags & flag;
56 spin_unlock(&lockres->l_lock);
57
58 return ret;
59}
60
61static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
62
63{
64 wait_event(lockres->l_event,
65 !user_check_wait_flag(lockres, USER_LOCK_BUSY));
66}
67
68static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
69
70{
71 wait_event(lockres->l_event,
72 !user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
73}
74
75/* I heart container_of... */
76static inline struct dlm_ctxt *
77dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
78{
79 struct dlmfs_inode_private *ip;
80
81 ip = container_of(lockres,
82 struct dlmfs_inode_private,
83 ip_lockres);
84 return ip->ip_dlm;
85}
86
87static struct inode *
88user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
89{
90 struct dlmfs_inode_private *ip;
91
92 ip = container_of(lockres,
93 struct dlmfs_inode_private,
94 ip_lockres);
95 return &ip->ip_vfs_inode;
96}
97
98static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
99{
100 spin_lock(&lockres->l_lock);
101 lockres->l_flags &= ~USER_LOCK_BUSY;
102 spin_unlock(&lockres->l_lock);
103}
104
105#define user_log_dlm_error(_func, _stat, _lockres) do { \
106 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
107 "resource %s: %s\n", dlm_errname(_stat), _func, \
108 _lockres->l_name, dlm_errmsg(_stat)); \
109} while (0)
110
111/* WARNING: This function lives in a world where the only three lock
112 * levels are EX, PR, and NL. It *will* have to be adjusted when more
113 * lock types are added. */
114static inline int user_highest_compat_lock_level(int level)
115{
116 int new_level = LKM_EXMODE;
117
118 if (level == LKM_EXMODE)
119 new_level = LKM_NLMODE;
120 else if (level == LKM_PRMODE)
121 new_level = LKM_PRMODE;
122 return new_level;
123}
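/* In table form (derived from the code above):
 *	another node blocking at EX  -> we may hold at most NL
 *	another node blocking at PR  -> we may hold at most PR
 *	another node blocking at NL  -> EX remains compatible
 */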
124
125static void user_ast(void *opaque)
126{
127 struct user_lock_res *lockres = opaque;
128 struct dlm_lockstatus *lksb;
129
130 mlog(0, "AST fired for lockres %s\n", lockres->l_name);
131
132 spin_lock(&lockres->l_lock);
133
134 lksb = &(lockres->l_lksb);
135 if (lksb->status != DLM_NORMAL) {
136 mlog(ML_ERROR, "lksb status value of %u on lockres %s\n",
137 lksb->status, lockres->l_name);
138 spin_unlock(&lockres->l_lock);
139 return;
140 }
141
142 /* we're downconverting. */
143 if (lockres->l_requested < lockres->l_level) {
144 if (lockres->l_requested <=
145 user_highest_compat_lock_level(lockres->l_blocking)) {
146 lockres->l_blocking = LKM_NLMODE;
147 lockres->l_flags &= ~USER_LOCK_BLOCKED;
148 }
149 }
150
151 lockres->l_level = lockres->l_requested;
152 lockres->l_requested = LKM_IVMODE;
153 lockres->l_flags |= USER_LOCK_ATTACHED;
154 lockres->l_flags &= ~USER_LOCK_BUSY;
155
156 spin_unlock(&lockres->l_lock);
157
158 wake_up(&lockres->l_event);
159}
160
161static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
162{
163 struct inode *inode;
164 inode = user_dlm_inode_from_user_lockres(lockres);
165 if (!igrab(inode))
166 BUG();
167}
168
169static void user_dlm_unblock_lock(void *opaque);
170
171static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
172{
173 if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
174 user_dlm_grab_inode_ref(lockres);
175
176 INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
177 lockres);
178
179 queue_work(user_dlm_worker, &lockres->l_work);
180 lockres->l_flags |= USER_LOCK_QUEUED;
181 }
182}
183
184static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
185{
186 int queue = 0;
187
188 if (!(lockres->l_flags & USER_LOCK_BLOCKED))
189 return;
190
191 switch (lockres->l_blocking) {
192 case LKM_EXMODE:
193 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
194 queue = 1;
195 break;
196 case LKM_PRMODE:
197 if (!lockres->l_ex_holders)
198 queue = 1;
199 break;
200 default:
201 BUG();
202 }
203
204 if (queue)
205 __user_dlm_queue_lockres(lockres);
206}
207
208static void user_bast(void *opaque, int level)
209{
210 struct user_lock_res *lockres = opaque;
211
212 mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n",
213 lockres->l_name, level);
214
215 spin_lock(&lockres->l_lock);
216 lockres->l_flags |= USER_LOCK_BLOCKED;
217 if (level > lockres->l_blocking)
218 lockres->l_blocking = level;
219
220 __user_dlm_queue_lockres(lockres);
221 spin_unlock(&lockres->l_lock);
222
223 wake_up(&lockres->l_event);
224}
225
226static void user_unlock_ast(void *opaque, enum dlm_status status)
227{
228 struct user_lock_res *lockres = opaque;
229
230 mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
231
232 if (status != DLM_NORMAL)
233 mlog(ML_ERROR, "Dlm returns status %d\n", status);
234
235 spin_lock(&lockres->l_lock);
236 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
237 lockres->l_level = LKM_IVMODE;
238 else {
239 lockres->l_requested = LKM_IVMODE; /* cancel an
240 * upconvert
241 * request. */
242 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
243 /* we want the unblock thread to look at it again
244 * now. */
245 __user_dlm_queue_lockres(lockres);
246 }
247
248 lockres->l_flags &= ~USER_LOCK_BUSY;
249 spin_unlock(&lockres->l_lock);
250
251 wake_up(&lockres->l_event);
252}
253
254static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
255{
256 struct inode *inode;
257 inode = user_dlm_inode_from_user_lockres(lockres);
258 iput(inode);
259}
260
261static void user_dlm_unblock_lock(void *opaque)
262{
263 int new_level, status;
264 struct user_lock_res *lockres = (struct user_lock_res *) opaque;
265 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
266
267 mlog(0, "processing lockres %s\n", lockres->l_name);
268
269 spin_lock(&lockres->l_lock);
270
271 BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
272 BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
273
274 /* notice that we don't clear USER_LOCK_BLOCKED here. That's
275 * for user_ast to do. */
276 lockres->l_flags &= ~USER_LOCK_QUEUED;
277
278 if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
279 mlog(0, "lock is in teardown so we do nothing\n");
280 spin_unlock(&lockres->l_lock);
281 goto drop_ref;
282 }
283
284 if (lockres->l_flags & USER_LOCK_BUSY) {
285 mlog(0, "BUSY flag detected...\n");
286 if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
287 spin_unlock(&lockres->l_lock);
288 goto drop_ref;
289 }
290
291 lockres->l_flags |= USER_LOCK_IN_CANCEL;
292 spin_unlock(&lockres->l_lock);
293
294 status = dlmunlock(dlm,
295 &lockres->l_lksb,
296 LKM_CANCEL,
297 user_unlock_ast,
298 lockres);
299 if (status == DLM_CANCELGRANT) {
300 /* If we got this, then the ast was fired
301			 * before we could cancel. We clean up our
302 * state, and restart the function. */
303 spin_lock(&lockres->l_lock);
304 lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
305 spin_unlock(&lockres->l_lock);
306 } else if (status != DLM_NORMAL)
307 user_log_dlm_error("dlmunlock", status, lockres);
308 goto drop_ref;
309 }
310
311 /* If there are still incompat holders, we can exit safely
312 * without worrying about re-queueing this lock as that will
313 * happen on the last call to user_cluster_unlock. */
314 if ((lockres->l_blocking == LKM_EXMODE)
315 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
316 spin_unlock(&lockres->l_lock);
317 mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
318 lockres->l_ro_holders, lockres->l_ex_holders);
319 goto drop_ref;
320 }
321
322 if ((lockres->l_blocking == LKM_PRMODE)
323 && lockres->l_ex_holders) {
324 spin_unlock(&lockres->l_lock);
325 mlog(0, "can't downconvert for pr: ex = %u\n",
326 lockres->l_ex_holders);
327 goto drop_ref;
328 }
329
330 /* yay, we can downconvert now. */
331 new_level = user_highest_compat_lock_level(lockres->l_blocking);
332 lockres->l_requested = new_level;
333 lockres->l_flags |= USER_LOCK_BUSY;
334 mlog(0, "Downconvert lock from %d to %d\n",
335 lockres->l_level, new_level);
336 spin_unlock(&lockres->l_lock);
337
338 /* need lock downconvert request now... */
339 status = dlmlock(dlm,
340 new_level,
341 &lockres->l_lksb,
342 LKM_CONVERT|LKM_VALBLK,
343 lockres->l_name,
344 user_ast,
345 lockres,
346 user_bast);
347 if (status != DLM_NORMAL) {
348 user_log_dlm_error("dlmlock", status, lockres);
349 user_recover_from_dlm_error(lockres);
350 }
351
352drop_ref:
353 user_dlm_drop_inode_ref(lockres);
354}
355
356static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
357 int level)
358{
359	switch (level) {
360 case LKM_EXMODE:
361 lockres->l_ex_holders++;
362 break;
363 case LKM_PRMODE:
364 lockres->l_ro_holders++;
365 break;
366 default:
367 BUG();
368 }
369}
370
371/* predict what lock level we'll be dropping down to on behalf
372 * of another node, and return true if the currently wanted
373 * level will be compatible with it. */
374static inline int
375user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
376 int wanted)
377{
378 BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
379
380 return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
381}
382
383int user_dlm_cluster_lock(struct user_lock_res *lockres,
384 int level,
385 int lkm_flags)
386{
387 int status, local_flags;
388 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
389
390 if (level != LKM_EXMODE &&
391 level != LKM_PRMODE) {
392 mlog(ML_ERROR, "lockres %s: invalid request!\n",
393 lockres->l_name);
394 status = -EINVAL;
395 goto bail;
396 }
397
398 mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n",
399 lockres->l_name,
400 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
401 lkm_flags);
402
403again:
404 if (signal_pending(current)) {
405 status = -ERESTARTSYS;
406 goto bail;
407 }
408
409 spin_lock(&lockres->l_lock);
410
411 /* We only compare against the currently granted level
412 * here. If the lock is blocked waiting on a downconvert,
413 * we'll get caught below. */
414 if ((lockres->l_flags & USER_LOCK_BUSY) &&
415 (level > lockres->l_level)) {
416 /* is someone sitting in dlm_lock? If so, wait on
417 * them. */
418 spin_unlock(&lockres->l_lock);
419
420 user_wait_on_busy_lock(lockres);
421 goto again;
422 }
423
424 if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
425 (!user_may_continue_on_blocked_lock(lockres, level))) {
426		/* the lock is currently blocked on behalf of
427 * another node */
428 spin_unlock(&lockres->l_lock);
429
430 user_wait_on_blocked_lock(lockres);
431 goto again;
432 }
433
434 if (level > lockres->l_level) {
435 local_flags = lkm_flags | LKM_VALBLK;
436 if (lockres->l_level != LKM_IVMODE)
437 local_flags |= LKM_CONVERT;
438
439 lockres->l_requested = level;
440 lockres->l_flags |= USER_LOCK_BUSY;
441 spin_unlock(&lockres->l_lock);
442
443 BUG_ON(level == LKM_IVMODE);
444 BUG_ON(level == LKM_NLMODE);
445
446 mlog(0, "lock %s, get lock from %d to level = %d\n",
447 lockres->l_name, lockres->l_level, level);
448
449 /* call dlm_lock to upgrade lock now */
450 status = dlmlock(dlm,
451 level,
452 &lockres->l_lksb,
453 local_flags,
454 lockres->l_name,
455 user_ast,
456 lockres,
457 user_bast);
458 if (status != DLM_NORMAL) {
459 if ((lkm_flags & LKM_NOQUEUE) &&
460 (status == DLM_NOTQUEUED))
461 status = -EAGAIN;
462 else {
463 user_log_dlm_error("dlmlock", status, lockres);
464 status = -EINVAL;
465 }
466 user_recover_from_dlm_error(lockres);
467 goto bail;
468 }
469
470		mlog(0, "lock %s, successful return from dlmlock\n",
471 lockres->l_name);
472
473 user_wait_on_busy_lock(lockres);
474 goto again;
475 }
476
477 user_dlm_inc_holders(lockres, level);
478 spin_unlock(&lockres->l_lock);
479
480 mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
481 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
482
483 status = 0;
484bail:
485 return status;
486}
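/* A minimal sketch of the intended calling pattern (hypothetical
 * caller; the real callers live in the dlmfs file code, outside this
 * diff). Error paths are elided:
 *
 *	status = user_dlm_cluster_lock(lockres, LKM_PRMODE, 0);
 *	if (status)
 *		return status;
 *	... read shared state, e.g. via user_dlm_read_lvb() ...
 *	user_dlm_cluster_unlock(lockres, LKM_PRMODE);
 */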
487
488static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
489 int level)
490{
491	switch (level) {
492 case LKM_EXMODE:
493 BUG_ON(!lockres->l_ex_holders);
494 lockres->l_ex_holders--;
495 break;
496 case LKM_PRMODE:
497 BUG_ON(!lockres->l_ro_holders);
498 lockres->l_ro_holders--;
499 break;
500 default:
501 BUG();
502 }
503}
504
505void user_dlm_cluster_unlock(struct user_lock_res *lockres,
506 int level)
507{
508 if (level != LKM_EXMODE &&
509 level != LKM_PRMODE) {
510 mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name);
511 return;
512 }
513
514 mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
515 (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
516
517 spin_lock(&lockres->l_lock);
518 user_dlm_dec_holders(lockres, level);
519 __user_dlm_cond_queue_lockres(lockres);
520 spin_unlock(&lockres->l_lock);
521}
522
523void user_dlm_write_lvb(struct inode *inode,
524 const char *val,
525 unsigned int len)
526{
527 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
528 char *lvb = lockres->l_lksb.lvb;
529
530 BUG_ON(len > DLM_LVB_LEN);
531
532 spin_lock(&lockres->l_lock);
533
534 BUG_ON(lockres->l_level < LKM_EXMODE);
535 memcpy(lvb, val, len);
536
537 spin_unlock(&lockres->l_lock);
538}
539
540void user_dlm_read_lvb(struct inode *inode,
541 char *val,
542 unsigned int len)
543{
544 struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
545 char *lvb = lockres->l_lksb.lvb;
546
547 BUG_ON(len > DLM_LVB_LEN);
548
549 spin_lock(&lockres->l_lock);
550
551 BUG_ON(lockres->l_level < LKM_PRMODE);
552 memcpy(val, lvb, len);
553
554 spin_unlock(&lockres->l_lock);
555}
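/* Note the asymmetry enforced by the BUG_ONs above: writing the lvb
 * requires the lock to be held at LKM_EXMODE, while reading only
 * requires LKM_PRMODE or better. */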
556
557void user_dlm_lock_res_init(struct user_lock_res *lockres,
558 struct dentry *dentry)
559{
560 memset(lockres, 0, sizeof(*lockres));
561
562 spin_lock_init(&lockres->l_lock);
563 init_waitqueue_head(&lockres->l_event);
564 lockres->l_level = LKM_IVMODE;
565 lockres->l_requested = LKM_IVMODE;
566 lockres->l_blocking = LKM_IVMODE;
567
568 /* should have been checked before getting here. */
569 BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
570
571 memcpy(lockres->l_name,
572 dentry->d_name.name,
573 dentry->d_name.len);
574}
575
576int user_dlm_destroy_lock(struct user_lock_res *lockres)
577{
578 int status = -EBUSY;
579 struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
580
581 mlog(0, "asked to destroy %s\n", lockres->l_name);
582
583 spin_lock(&lockres->l_lock);
584 while (lockres->l_flags & USER_LOCK_BUSY) {
585 spin_unlock(&lockres->l_lock);
586
587 mlog(0, "lock %s is busy\n", lockres->l_name);
588
589 user_wait_on_busy_lock(lockres);
590
591 spin_lock(&lockres->l_lock);
592 }
593
594 if (lockres->l_ro_holders || lockres->l_ex_holders) {
595 spin_unlock(&lockres->l_lock);
596 mlog(0, "lock %s has holders\n", lockres->l_name);
597 goto bail;
598 }
599
600 status = 0;
601 if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
602 spin_unlock(&lockres->l_lock);
603 mlog(0, "lock %s is not attached\n", lockres->l_name);
604 goto bail;
605 }
606
607 lockres->l_flags &= ~USER_LOCK_ATTACHED;
608 lockres->l_flags |= USER_LOCK_BUSY;
609 lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
610 spin_unlock(&lockres->l_lock);
611
612 mlog(0, "unlocking lockres %s\n", lockres->l_name);
613 status = dlmunlock(dlm,
614 &lockres->l_lksb,
615 LKM_VALBLK,
616 user_unlock_ast,
617 lockres);
618 if (status != DLM_NORMAL) {
619 user_log_dlm_error("dlmunlock", status, lockres);
620 status = -EINVAL;
621 goto bail;
622 }
623
624 user_wait_on_busy_lock(lockres);
625
626 status = 0;
627bail:
628 return status;
629}
630
631struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
632{
633 struct dlm_ctxt *dlm;
634 u32 dlm_key;
635 char *domain;
636
637 domain = kmalloc(name->len + 1, GFP_KERNEL);
638 if (!domain) {
639 mlog_errno(-ENOMEM);
640 return ERR_PTR(-ENOMEM);
641 }
642
643 dlm_key = crc32_le(0, name->name, name->len);
644
645 snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
646
647 dlm = dlm_register_domain(domain, dlm_key);
648 if (IS_ERR(dlm))
649 mlog_errno(PTR_ERR(dlm));
650
651 kfree(domain);
652 return dlm;
653}
654
655void user_dlm_unregister_context(struct dlm_ctxt *dlm)
656{
657 dlm_unregister_domain(dlm);
658}
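/* Sketch of the expected register/unregister pairing (hypothetical
 * caller; the real one is the dlmfs mount path, outside this diff):
 *
 *	dlm = user_dlm_register_context(&dentry->d_name);
 *	if (IS_ERR(dlm))
 *		return PTR_ERR(dlm);
 *	...
 *	user_dlm_unregister_context(dlm);
 */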
diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h
new file mode 100644
index 000000000000..04178bc40b76
--- /dev/null
+++ b/fs/ocfs2/dlm/userdlm.h
@@ -0,0 +1,111 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * userdlm.h
5 *
6 * Userspace dlm defines
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26
27#ifndef USERDLM_H
28#define USERDLM_H
29
30#include <linux/module.h>
31#include <linux/fs.h>
32#include <linux/types.h>
33#include <linux/workqueue.h>
34
35/* user_lock_res->l_flags flags. */
36#define USER_LOCK_ATTACHED (0x00000001) /* have we initialized
37 * the lvb */
38#define USER_LOCK_BUSY (0x00000002) /* we are currently in
39 * dlm_lock */
40#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to
41 * downconvert*/
42#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently
43 * destroying this
44 * lock. */
45#define USER_LOCK_QUEUED (0x00000010) /* lock is on the
46 * workqueue */
47#define USER_LOCK_IN_CANCEL (0x00000020)
48
49struct user_lock_res {
50 spinlock_t l_lock;
51
52 int l_flags;
53
54#define USER_DLM_LOCK_ID_MAX_LEN 32
55 char l_name[USER_DLM_LOCK_ID_MAX_LEN];
56 int l_level;
57 unsigned int l_ro_holders;
58 unsigned int l_ex_holders;
59 struct dlm_lockstatus l_lksb;
60
61 int l_requested;
62 int l_blocking;
63
64 wait_queue_head_t l_event;
65
66 struct work_struct l_work;
67};
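/* Semantics of the level fields (see userdlm.c): l_level is the mode
 * currently granted by the dlm, l_requested is the mode passed to a
 * pending dlmlock() call (LKM_IVMODE when idle), and l_blocking is
 * the highest mode another node is waiting for, as recorded by
 * user_bast(). */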
68
69extern struct workqueue_struct *user_dlm_worker;
70
71void user_dlm_lock_res_init(struct user_lock_res *lockres,
72 struct dentry *dentry);
73int user_dlm_destroy_lock(struct user_lock_res *lockres);
74int user_dlm_cluster_lock(struct user_lock_res *lockres,
75 int level,
76 int lkm_flags);
77void user_dlm_cluster_unlock(struct user_lock_res *lockres,
78 int level);
79void user_dlm_write_lvb(struct inode *inode,
80 const char *val,
81 unsigned int len);
82void user_dlm_read_lvb(struct inode *inode,
83 char *val,
84 unsigned int len);
85struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
86void user_dlm_unregister_context(struct dlm_ctxt *dlm);
87
88struct dlmfs_inode_private {
89 struct dlm_ctxt *ip_dlm;
90
91 struct user_lock_res ip_lockres; /* unused for directories. */
92 struct inode *ip_parent;
93
94 struct inode ip_vfs_inode;
95};
96
97static inline struct dlmfs_inode_private *
98DLMFS_I(struct inode *inode)
99{
100 return container_of(inode,
101 struct dlmfs_inode_private,
102 ip_vfs_inode);
103}
104
105struct dlmfs_filp_private {
106 int fp_lock_level;
107};
108
109#define DLMFS_MAGIC 0x76a9f425
110
111#endif /* USERDLM_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
new file mode 100644
index 000000000000..e971ec2f8407
--- /dev/null
+++ b/fs/ocfs2/dlmglue.c
@@ -0,0 +1,2904 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.c
5 *
6 * Code which implements an OCFS2 specific interface to our DLM.
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/mm.h>
30#include <linux/smp_lock.h>
31#include <linux/crc32.h>
32#include <linux/kthread.h>
33#include <linux/pagemap.h>
34#include <linux/debugfs.h>
35#include <linux/seq_file.h>
36
37#include <cluster/heartbeat.h>
38#include <cluster/nodemanager.h>
39#include <cluster/tcp.h>
40
41#include <dlm/dlmapi.h>
42
43#define MLOG_MASK_PREFIX ML_DLM_GLUE
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "dlmglue.h"
50#include "extent_map.h"
51#include "heartbeat.h"
52#include "inode.h"
53#include "journal.h"
54#include "slot_map.h"
55#include "super.h"
56#include "uptodate.h"
57#include "vote.h"
58
59#include "buffer_head_io.h"
60
61struct ocfs2_mask_waiter {
62 struct list_head mw_item;
63 int mw_status;
64 struct completion mw_complete;
65 unsigned long mw_mask;
66 unsigned long mw_goal;
67};
68
69static void ocfs2_inode_ast_func(void *opaque);
70static void ocfs2_inode_bast_func(void *opaque,
71 int level);
72static void ocfs2_super_ast_func(void *opaque);
73static void ocfs2_super_bast_func(void *opaque,
74 int level);
75static void ocfs2_rename_ast_func(void *opaque);
76static void ocfs2_rename_bast_func(void *opaque,
77 int level);
78
79/* so far, all locks have gotten along with the same unlock ast */
80static void ocfs2_unlock_ast_func(void *opaque,
81 enum dlm_status status);
82static int ocfs2_do_unblock_meta(struct inode *inode,
83 int *requeue);
84static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
85 int *requeue);
86static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
87 int *requeue);
88static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
89 int *requeue);
90static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
91 int *requeue);
92typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
93static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
94 struct ocfs2_lock_res *lockres,
95 int *requeue,
96 ocfs2_convert_worker_t *worker);
97
98struct ocfs2_lock_res_ops {
99 void (*ast)(void *);
100 void (*bast)(void *, int);
101 void (*unlock_ast)(void *, enum dlm_status);
102 int (*unblock)(struct ocfs2_lock_res *, int *);
103};
104
105static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
106 .ast = ocfs2_inode_ast_func,
107 .bast = ocfs2_inode_bast_func,
108 .unlock_ast = ocfs2_unlock_ast_func,
109 .unblock = ocfs2_unblock_inode_lock,
110};
111
112static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
113 .ast = ocfs2_inode_ast_func,
114 .bast = ocfs2_inode_bast_func,
115 .unlock_ast = ocfs2_unlock_ast_func,
116 .unblock = ocfs2_unblock_meta,
117};
118
119static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
120 int blocking);
121
122static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
123 .ast = ocfs2_inode_ast_func,
124 .bast = ocfs2_inode_bast_func,
125 .unlock_ast = ocfs2_unlock_ast_func,
126 .unblock = ocfs2_unblock_data,
127};
128
129static struct ocfs2_lock_res_ops ocfs2_super_lops = {
130 .ast = ocfs2_super_ast_func,
131 .bast = ocfs2_super_bast_func,
132 .unlock_ast = ocfs2_unlock_ast_func,
133 .unblock = ocfs2_unblock_osb_lock,
134};
135
136static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
137 .ast = ocfs2_rename_ast_func,
138 .bast = ocfs2_rename_bast_func,
139 .unlock_ast = ocfs2_unlock_ast_func,
140 .unblock = ocfs2_unblock_osb_lock,
141};
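/* Each lock type plugs its own strategy into the same vtable: the
 * three inode lock types share the inode ast/bast pair but differ in
 * their unblock strategy (rw, meta, data), while the super and rename
 * locks both fall back to ocfs2_unblock_osb_lock. */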
142
143static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
144{
145 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
146 lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
147 lockres->l_type == OCFS2_LOCK_TYPE_RW;
148}
149
150static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
151{
152 return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
153}
154
155static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
156{
157 return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
158}
159
160static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
161{
162 BUG_ON(!ocfs2_is_super_lock(lockres)
163 && !ocfs2_is_rename_lock(lockres));
164
165 return (struct ocfs2_super *) lockres->l_priv;
166}
167
168static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
169{
170 BUG_ON(!ocfs2_is_inode_lock(lockres));
171
172 return (struct inode *) lockres->l_priv;
173}
174
175static int ocfs2_lock_create(struct ocfs2_super *osb,
176 struct ocfs2_lock_res *lockres,
177 int level,
178 int dlm_flags);
179static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
180 int wanted);
181static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
182 struct ocfs2_lock_res *lockres,
183 int level);
184static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
185static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
186static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
187static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
188static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
189 struct ocfs2_lock_res *lockres);
190static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
191 int convert);
192#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \
193 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \
194 "resource %s: %s\n", dlm_errname(_stat), _func, \
195 _lockres->l_name, dlm_errmsg(_stat)); \
196} while (0)
197static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
198 struct ocfs2_lock_res *lockres);
199static int ocfs2_meta_lock_update(struct inode *inode,
200 struct buffer_head **bh);
201static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
202static inline int ocfs2_highest_compat_lock_level(int level);
203static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
204 struct ocfs2_lock_res *lockres,
205 int new_level);
206
207static char *ocfs2_lock_type_strings[] = {
208 [OCFS2_LOCK_TYPE_META] = "Meta",
209 [OCFS2_LOCK_TYPE_DATA] = "Data",
210 [OCFS2_LOCK_TYPE_SUPER] = "Super",
211 [OCFS2_LOCK_TYPE_RENAME] = "Rename",
212	/* Need to differentiate from [R]ename.. serializing writes is the
213 * important job it does, anyway. */
214 [OCFS2_LOCK_TYPE_RW] = "Write/Read",
215};
216
217static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
218{
219 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
220 return ocfs2_lock_type_strings[type];
221}
222
223static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
224 u64 blkno,
225 u32 generation,
226 char *name)
227{
228 int len;
229
230 mlog_entry_void();
231
232 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
233
234 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x",
235 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno,
236 generation);
237
238 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
239
240 mlog(0, "built lock resource with name: %s\n", name);
241
242 mlog_exit_void();
243}
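/*
 * A worked sketch of the name layout built above (values hypothetical;
 * it assumes ocfs2_lock_type_char() maps OCFS2_LOCK_TYPE_META to 'M'
 * and that OCFS2_LOCK_ID_PAD is a run of '0' padding). A meta lock on
 * block 0xabcd with generation 0x1234 would come out roughly as:
 *
 *	M000000000000000000abcd00001234
 *
 * i.e. one type character, the pad, sixteen hex digits of blkno and
 * eight of generation, filling exactly OCFS2_LOCK_ID_MAX_LEN - 1
 * characters, which is what the BUG_ON() on len checks.
 */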
244
245static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
246
247static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
248 struct ocfs2_dlm_debug *dlm_debug)
249{
250 mlog(0, "Add tracking for lockres %s\n", res->l_name);
251
252 spin_lock(&ocfs2_dlm_tracking_lock);
253 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
254 spin_unlock(&ocfs2_dlm_tracking_lock);
255}
256
257static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
258{
259 spin_lock(&ocfs2_dlm_tracking_lock);
260 if (!list_empty(&res->l_debug_list))
261 list_del_init(&res->l_debug_list);
262 spin_unlock(&ocfs2_dlm_tracking_lock);
263}
264
265static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
266 struct ocfs2_lock_res *res,
267 enum ocfs2_lock_type type,
268 u64 blkno,
269 u32 generation,
270 struct ocfs2_lock_res_ops *ops,
271 void *priv)
272{
273 ocfs2_build_lock_name(type, blkno, generation, res->l_name);
274
275 res->l_type = type;
276 res->l_ops = ops;
277 res->l_priv = priv;
278
279 res->l_level = LKM_IVMODE;
280 res->l_requested = LKM_IVMODE;
281 res->l_blocking = LKM_IVMODE;
282 res->l_action = OCFS2_AST_INVALID;
283 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
284
285 res->l_flags = OCFS2_LOCK_INITIALIZED;
286
287 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
288}
289
290void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
291{
292 /* This also clears out the lock status block */
293 memset(res, 0, sizeof(struct ocfs2_lock_res));
294 spin_lock_init(&res->l_lock);
295 init_waitqueue_head(&res->l_event);
296 INIT_LIST_HEAD(&res->l_blocked_list);
297 INIT_LIST_HEAD(&res->l_mask_waiters);
298}
299
300void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
301 enum ocfs2_lock_type type,
302 struct inode *inode)
303{
304 struct ocfs2_lock_res_ops *ops;
305
306 switch(type) {
307 case OCFS2_LOCK_TYPE_RW:
308 ops = &ocfs2_inode_rw_lops;
309 break;
310 case OCFS2_LOCK_TYPE_META:
311 ops = &ocfs2_inode_meta_lops;
312 break;
313 case OCFS2_LOCK_TYPE_DATA:
314 ops = &ocfs2_inode_data_lops;
315 break;
316 default:
317 mlog_bug_on_msg(1, "type: %d\n", type);
318 ops = NULL; /* thanks, gcc */
319 break;
320	}
321
322 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
323 OCFS2_I(inode)->ip_blkno,
324 inode->i_generation, ops, inode);
325}
326
327static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
328 struct ocfs2_super *osb)
329{
330 /* Superblock lockres doesn't come from a slab so we call init
331 * once on it manually. */
332 ocfs2_lock_res_init_once(res);
333 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
334 OCFS2_SUPER_BLOCK_BLKNO, 0,
335 &ocfs2_super_lops, osb);
336}
337
338static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
339 struct ocfs2_super *osb)
340{
341 /* Rename lockres doesn't come from a slab so we call init
342 * once on it manually. */
343 ocfs2_lock_res_init_once(res);
344 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
345 &ocfs2_rename_lops, osb);
346}
347
348void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
349{
350 mlog_entry_void();
351
352 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
353 return;
354
355 ocfs2_remove_lockres_tracking(res);
356
357 mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
358 "Lockres %s is on the blocked list\n",
359 res->l_name);
360 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
361 "Lockres %s has mask waiters pending\n",
362 res->l_name);
363 mlog_bug_on_msg(spin_is_locked(&res->l_lock),
364 "Lockres %s is locked\n",
365 res->l_name);
366 mlog_bug_on_msg(res->l_ro_holders,
367 "Lockres %s has %u ro holders\n",
368 res->l_name, res->l_ro_holders);
369 mlog_bug_on_msg(res->l_ex_holders,
370 "Lockres %s has %u ex holders\n",
371 res->l_name, res->l_ex_holders);
372
373 /* Need to clear out the lock status block for the dlm */
374 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
375
376 res->l_flags = 0UL;
377 mlog_exit_void();
378}
379
380static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
381 int level)
382{
383 mlog_entry_void();
384
385 BUG_ON(!lockres);
386
387 switch(level) {
388 case LKM_EXMODE:
389 lockres->l_ex_holders++;
390 break;
391 case LKM_PRMODE:
392 lockres->l_ro_holders++;
393 break;
394 default:
395 BUG();
396 }
397
398 mlog_exit_void();
399}
400
401static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
402 int level)
403{
404 mlog_entry_void();
405
406 BUG_ON(!lockres);
407
408 switch(level) {
409 case LKM_EXMODE:
410 BUG_ON(!lockres->l_ex_holders);
411 lockres->l_ex_holders--;
412 break;
413 case LKM_PRMODE:
414 BUG_ON(!lockres->l_ro_holders);
415 lockres->l_ro_holders--;
416 break;
417 default:
418 BUG();
419 }
420 mlog_exit_void();
421}
422
423/* WARNING: This function lives in a world where the only three lock
424 * levels are EX, PR, and NL. It *will* have to be adjusted when more
425 * lock types are added. */
426static inline int ocfs2_highest_compat_lock_level(int level)
427{
428 int new_level = LKM_EXMODE;
429
430 if (level == LKM_EXMODE)
431 new_level = LKM_NLMODE;
432 else if (level == LKM_PRMODE)
433 new_level = LKM_PRMODE;
434 return new_level;
435}
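/*
 * The table the helper above encodes, assuming the usual DLM semantics
 * (EX excludes everyone, PR tolerates other readers, NL conflicts with
 * nothing):
 *
 *	level held here		highest level another node may hold
 *	LKM_EXMODE		LKM_NLMODE
 *	LKM_PRMODE		LKM_PRMODE
 *	LKM_NLMODE		LKM_EXMODE
 */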
436
437static void lockres_set_flags(struct ocfs2_lock_res *lockres,
438 unsigned long newflags)
439{
440 struct list_head *pos, *tmp;
441 struct ocfs2_mask_waiter *mw;
442
443 assert_spin_locked(&lockres->l_lock);
444
445 lockres->l_flags = newflags;
446
447 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
448 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
449 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
450 continue;
451
452 list_del_init(&mw->mw_item);
453 mw->mw_status = 0;
454 complete(&mw->mw_complete);
455 }
456}
457static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
458{
459 lockres_set_flags(lockres, lockres->l_flags | or);
460}
461static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
462 unsigned long clear)
463{
464 lockres_set_flags(lockres, lockres->l_flags & ~clear);
465}
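/*
 * All three flag helpers must be called under lockres->l_lock; a
 * minimal usage sketch (hypothetical caller, not code from this file):
 *
 *	spin_lock_irqsave(&lockres->l_lock, flags);
 *	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
 *	spin_unlock_irqrestore(&lockres->l_lock, flags);
 *
 * Any mask waiter whose (l_flags & mw_mask) == mw_goal condition is
 * satisfied by the new flag value is completed as a side effect.
 */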
466
467static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
468{
469 mlog_entry_void();
470
471 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
472 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
473 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
474 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
475
476 lockres->l_level = lockres->l_requested;
477 if (lockres->l_level <=
478 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
479 lockres->l_blocking = LKM_NLMODE;
480 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
481 }
482 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
483
484 mlog_exit_void();
485}
486
487static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
488{
489 mlog_entry_void();
490
491 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
492 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
493
494	/* A convert from RO to EX doesn't really need anything as our
495	 * information is already up to date. A convert from NL to
496	 * *anything*, however, should mark ourselves as needing an
497	 * update. */
498 if (lockres->l_level == LKM_NLMODE)
499 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
500
501 lockres->l_level = lockres->l_requested;
502 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
503
504 mlog_exit_void();
505}
506
507static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
508{
509 mlog_entry_void();
510
511	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
512 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
513
514 if (lockres->l_requested > LKM_NLMODE &&
515 !(lockres->l_flags & OCFS2_LOCK_LOCAL))
516 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
517
518 lockres->l_level = lockres->l_requested;
519 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
520 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
521
522 mlog_exit_void();
523}
524
525static void ocfs2_inode_ast_func(void *opaque)
526{
527 struct ocfs2_lock_res *lockres = opaque;
528 struct inode *inode;
529 struct dlm_lockstatus *lksb;
530 unsigned long flags;
531
532 mlog_entry_void();
533
534 inode = ocfs2_lock_res_inode(lockres);
535
536 mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n",
537 OCFS2_I(inode)->ip_blkno, lockres->l_action,
538 ocfs2_lock_type_string(lockres->l_type));
539
540 BUG_ON(!ocfs2_is_inode_lock(lockres));
541
542 spin_lock_irqsave(&lockres->l_lock, flags);
543
544 lksb = &(lockres->l_lksb);
545 if (lksb->status != DLM_NORMAL) {
546 mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
547 "on inode %"MLFu64"\n", lksb->status,
548 OCFS2_I(inode)->ip_blkno);
549 spin_unlock_irqrestore(&lockres->l_lock, flags);
550 mlog_exit_void();
551 return;
552 }
553
554 switch(lockres->l_action) {
555 case OCFS2_AST_ATTACH:
556 ocfs2_generic_handle_attach_action(lockres);
557 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
558 break;
559 case OCFS2_AST_CONVERT:
560 ocfs2_generic_handle_convert_action(lockres);
561 break;
562 case OCFS2_AST_DOWNCONVERT:
563 ocfs2_generic_handle_downconvert_action(lockres);
564 break;
565 default:
566 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
567 "lockres flags = 0x%lx, unlock action: %u\n",
568 lockres->l_name, lockres->l_action, lockres->l_flags,
569 lockres->l_unlock_action);
570
571 BUG();
572 }
573
574	/* Data and rw locking ignore the refresh flag for now. */
575 if (lockres->l_type != OCFS2_LOCK_TYPE_META)
576 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
577
578 /* set it to something invalid so if we get called again we
579 * can catch it. */
580 lockres->l_action = OCFS2_AST_INVALID;
581 spin_unlock_irqrestore(&lockres->l_lock, flags);
582 wake_up(&lockres->l_event);
583
584 mlog_exit_void();
585}
586
587static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
588 int level)
589{
590 int needs_downconvert = 0;
591 mlog_entry_void();
592
593 assert_spin_locked(&lockres->l_lock);
594
595 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
596
597 if (level > lockres->l_blocking) {
598 /* only schedule a downconvert if we haven't already scheduled
599 * one that goes low enough to satisfy the level we're
600	 * blocking. This also catches the case where we get
601	 * duplicate BASTs. */
602 if (ocfs2_highest_compat_lock_level(level) <
603 ocfs2_highest_compat_lock_level(lockres->l_blocking))
604 needs_downconvert = 1;
605
606 lockres->l_blocking = level;
607 }
608
609 mlog_exit(needs_downconvert);
610 return needs_downconvert;
611}
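/*
 * Example walk through the duplicate-BAST handling above (the sequence
 * is hypothetical): starting from l_blocking == LKM_NLMODE, a BAST at
 * PR sets l_blocking = PR and asks for a downconvert, since PR's
 * compat level (PR) is below NL's (EX). A second BAST at PR is then a
 * no-op, while a later BAST at EX raises l_blocking to EX and
 * schedules another downconvert because EX's compat level (NL) is
 * lower still.
 */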
612
613static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
614 struct ocfs2_lock_res *lockres,
615 int level)
616{
617 int needs_downconvert;
618 unsigned long flags;
619
620 mlog_entry_void();
621
622 BUG_ON(level <= LKM_NLMODE);
623
624 spin_lock_irqsave(&lockres->l_lock, flags);
625 needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
626 if (needs_downconvert)
627 ocfs2_schedule_blocked_lock(osb, lockres);
628 spin_unlock_irqrestore(&lockres->l_lock, flags);
629
630 ocfs2_kick_vote_thread(osb);
631
632 wake_up(&lockres->l_event);
633 mlog_exit_void();
634}
635
636static void ocfs2_inode_bast_func(void *opaque, int level)
637{
638 struct ocfs2_lock_res *lockres = opaque;
639 struct inode *inode;
640 struct ocfs2_super *osb;
641
642 mlog_entry_void();
643
644 BUG_ON(!ocfs2_is_inode_lock(lockres));
645
646 inode = ocfs2_lock_res_inode(lockres);
647 osb = OCFS2_SB(inode->i_sb);
648
649 mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d "
650 "type = %s\n", OCFS2_I(inode)->ip_blkno, level,
651 lockres->l_level,
652 ocfs2_lock_type_string(lockres->l_type));
653
654 ocfs2_generic_bast_func(osb, lockres, level);
655
656 mlog_exit_void();
657}
658
659static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
660 int ignore_refresh)
661{
662 struct dlm_lockstatus *lksb = &lockres->l_lksb;
663 unsigned long flags;
664
665 spin_lock_irqsave(&lockres->l_lock, flags);
666
667 if (lksb->status != DLM_NORMAL) {
668 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
669 lockres->l_name, lksb->status);
670 spin_unlock_irqrestore(&lockres->l_lock, flags);
671 return;
672 }
673
674 switch(lockres->l_action) {
675 case OCFS2_AST_ATTACH:
676 ocfs2_generic_handle_attach_action(lockres);
677 break;
678 case OCFS2_AST_CONVERT:
679 ocfs2_generic_handle_convert_action(lockres);
680 break;
681 case OCFS2_AST_DOWNCONVERT:
682 ocfs2_generic_handle_downconvert_action(lockres);
683 break;
684 default:
685 BUG();
686 }
687
688 if (ignore_refresh)
689 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
690
691 /* set it to something invalid so if we get called again we
692 * can catch it. */
693 lockres->l_action = OCFS2_AST_INVALID;
694 spin_unlock_irqrestore(&lockres->l_lock, flags);
695
696 wake_up(&lockres->l_event);
697}
698
699static void ocfs2_super_ast_func(void *opaque)
700{
701 struct ocfs2_lock_res *lockres = opaque;
702
703 mlog_entry_void();
704 mlog(0, "Superblock AST fired\n");
705
706 BUG_ON(!ocfs2_is_super_lock(lockres));
707 ocfs2_generic_ast_func(lockres, 0);
708
709 mlog_exit_void();
710}
711
712static void ocfs2_super_bast_func(void *opaque,
713 int level)
714{
715 struct ocfs2_lock_res *lockres = opaque;
716 struct ocfs2_super *osb;
717
718 mlog_entry_void();
719 mlog(0, "Superblock BAST fired\n");
720
721 BUG_ON(!ocfs2_is_super_lock(lockres));
722 osb = ocfs2_lock_res_super(lockres);
723 ocfs2_generic_bast_func(osb, lockres, level);
724
725 mlog_exit_void();
726}
727
728static void ocfs2_rename_ast_func(void *opaque)
729{
730 struct ocfs2_lock_res *lockres = opaque;
731
732 mlog_entry_void();
733
734 mlog(0, "Rename AST fired\n");
735
736 BUG_ON(!ocfs2_is_rename_lock(lockres));
737
738 ocfs2_generic_ast_func(lockres, 1);
739
740 mlog_exit_void();
741}
742
743static void ocfs2_rename_bast_func(void *opaque,
744 int level)
745{
746 struct ocfs2_lock_res *lockres = opaque;
747 struct ocfs2_super *osb;
748
749 mlog_entry_void();
750
751 mlog(0, "Rename BAST fired\n");
752
753 BUG_ON(!ocfs2_is_rename_lock(lockres));
754
755 osb = ocfs2_lock_res_super(lockres);
756 ocfs2_generic_bast_func(osb, lockres, level);
757
758 mlog_exit_void();
759}
760
761static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
762 int convert)
763{
764 unsigned long flags;
765
766 mlog_entry_void();
767 spin_lock_irqsave(&lockres->l_lock, flags);
768 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
769 if (convert)
770 lockres->l_action = OCFS2_AST_INVALID;
771 else
772 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
773 spin_unlock_irqrestore(&lockres->l_lock, flags);
774
775 wake_up(&lockres->l_event);
776 mlog_exit_void();
777}
778
779/* Note: If we detect another process working on the lock (i.e.,
780 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
781 * to do the right thing in that case.
782 */
783static int ocfs2_lock_create(struct ocfs2_super *osb,
784 struct ocfs2_lock_res *lockres,
785 int level,
786 int dlm_flags)
787{
788 int ret = 0;
789 enum dlm_status status;
790 unsigned long flags;
791
792 mlog_entry_void();
793
794 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
795 dlm_flags);
796
797 spin_lock_irqsave(&lockres->l_lock, flags);
798 if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
799 (lockres->l_flags & OCFS2_LOCK_BUSY)) {
800 spin_unlock_irqrestore(&lockres->l_lock, flags);
801 goto bail;
802 }
803
804 lockres->l_action = OCFS2_AST_ATTACH;
805 lockres->l_requested = level;
806 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
807 spin_unlock_irqrestore(&lockres->l_lock, flags);
808
809 status = dlmlock(osb->dlm,
810 level,
811 &lockres->l_lksb,
812 dlm_flags,
813 lockres->l_name,
814 lockres->l_ops->ast,
815 lockres,
816 lockres->l_ops->bast);
817 if (status != DLM_NORMAL) {
818 ocfs2_log_dlm_error("dlmlock", status, lockres);
819 ret = -EINVAL;
820 ocfs2_recover_from_dlm_error(lockres, 1);
821 }
822
823	mlog(0, "lock %s, successful return from dlmlock\n", lockres->l_name);
824
825bail:
826 mlog_exit(ret);
827 return ret;
828}
829
830static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
831 int flag)
832{
833 unsigned long flags;
834 int ret;
835
836 spin_lock_irqsave(&lockres->l_lock, flags);
837 ret = lockres->l_flags & flag;
838 spin_unlock_irqrestore(&lockres->l_lock, flags);
839
840 return ret;
841}
842
843static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
845{
846 wait_event(lockres->l_event,
847 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
848}
849
850static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
852{
853 wait_event(lockres->l_event,
854 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
855}
856
857/* predict what lock level we'll be dropping down to on behalf
858 * of another node, and return true if the currently wanted
859 * level will be compatible with it. */
860static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
861 int wanted)
862{
863 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
864
865 return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
866}
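/*
 * For instance, if l_blocking == LKM_PRMODE (another node wants a read
 * lock), a local PRMODE request may continue through the check above,
 * while an EXMODE request has to wait out the downconvert.
 */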
867
868static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
869{
870 INIT_LIST_HEAD(&mw->mw_item);
871 init_completion(&mw->mw_complete);
872}
873
874static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
875{
876 wait_for_completion(&mw->mw_complete);
877 /* Re-arm the completion in case we want to wait on it again */
878 INIT_COMPLETION(mw->mw_complete);
879 return mw->mw_status;
880}
881
882static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
883 struct ocfs2_mask_waiter *mw,
884 unsigned long mask,
885 unsigned long goal)
886{
887 BUG_ON(!list_empty(&mw->mw_item));
888
889 assert_spin_locked(&lockres->l_lock);
890
891 list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
892 mw->mw_mask = mask;
893 mw->mw_goal = goal;
894}
895
896/* returns 0 if the mw that was removed was already satisfied, -EBUSY
897 * if the mask still hadn't reached its goal */
898static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
899 struct ocfs2_mask_waiter *mw)
900{
901 unsigned long flags;
902 int ret = 0;
903
904 spin_lock_irqsave(&lockres->l_lock, flags);
905 if (!list_empty(&mw->mw_item)) {
906 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
907 ret = -EBUSY;
908
909 list_del_init(&mw->mw_item);
910 init_completion(&mw->mw_complete);
911 }
912 spin_unlock_irqrestore(&lockres->l_lock, flags);
913
914 return ret;
916}
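/*
 * Putting the mask waiter pieces together, here is a hedged sketch of
 * the pattern ocfs2_cluster_lock() uses below (illustrative only,
 * error handling trimmed):
 *
 *	struct ocfs2_mask_waiter mw;
 *
 *	ocfs2_init_mask_waiter(&mw);
 *	spin_lock_irqsave(&lockres->l_lock, flags);
 *	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
 *	spin_unlock_irqrestore(&lockres->l_lock, flags);
 *	ret = ocfs2_wait_for_mask(&mw);
 *
 * A nonblocking caller instead backs out with
 * lockres_remove_mask_waiter() and treats -EBUSY as "would have
 * slept".
 */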
917
918static int ocfs2_cluster_lock(struct ocfs2_super *osb,
919 struct ocfs2_lock_res *lockres,
920 int level,
921 int lkm_flags,
922 int arg_flags)
923{
924 struct ocfs2_mask_waiter mw;
925 enum dlm_status status;
926 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
927 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
928 unsigned long flags;
929
930 mlog_entry_void();
931
932 ocfs2_init_mask_waiter(&mw);
933
934again:
935 wait = 0;
936
937 if (catch_signals && signal_pending(current)) {
938 ret = -ERESTARTSYS;
939 goto out;
940 }
941
942 spin_lock_irqsave(&lockres->l_lock, flags);
943
944 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
945 "Cluster lock called on freeing lockres %s! flags "
946 "0x%lx\n", lockres->l_name, lockres->l_flags);
947
948 /* We only compare against the currently granted level
949 * here. If the lock is blocked waiting on a downconvert,
950 * we'll get caught below. */
951 if (lockres->l_flags & OCFS2_LOCK_BUSY &&
952 level > lockres->l_level) {
953	 /* is someone sitting in dlmlock()? If so, wait on
954	 * them. */
955 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
956 wait = 1;
957 goto unlock;
958 }
959
960 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
961 /* lock has not been created yet. */
962 spin_unlock_irqrestore(&lockres->l_lock, flags);
963
964 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
965 if (ret < 0) {
966 mlog_errno(ret);
967 goto out;
968 }
969 goto again;
970 }
971
972 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
973 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
974	 /* the lock is currently blocked on behalf of
975	 * another node */
976 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
977 wait = 1;
978 goto unlock;
979 }
980
981 if (level > lockres->l_level) {
982 if (lockres->l_action != OCFS2_AST_INVALID)
983 mlog(ML_ERROR, "lockres %s has action %u pending\n",
984 lockres->l_name, lockres->l_action);
985
986 lockres->l_action = OCFS2_AST_CONVERT;
987 lockres->l_requested = level;
988 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
989 spin_unlock_irqrestore(&lockres->l_lock, flags);
990
991 BUG_ON(level == LKM_IVMODE);
992 BUG_ON(level == LKM_NLMODE);
993
994 mlog(0, "lock %s, convert from %d to level = %d\n",
995 lockres->l_name, lockres->l_level, level);
996
997 /* call dlm_lock to upgrade lock now */
998 status = dlmlock(osb->dlm,
999 level,
1000 &lockres->l_lksb,
1001 lkm_flags|LKM_CONVERT|LKM_VALBLK,
1002 lockres->l_name,
1003 lockres->l_ops->ast,
1004 lockres,
1005 lockres->l_ops->bast);
1006 if (status != DLM_NORMAL) {
1007 if ((lkm_flags & LKM_NOQUEUE) &&
1008 (status == DLM_NOTQUEUED))
1009 ret = -EAGAIN;
1010 else {
1011 ocfs2_log_dlm_error("dlmlock", status,
1012 lockres);
1013 ret = -EINVAL;
1014 }
1015 ocfs2_recover_from_dlm_error(lockres, 1);
1016 goto out;
1017 }
1018
1019	 mlog(0, "lock %s, successful return from dlmlock\n",
1020 lockres->l_name);
1021
1022 /* At this point we've gone inside the dlm and need to
1023 * complete our work regardless. */
1024 catch_signals = 0;
1025
1026 /* wait for busy to clear and carry on */
1027 goto again;
1028 }
1029
1030 /* Ok, if we get here then we're good to go. */
1031 ocfs2_inc_holders(lockres, level);
1032
1033 ret = 0;
1034unlock:
1035 spin_unlock_irqrestore(&lockres->l_lock, flags);
1036out:
1037 /*
1038 * This is helping work around a lock inversion between the page lock
1039 * and dlm locks. One path holds the page lock while calling aops
1040 * which block acquiring dlm locks. The voting thread holds dlm
1041 * locks while acquiring page locks while down converting data locks.
1042 * This block is helping an aop path notice the inversion and back
1043 * off to unlock its page lock before trying the dlm lock again.
1044 */
1045 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
1046 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
1047 wait = 0;
1048 if (lockres_remove_mask_waiter(lockres, &mw))
1049 ret = -EAGAIN;
1050 else
1051 goto again;
1052 }
1053 if (wait) {
1054 ret = ocfs2_wait_for_mask(&mw);
1055 if (ret == 0)
1056 goto again;
1057 mlog_errno(ret);
1058 }
1059
1060 mlog_exit(ret);
1061 return ret;
1062}
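/*
 * To summarize the retry loop above: each pass resolves at most one
 * obstacle and jumps back to "again". BUSY means wait out an in-flight
 * dlmlock() call; not ATTACHED means create the lock at NL first;
 * BLOCKED means wait for the downconvert unless the wanted level is
 * compatible; only then is a convert to a higher level issued. A
 * successful return always passed through ocfs2_inc_holders(), so
 * every ocfs2_cluster_lock() must be paired with an
 * ocfs2_cluster_unlock() at the same level.
 */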
1063
1064static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1065 struct ocfs2_lock_res *lockres,
1066 int level)
1067{
1068 unsigned long flags;
1069
1070 mlog_entry_void();
1071 spin_lock_irqsave(&lockres->l_lock, flags);
1072 ocfs2_dec_holders(lockres, level);
1073 ocfs2_vote_on_unlock(osb, lockres);
1074 spin_unlock_irqrestore(&lockres->l_lock, flags);
1075 mlog_exit_void();
1076}
1077
1078static int ocfs2_create_new_inode_lock(struct inode *inode,
1079 struct ocfs2_lock_res *lockres)
1080{
1081 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1082 unsigned long flags;
1083
1084 spin_lock_irqsave(&lockres->l_lock, flags);
1085 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1086 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1087 spin_unlock_irqrestore(&lockres->l_lock, flags);
1088
1089 return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
1090}
1091
1092/* Grants us an EX lock on the data and metadata resources, skipping
1093 * the normal cluster directory lookup. Use this ONLY on newly created
1094 * inodes which other nodes can't possibly see, and which haven't been
1095 * hashed in the inode hash yet. This can give us a good performance
1096 * increase as it'll skip the network broadcast normally associated
1097 * with creating a new lock resource. */
1098int ocfs2_create_new_inode_locks(struct inode *inode)
1099{
1100 int ret;
1101
1102 BUG_ON(!inode);
1103 BUG_ON(!ocfs2_inode_is_new(inode));
1104
1105 mlog_entry_void();
1106
1107 mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
1108
1109	/* Note that we don't increment any of the holder counts, nor
1110 * do we add anything to a journal handle. Since this is
1111 * supposed to be a new inode which the cluster doesn't know
1112 * about yet, there is no need to. As far as the LVB handling
1113 * is concerned, this is basically like acquiring an EX lock
1114 * on a resource which has an invalid one -- we'll set it
1115 * valid when we release the EX. */
1116
1117 ret = ocfs2_create_new_inode_lock(inode,
1118 &OCFS2_I(inode)->ip_rw_lockres);
1119 if (ret) {
1120 mlog_errno(ret);
1121 goto bail;
1122 }
1123
1124 ret = ocfs2_create_new_inode_lock(inode,
1125 &OCFS2_I(inode)->ip_meta_lockres);
1126 if (ret) {
1127 mlog_errno(ret);
1128 goto bail;
1129 }
1130
1131 ret = ocfs2_create_new_inode_lock(inode,
1132 &OCFS2_I(inode)->ip_data_lockres);
1133 if (ret) {
1134 mlog_errno(ret);
1135 goto bail;
1136 }
1137
1138bail:
1139 mlog_exit(ret);
1140 return ret;
1141}
1142
1143int ocfs2_rw_lock(struct inode *inode, int write)
1144{
1145 int status, level;
1146 struct ocfs2_lock_res *lockres;
1147
1148 BUG_ON(!inode);
1149
1150 mlog_entry_void();
1151
1152 mlog(0, "inode %"MLFu64" take %s RW lock\n",
1153 OCFS2_I(inode)->ip_blkno,
1154 write ? "EXMODE" : "PRMODE");
1155
1156 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1157
1158 level = write ? LKM_EXMODE : LKM_PRMODE;
1159
1160 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1161 0);
1162 if (status < 0)
1163 mlog_errno(status);
1164
1165 mlog_exit(status);
1166 return status;
1167}
1168
1169void ocfs2_rw_unlock(struct inode *inode, int write)
1170{
1171 int level = write ? LKM_EXMODE : LKM_PRMODE;
1172 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1173
1174 mlog_entry_void();
1175
1176 mlog(0, "inode %"MLFu64" drop %s RW lock\n",
1177 OCFS2_I(inode)->ip_blkno,
1178 write ? "EXMODE" : "PRMODE");
1179
1180 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1181
1182 mlog_exit_void();
1183}
1184
1185int ocfs2_data_lock_full(struct inode *inode,
1186 int write,
1187 int arg_flags)
1188{
1189 int status = 0, level;
1190 struct ocfs2_lock_res *lockres;
1191
1192 BUG_ON(!inode);
1193
1194 mlog_entry_void();
1195
1196 mlog(0, "inode %"MLFu64" take %s DATA lock\n",
1197 OCFS2_I(inode)->ip_blkno,
1198 write ? "EXMODE" : "PRMODE");
1199
1200 /* We'll allow faking a readonly data lock for
1201 * rodevices. */
1202 if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1203 if (write) {
1204 status = -EROFS;
1205 mlog_errno(status);
1206 }
1207 goto out;
1208 }
1209
1210 lockres = &OCFS2_I(inode)->ip_data_lockres;
1211
1212 level = write ? LKM_EXMODE : LKM_PRMODE;
1213
1214 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1215 0, arg_flags);
1216 if (status < 0 && status != -EAGAIN)
1217 mlog_errno(status);
1218
1219out:
1220 mlog_exit(status);
1221 return status;
1222}
1223
1224/* see ocfs2_meta_lock_with_page() */
1225int ocfs2_data_lock_with_page(struct inode *inode,
1226 int write,
1227 struct page *page)
1228{
1229 int ret;
1230
1231 ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1232 if (ret == -EAGAIN) {
1233 unlock_page(page);
1234 if (ocfs2_data_lock(inode, write) == 0)
1235 ocfs2_data_unlock(inode, write);
1236 ret = AOP_TRUNCATED_PAGE;
1237 }
1238
1239 return ret;
1240}
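/*
 * Hedged usage sketch for an aop method (hypothetical caller, error
 * handling trimmed). The positive AOP_TRUNCATED_PAGE return must be
 * handed straight back to the VFS, which retries the aop call:
 *
 *	ret = ocfs2_data_lock_with_page(inode, 0, page);
 *	if (ret != 0)
 *		return ret;
 *	... fill or write the still-locked page ...
 *	ocfs2_data_unlock(inode, 0);
 */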
1241
1242static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1243 struct ocfs2_lock_res *lockres)
1244{
1245 int kick = 0;
1246
1247 mlog_entry_void();
1248
1249 /* If we know that another node is waiting on our lock, kick
1250	 * the vote thread pre-emptively when we reach a release
1251 * condition. */
1252 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1253 switch(lockres->l_blocking) {
1254 case LKM_EXMODE:
1255 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1256 kick = 1;
1257 break;
1258 case LKM_PRMODE:
1259 if (!lockres->l_ex_holders)
1260 kick = 1;
1261 break;
1262 default:
1263 BUG();
1264 }
1265 }
1266
1267 if (kick)
1268 ocfs2_kick_vote_thread(osb);
1269
1270 mlog_exit_void();
1271}
1272
1273void ocfs2_data_unlock(struct inode *inode,
1274 int write)
1275{
1276 int level = write ? LKM_EXMODE : LKM_PRMODE;
1277 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1278
1279 mlog_entry_void();
1280
1281 mlog(0, "inode %"MLFu64" drop %s DATA lock\n",
1282 OCFS2_I(inode)->ip_blkno,
1283 write ? "EXMODE" : "PRMODE");
1284
1285 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1286 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1287
1288 mlog_exit_void();
1289}
1290
1291#define OCFS2_SEC_BITS 34
1292#define OCFS2_SEC_SHIFT (64 - OCFS2_SEC_BITS)
1293#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
1294
1295/* LVB only has room for 64 bits of time here so we pack it for
1296 * now. */
1297static u64 ocfs2_pack_timespec(struct timespec *spec)
1298{
1299 u64 res;
1300 u64 sec = spec->tv_sec;
1301 u32 nsec = spec->tv_nsec;
1302
1303 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1304
1305 return res;
1306}
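/*
 * Worked example of the packing above: with OCFS2_SEC_SHIFT == 30, the
 * seconds occupy the top 34 bits and the nanoseconds the low 30
 * (tv_nsec < 10^9 < 2^30, so it always fits). For
 * spec = { .tv_sec = 3, .tv_nsec = 5 }:
 *
 *	res = (3ULL << 30) | 5 == 0xC0000005
 *
 * ocfs2_unpack_timespec() further below simply reverses the shift and
 * mask.
 */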
1307
1308/* Call this with the lockres locked. I am reasonably sure we don't
1309 * need ip_lock in this function as anyone who would be changing those
1310 * values is supposed to be blocked in ocfs2_meta_lock right now. */
1311static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1312{
1313 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1314 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1315 struct ocfs2_meta_lvb *lvb;
1316
1317 mlog_entry_void();
1318
1319 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1320
1321 lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION);
1322 lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
1323 lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
1324 lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
1325 lvb->lvb_igid = cpu_to_be32(inode->i_gid);
1326 lvb->lvb_imode = cpu_to_be16(inode->i_mode);
1327 lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
1328 lvb->lvb_iatime_packed =
1329 cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
1330 lvb->lvb_ictime_packed =
1331 cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
1332 lvb->lvb_imtime_packed =
1333 cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1334
1335 mlog_meta_lvb(0, lockres);
1336
1337 mlog_exit_void();
1338}
1339
1340static void ocfs2_unpack_timespec(struct timespec *spec,
1341 u64 packed_time)
1342{
1343 spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
1344 spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
1345}
1346
1347static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1348{
1349 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1350 struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1351 struct ocfs2_meta_lvb *lvb;
1352
1353 mlog_entry_void();
1354
1355 mlog_meta_lvb(0, lockres);
1356
1357 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1358
1359 /* We're safe here without the lockres lock... */
1360 spin_lock(&oi->ip_lock);
1361 oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
1362 i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1363
1364 /* fast-symlinks are a special case */
1365 if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1366 inode->i_blocks = 0;
1367 else
1368 inode->i_blocks =
1369 ocfs2_align_bytes_to_sectors(i_size_read(inode));
1370
1371 inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
1372 inode->i_gid = be32_to_cpu(lvb->lvb_igid);
1373 inode->i_mode = be16_to_cpu(lvb->lvb_imode);
1374 inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
1375 ocfs2_unpack_timespec(&inode->i_atime,
1376 be64_to_cpu(lvb->lvb_iatime_packed));
1377 ocfs2_unpack_timespec(&inode->i_mtime,
1378 be64_to_cpu(lvb->lvb_imtime_packed));
1379 ocfs2_unpack_timespec(&inode->i_ctime,
1380 be64_to_cpu(lvb->lvb_ictime_packed));
1381 spin_unlock(&oi->ip_lock);
1382
1383 mlog_exit_void();
1384}
1385
1386static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
1387{
1388 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1389
1390 if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
1391 return 1;
1392 return 0;
1393}
1394
1395/* Determine whether a lock resource needs to be refreshed, and
1396 * arbitrate who gets to refresh it.
1397 *
1398 * 0 means no refresh needed.
1399 *
1400 * > 0 means you need to refresh this and you MUST call
1401 * ocfs2_complete_lock_res_refresh afterwards. */
1402static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
1403{
1404 unsigned long flags;
1405 int status = 0;
1406
1407 mlog_entry_void();
1408
1409refresh_check:
1410 spin_lock_irqsave(&lockres->l_lock, flags);
1411 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
1412 spin_unlock_irqrestore(&lockres->l_lock, flags);
1413 goto bail;
1414 }
1415
1416 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
1417 spin_unlock_irqrestore(&lockres->l_lock, flags);
1418
1419 ocfs2_wait_on_refreshing_lock(lockres);
1420 goto refresh_check;
1421 }
1422
1423 /* Ok, I'll be the one to refresh this lock. */
1424 lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
1425 spin_unlock_irqrestore(&lockres->l_lock, flags);
1426
1427 status = 1;
1428bail:
1429 mlog_exit(status);
1430 return status;
1431}
1432
1433/* If status is non-zero, I'll mark it as not being in refresh
1434 * anymore, but I won't clear the needs-refresh flag. */
1435static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
1436 int status)
1437{
1438 unsigned long flags;
1439 mlog_entry_void();
1440
1441 spin_lock_irqsave(&lockres->l_lock, flags);
1442 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
1443 if (!status)
1444 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
1445 spin_unlock_irqrestore(&lockres->l_lock, flags);
1446
1447 wake_up(&lockres->l_event);
1448
1449 mlog_exit_void();
1450}
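/*
 * The two refresh helpers above bracket a refresh; a minimal sketch of
 * the contract (ocfs2_meta_lock_update() below is a real user, and
 * "err" here stands for whatever status the refresh work produced):
 *
 *	status = ocfs2_should_refresh_lock_res(lockres);
 *	if (status) {
 *		... re-read state from the LVB or from disk ...
 *		ocfs2_complete_lock_res_refresh(lockres, err);
 *	}
 *
 * Exactly one caller wins the REFRESHING flag; everyone else sleeps in
 * ocfs2_wait_on_refreshing_lock() until the winner completes.
 */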
1451
1452/* May pass a bh back in *bh if the refresh had to go to disk. */
1453static int ocfs2_meta_lock_update(struct inode *inode,
1454 struct buffer_head **bh)
1455{
1456 int status = 0;
1457 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1458 struct ocfs2_lock_res *lockres;
1459 struct ocfs2_dinode *fe;
1460
1461 mlog_entry_void();
1462
1463 spin_lock(&oi->ip_lock);
1464 if (oi->ip_flags & OCFS2_INODE_DELETED) {
1465 mlog(0, "Orphaned inode %"MLFu64" was deleted while we "
1466 "were waiting on a lock. ip_flags = 0x%x\n",
1467 oi->ip_blkno, oi->ip_flags);
1468 spin_unlock(&oi->ip_lock);
1469 status = -ENOENT;
1470 goto bail;
1471 }
1472 spin_unlock(&oi->ip_lock);
1473
1474 lockres = &oi->ip_meta_lockres;
1475
1476 if (!ocfs2_should_refresh_lock_res(lockres))
1477 goto bail;
1478
1479 /* This will discard any caching information we might have had
1480 * for the inode metadata. */
1481 ocfs2_metadata_cache_purge(inode);
1482
1483 /* will do nothing for inode types that don't use the extent
1484 * map (directories, bitmap files, etc) */
1485 ocfs2_extent_map_trunc(inode, 0);
1486
1487 if (ocfs2_meta_lvb_is_trustable(lockres)) {
1488 mlog(0, "Trusting LVB on inode %"MLFu64"\n",
1489 oi->ip_blkno);
1490 ocfs2_refresh_inode_from_lvb(inode);
1491 } else {
1492 /* Boo, we have to go to disk. */
1493 /* read bh, cast, ocfs2_refresh_inode */
1494 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
1495 bh, OCFS2_BH_CACHED, inode);
1496 if (status < 0) {
1497 mlog_errno(status);
1498 goto bail_refresh;
1499 }
1500 fe = (struct ocfs2_dinode *) (*bh)->b_data;
1501
1502 /* This is a good chance to make sure we're not
1503 * locking an invalid object.
1504 *
1505 * We bug on a stale inode here because we checked
1506 * above whether it was wiped from disk. The wiping
1507 * node provides a guarantee that we receive that
1508 * message and can mark the inode before dropping any
1509 * locks associated with it. */
1510 if (!OCFS2_IS_VALID_DINODE(fe)) {
1511 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1512 status = -EIO;
1513 goto bail_refresh;
1514 }
1515 mlog_bug_on_msg(inode->i_generation !=
1516 le32_to_cpu(fe->i_generation),
1517 "Invalid dinode %"MLFu64" disk generation: %u "
1518 "inode->i_generation: %u\n",
1519 oi->ip_blkno, le32_to_cpu(fe->i_generation),
1520 inode->i_generation);
1521 mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
1522 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
1523 "Stale dinode %"MLFu64" dtime: %"MLFu64" "
1524 "flags: 0x%x\n", oi->ip_blkno,
1525 le64_to_cpu(fe->i_dtime),
1526 le32_to_cpu(fe->i_flags));
1527
1528 ocfs2_refresh_inode(inode, fe);
1529 }
1530
1531 status = 0;
1532bail_refresh:
1533 ocfs2_complete_lock_res_refresh(lockres, status);
1534bail:
1535 mlog_exit(status);
1536 return status;
1537}
1538
1539static int ocfs2_assign_bh(struct inode *inode,
1540 struct buffer_head **ret_bh,
1541 struct buffer_head *passed_bh)
1542{
1543 int status;
1544
1545 if (passed_bh) {
1546 /* Ok, the update went to disk for us, use the
1547 * returned bh. */
1548 *ret_bh = passed_bh;
1549 get_bh(*ret_bh);
1550
1551 return 0;
1552 }
1553
1554 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1555 OCFS2_I(inode)->ip_blkno,
1556 ret_bh,
1557 OCFS2_BH_CACHED,
1558 inode);
1559 if (status < 0)
1560 mlog_errno(status);
1561
1562 return status;
1563}
1564
1565/*
1566 * Returns < 0 on error; on success the metadata cluster lock is held
1567 * and, if ret_bh was passed, *ret_bh points at the inode's dinode buffer.
1568 */
1569int ocfs2_meta_lock_full(struct inode *inode,
1570 struct ocfs2_journal_handle *handle,
1571 struct buffer_head **ret_bh,
1572 int ex,
1573 int arg_flags)
1574{
1575 int status, level, dlm_flags, acquired;
1576 struct ocfs2_lock_res *lockres;
1577 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1578 struct buffer_head *local_bh = NULL;
1579
1580 BUG_ON(!inode);
1581
1582 mlog_entry_void();
1583
1584 mlog(0, "inode %"MLFu64", take %s META lock\n",
1585 OCFS2_I(inode)->ip_blkno,
1586 ex ? "EXMODE" : "PRMODE");
1587
1588 status = 0;
1589 acquired = 0;
1590 /* We'll allow faking a readonly metadata lock for
1591 * rodevices. */
1592 if (ocfs2_is_hard_readonly(osb)) {
1593 if (ex)
1594 status = -EROFS;
1595 goto bail;
1596 }
1597
1598 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1599 wait_event(osb->recovery_event,
1600 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1601
1603 lockres = &OCFS2_I(inode)->ip_meta_lockres;
1604 level = ex ? LKM_EXMODE : LKM_PRMODE;
1605 dlm_flags = 0;
1606 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1607 dlm_flags |= LKM_NOQUEUE;
1608
1609 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1610 if (status < 0) {
1611 if (status != -EAGAIN && status != -EIOCBRETRY)
1612 mlog_errno(status);
1613 goto bail;
1614 }
1615
1616 /* Notify the error cleanup path to drop the cluster lock. */
1617 acquired = 1;
1618
1619 /* We wait twice because a node may have died while we were in
1620 * the lower dlm layers. The second time though, we've
1621 * committed to owning this lock so we don't allow signals to
1622 * abort the operation. */
1623 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1624 wait_event(osb->recovery_event,
1625 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1626
1627 /* This is fun. The caller may want a bh back, or it may
1628 * not. ocfs2_meta_lock_update definitely wants one in, but
1629 * may or may not read one, depending on what's in the
1630 * LVB. The result of all of this is that we've *only* gone to
1631 * disk if we have to, so the complexity is worthwhile. */
1632 status = ocfs2_meta_lock_update(inode, &local_bh);
1633 if (status < 0) {
1634 if (status != -ENOENT)
1635 mlog_errno(status);
1636 goto bail;
1637 }
1638
1639 if (ret_bh) {
1640 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1641 if (status < 0) {
1642 mlog_errno(status);
1643 goto bail;
1644 }
1645 }
1646
1647 if (handle) {
1648 status = ocfs2_handle_add_lock(handle, inode);
1649 if (status < 0)
1650 mlog_errno(status);
1651 }
1652
1653bail:
1654 if (status < 0) {
1655 if (ret_bh && (*ret_bh)) {
1656 brelse(*ret_bh);
1657 *ret_bh = NULL;
1658 }
1659 if (acquired)
1660 ocfs2_meta_unlock(inode, ex);
1661 }
1662
1663 if (local_bh)
1664 brelse(local_bh);
1665
1666 mlog_exit(status);
1667 return status;
1668}
1669
1670/*
1671 * This is working around a lock inversion between tasks acquiring DLM locks
1672 * while holding a page lock and the vote thread which blocks dlm lock acquiry
1673 * while acquiring page locks.
1674 *
1675 * ** These _with_page variants are only intended to be called from aop
1676 * methods that hold page locks and return a very specific *positive* error
1677 * code that aop methods pass up to the VFS -- test for errors with != 0. **
1678 *
1679 * The DLM is called such that it returns -EAGAIN if it would have blocked
1680 * waiting for the vote thread. In that case we unlock our page so the vote
1681 * thread can make progress. Once we've done this we have to return
1682 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
1683 * into the VFS who will then immediately retry the aop call.
1684 *
1685 * We do a blocking lock and immediate unlock before returning, though, so that
1686 * the lock has a great chance of being cached on this node by the time the VFS
1687 * calls back to retry the aop. This has the potential to livelock as nodes
1688 * ping locks back and forth, but that's a risk we're willing to take in
1689 * exchange for a simple fix to the lock inversion.
1690 */
1691int ocfs2_meta_lock_with_page(struct inode *inode,
1692 struct ocfs2_journal_handle *handle,
1693 struct buffer_head **ret_bh,
1694 int ex,
1695 struct page *page)
1696{
1697 int ret;
1698
1699 ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
1700 OCFS2_LOCK_NONBLOCK);
1701 if (ret == -EAGAIN) {
1702 unlock_page(page);
1703 if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
1704 ocfs2_meta_unlock(inode, ex);
1705 ret = AOP_TRUNCATED_PAGE;
1706 }
1707
1708 return ret;
1709}
1710
1711void ocfs2_meta_unlock(struct inode *inode,
1712 int ex)
1713{
1714 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1715 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
1716
1717 mlog_entry_void();
1718
1719 mlog(0, "inode %"MLFu64" drop %s META lock\n",
1720 OCFS2_I(inode)->ip_blkno,
1721 ex ? "EXMODE" : "PRMODE");
1722
1723 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1724 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1725
1726 mlog_exit_void();
1727}
1728
1729int ocfs2_super_lock(struct ocfs2_super *osb,
1730 int ex)
1731{
1732 int status;
1733 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1734 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1735 struct buffer_head *bh;
1736 struct ocfs2_slot_info *si = osb->slot_info;
1737
1738 mlog_entry_void();
1739
1740 if (ocfs2_is_hard_readonly(osb))
1741 return -EROFS;
1742
1743 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
1744 if (status < 0) {
1745 mlog_errno(status);
1746 goto bail;
1747 }
1748
1749 /* The super block lock path is really in the best position to
1750 * know when resources covered by the lock need to be
1751 * refreshed, so we do it here. Of course, making sense of
1752 * everything is up to the caller :) */
1753 status = ocfs2_should_refresh_lock_res(lockres);
1754 if (status < 0) {
1755 mlog_errno(status);
1756 goto bail;
1757 }
1758 if (status) {
1759 bh = si->si_bh;
1760 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
1761 si->si_inode);
1762 if (status == 0)
1763 ocfs2_update_slot_info(si);
1764
1765 ocfs2_complete_lock_res_refresh(lockres, status);
1766
1767 if (status < 0)
1768 mlog_errno(status);
1769 }
1770bail:
1771 mlog_exit(status);
1772 return status;
1773}
1774
1775void ocfs2_super_unlock(struct ocfs2_super *osb,
1776 int ex)
1777{
1778 int level = ex ? LKM_EXMODE : LKM_PRMODE;
1779 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1780
1781 ocfs2_cluster_unlock(osb, lockres, level);
1782}
1783
1784int ocfs2_rename_lock(struct ocfs2_super *osb)
1785{
1786 int status;
1787 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1788
1789 if (ocfs2_is_hard_readonly(osb))
1790 return -EROFS;
1791
1792 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
1793 if (status < 0)
1794 mlog_errno(status);
1795
1796 return status;
1797}
1798
1799void ocfs2_rename_unlock(struct ocfs2_super *osb)
1800{
1801 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1802
1803 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1804}
1805
1806/* Reference counting of the dlm debug structure. We want this because
1807 * open references on the debug inodes can outlive the mount, so
1808 * we can't rely on the ocfs2_super to always exist. */
1809static void ocfs2_dlm_debug_free(struct kref *kref)
1810{
1811 struct ocfs2_dlm_debug *dlm_debug;
1812
1813 dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
1814
1815 kfree(dlm_debug);
1816}
1817
1818void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
1819{
1820 if (dlm_debug)
1821 kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
1822}
1823
1824static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
1825{
1826 kref_get(&debug->d_refcnt);
1827}
1828
1829struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
1830{
1831 struct ocfs2_dlm_debug *dlm_debug;
1832
1833 dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
1834 if (!dlm_debug) {
1835 mlog_errno(-ENOMEM);
1836 goto out;
1837 }
1838
1839 kref_init(&dlm_debug->d_refcnt);
1840 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
1841 dlm_debug->d_locking_state = NULL;
1842out:
1843 return dlm_debug;
1844}
1845
1846/* Access to this is arbitrated for us via seq_file->sem. */
1847struct ocfs2_dlm_seq_priv {
1848 struct ocfs2_dlm_debug *p_dlm_debug;
1849 struct ocfs2_lock_res p_iter_res;
1850 struct ocfs2_lock_res p_tmp_res;
1851};
1852
1853static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
1854 struct ocfs2_dlm_seq_priv *priv)
1855{
1856 struct ocfs2_lock_res *iter, *ret = NULL;
1857 struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
1858
1859 assert_spin_locked(&ocfs2_dlm_tracking_lock);
1860
1861 list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
1862 /* discover the head of the list */
1863 if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
1864 mlog(0, "End of list found, %p\n", ret);
1865 break;
1866 }
1867
1868 /* We track our "dummy" iteration lockres' by a NULL
1869 * l_ops field. */
1870 if (iter->l_ops != NULL) {
1871 ret = iter;
1872 break;
1873 }
1874 }
1875
1876 return ret;
1877}
1878
1879static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
1880{
1881 struct ocfs2_dlm_seq_priv *priv = m->private;
1882 struct ocfs2_lock_res *iter;
1883
1884 spin_lock(&ocfs2_dlm_tracking_lock);
1885 iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
1886 if (iter) {
1887 /* Since lockres' have the lifetime of their container
1888 * (which can be inodes, ocfs2_supers, etc) we want to
1889 * copy this out to a temporary lockres while still
1890 * under the spinlock. Obviously after this we can't
1891 * trust any pointers on the copy returned, but that's
1892 * ok as the information we want isn't typically held
1893 * in them. */
1894 priv->p_tmp_res = *iter;
1895 iter = &priv->p_tmp_res;
1896 }
1897 spin_unlock(&ocfs2_dlm_tracking_lock);
1898
1899 return iter;
1900}
1901
1902static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
1903{
1904}
1905
1906static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
1907{
1908 struct ocfs2_dlm_seq_priv *priv = m->private;
1909 struct ocfs2_lock_res *iter = v;
1910 struct ocfs2_lock_res *dummy = &priv->p_iter_res;
1911
1912 spin_lock(&ocfs2_dlm_tracking_lock);
1913 iter = ocfs2_dlm_next_res(iter, priv);
1914 list_del_init(&dummy->l_debug_list);
1915 if (iter) {
1916 list_add(&dummy->l_debug_list, &iter->l_debug_list);
1917 priv->p_tmp_res = *iter;
1918 iter = &priv->p_tmp_res;
1919 }
1920 spin_unlock(&ocfs2_dlm_tracking_lock);
1921
1922 return iter;
1923}
1924
1925/* So that debugfs.ocfs2 can determine which format is being used */
1926#define OCFS2_DLM_DEBUG_STR_VERSION 1
1927static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1928{
1929 int i;
1930 char *lvb;
1931 struct ocfs2_lock_res *lockres = v;
1932
1933 if (!lockres)
1934 return -EINVAL;
1935
1936 seq_printf(m, "0x%x\t"
1937 "%.*s\t"
1938 "%d\t"
1939 "0x%lx\t"
1940 "0x%x\t"
1941 "0x%x\t"
1942 "%u\t"
1943 "%u\t"
1944 "%d\t"
1945 "%d\t",
1946 OCFS2_DLM_DEBUG_STR_VERSION,
1947 OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
1948 lockres->l_level,
1949 lockres->l_flags,
1950 lockres->l_action,
1951 lockres->l_unlock_action,
1952 lockres->l_ro_holders,
1953 lockres->l_ex_holders,
1954 lockres->l_requested,
1955 lockres->l_blocking);
1956
1957 /* Dump the raw LVB */
1958 lvb = lockres->l_lksb.lvb;
1959 for(i = 0; i < DLM_LVB_LEN; i++)
1960 seq_printf(m, "0x%x\t", lvb[i]);
1961
1962 /* End the line */
1963 seq_printf(m, "\n");
1964 return 0;
1965}
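/*
 * Illustrative sample of one line of the debugfs output produced above
 * (all values made up, LVB byte dump truncated):
 *
 *	0x1	M000000000000000000abcd00001234	3	0x41	0x0	0x0	0	1	-1	-1	0x0	0x0	...
 *
 * debugfs.ocfs2 keys off the leading OCFS2_DLM_DEBUG_STR_VERSION field
 * to decide how to parse the rest.
 */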
1966
1967static struct seq_operations ocfs2_dlm_seq_ops = {
1968 .start = ocfs2_dlm_seq_start,
1969 .stop = ocfs2_dlm_seq_stop,
1970 .next = ocfs2_dlm_seq_next,
1971 .show = ocfs2_dlm_seq_show,
1972};
1973
1974static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
1975{
1976 struct seq_file *seq = (struct seq_file *) file->private_data;
1977 struct ocfs2_dlm_seq_priv *priv = seq->private;
1978 struct ocfs2_lock_res *res = &priv->p_iter_res;
1979
1980 ocfs2_remove_lockres_tracking(res);
1981 ocfs2_put_dlm_debug(priv->p_dlm_debug);
1982 return seq_release_private(inode, file);
1983}
1984
1985static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
1986{
1987 int ret;
1988 struct ocfs2_dlm_seq_priv *priv;
1989 struct seq_file *seq;
1990 struct ocfs2_super *osb;
1991
1992 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
1993 if (!priv) {
1994 ret = -ENOMEM;
1995 mlog_errno(ret);
1996 goto out;
1997 }
1998 osb = (struct ocfs2_super *) inode->u.generic_ip;
1999 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2000 priv->p_dlm_debug = osb->osb_dlm_debug;
2001 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2002
2003 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2004 if (ret) {
2005 kfree(priv);
2006 mlog_errno(ret);
2007 goto out;
2008 }
2009
2010 seq = (struct seq_file *) file->private_data;
2011 seq->private = priv;
2012
2013 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2014 priv->p_dlm_debug);
2015
2016out:
2017 return ret;
2018}
2019
2020static struct file_operations ocfs2_dlm_debug_fops = {
2021 .open = ocfs2_dlm_debug_open,
2022 .release = ocfs2_dlm_debug_release,
2023 .read = seq_read,
2024 .llseek = seq_lseek,
2025};
2026
2027static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2028{
2029 int ret = 0;
2030 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2031
2032 dlm_debug->d_locking_state = debugfs_create_file("locking_state",
2033 S_IFREG|S_IRUSR,
2034 osb->osb_debug_root,
2035 osb,
2036 &ocfs2_dlm_debug_fops);
2037 if (!dlm_debug->d_locking_state) {
2038 ret = -EINVAL;
2039 mlog(ML_ERROR,
2040 "Unable to create locking state debugfs file.\n");
2041 goto out;
2042 }
2043
2044 ocfs2_get_dlm_debug(dlm_debug);
2045out:
2046 return ret;
2047}
2048
2049static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2050{
2051 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2052
2053 if (dlm_debug) {
2054 debugfs_remove(dlm_debug->d_locking_state);
2055 ocfs2_put_dlm_debug(dlm_debug);
2056 }
2057}
2058
2059int ocfs2_dlm_init(struct ocfs2_super *osb)
2060{
2061 int status;
2062 u32 dlm_key;
2063 struct dlm_ctxt *dlm;
2064
2065 mlog_entry_void();
2066
2067 status = ocfs2_dlm_init_debug(osb);
2068 if (status < 0) {
2069 mlog_errno(status);
2070 goto bail;
2071 }
2072
2073 /* launch vote thread */
2074 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
2075 osb->osb_id);
2076 if (IS_ERR(osb->vote_task)) {
2077 status = PTR_ERR(osb->vote_task);
2078 osb->vote_task = NULL;
2079 mlog_errno(status);
2080 goto bail;
2081 }
2082
2083	/* Used by the dlm code to make message headers unique; each
2084 * node in this domain must agree on this. */
2085 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2086
2087 /* for now, uuid == domain */
2088 dlm = dlm_register_domain(osb->uuid_str, dlm_key);
2089 if (IS_ERR(dlm)) {
2090 status = PTR_ERR(dlm);
2091 mlog_errno(status);
2092 goto bail;
2093 }
2094
2095 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2096 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2097
2098 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
2099
2100 osb->dlm = dlm;
2101
2102 status = 0;
2103bail:
2104 if (status < 0) {
2105 ocfs2_dlm_shutdown_debug(osb);
2106 if (osb->vote_task)
2107 kthread_stop(osb->vote_task);
2108 }
2109
2110 mlog_exit(status);
2111 return status;
2112}
2113
2114void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2115{
2116 mlog_entry_void();
2117
2118 dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2119
2120 ocfs2_drop_osb_locks(osb);
2121
2122 if (osb->vote_task) {
2123 kthread_stop(osb->vote_task);
2124 osb->vote_task = NULL;
2125 }
2126
2127 ocfs2_lock_res_free(&osb->osb_super_lockres);
2128 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2129
2130 dlm_unregister_domain(osb->dlm);
2131 osb->dlm = NULL;
2132
2133 ocfs2_dlm_shutdown_debug(osb);
2134
2135 mlog_exit_void();
2136}
2137
2138static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
2139{
2140 struct ocfs2_lock_res *lockres = opaque;
2141 unsigned long flags;
2142
2143 mlog_entry_void();
2144
2145 mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
2146 lockres->l_unlock_action);
2147
2148 spin_lock_irqsave(&lockres->l_lock, flags);
2149 /* We tried to cancel a convert request, but it was already
2150 * granted. All we want to do here is clear our unlock
2151 * state. The wake_up call done at the bottom is redundant
2152 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2153 * hurt anything anyway */
2154 if (status == DLM_CANCELGRANT &&
2155 lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2156 mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2157
2158 /* We don't clear the busy flag in this case as it
2159 * should have been cleared by the ast which the dlm
2160 * has called. */
2161 goto complete_unlock;
2162 }
2163
2164 if (status != DLM_NORMAL) {
2165 mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
2166 "unlock_action %d\n", status, lockres->l_name,
2167 lockres->l_unlock_action);
2168 spin_unlock_irqrestore(&lockres->l_lock, flags);
2169 return;
2170 }
2171
2172 switch(lockres->l_unlock_action) {
2173 case OCFS2_UNLOCK_CANCEL_CONVERT:
2174 mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2175 lockres->l_action = OCFS2_AST_INVALID;
2176 break;
2177 case OCFS2_UNLOCK_DROP_LOCK:
2178 lockres->l_level = LKM_IVMODE;
2179 break;
2180 default:
2181 BUG();
2182 }
2183
2184 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2185complete_unlock:
2186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2187 spin_unlock_irqrestore(&lockres->l_lock, flags);
2188
2189 wake_up(&lockres->l_event);
2190
2191 mlog_exit_void();
2192}
2193
2194typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);
2195
2196struct drop_lock_cb {
2197 ocfs2_pre_drop_cb_t *drop_func;
2198 void *drop_data;
2199};
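/*
 * A pre-drop callback runs with l_lock held, after BUSY has cleared
 * but before the final dlmunlock(). Sketch of wiring one up (this is
 * the pattern ocfs2_drop_inode_locks() uses further below; my_pre_drop
 * and my_data are hypothetical names):
 *
 *	struct drop_lock_cb dcb = { my_pre_drop, my_data };
 *
 *	ocfs2_drop_lock(osb, lockres, &dcb);
 */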
2200
2201static int ocfs2_drop_lock(struct ocfs2_super *osb,
2202 struct ocfs2_lock_res *lockres,
2203 struct drop_lock_cb *dcb)
2204{
2205 enum dlm_status status;
2206 unsigned long flags;
2207
2208 /* We didn't get anywhere near actually using this lockres. */
2209 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2210 goto out;
2211
2212 spin_lock_irqsave(&lockres->l_lock, flags);
2213
2214 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
2215 "lockres %s, flags 0x%lx\n",
2216 lockres->l_name, lockres->l_flags);
2217
2218 while (lockres->l_flags & OCFS2_LOCK_BUSY) {
2219 mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
2220 "%u, unlock_action = %u\n",
2221 lockres->l_name, lockres->l_flags, lockres->l_action,
2222 lockres->l_unlock_action);
2223
2224 spin_unlock_irqrestore(&lockres->l_lock, flags);
2225
2226 /* XXX: Today we just wait on any busy
2227 * locks... Perhaps we need to cancel converts in the
2228 * future? */
2229 ocfs2_wait_on_busy_lock(lockres);
2230
2231 spin_lock_irqsave(&lockres->l_lock, flags);
2232 }
2233
2234 if (dcb)
2235 dcb->drop_func(lockres, dcb->drop_data);
2236
2237 if (lockres->l_flags & OCFS2_LOCK_BUSY)
2238 mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
2239 lockres->l_name);
2240 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2241 mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
2242
2243 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
2244 spin_unlock_irqrestore(&lockres->l_lock, flags);
2245 goto out;
2246 }
2247
2248 lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
2249
2250 /* make sure we never get here while waiting for an ast to
2251 * fire. */
2252 BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
2253
2254 /* is this necessary? */
2255 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2256 lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
2257 spin_unlock_irqrestore(&lockres->l_lock, flags);
2258
2259 mlog(0, "lock %s\n", lockres->l_name);
2260
2261 status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
2262 lockres->l_ops->unlock_ast, lockres);
2263 if (status != DLM_NORMAL) {
2264 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2265 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2266 dlm_print_one_lock(lockres->l_lksb.lockid);
2267 BUG();
2268 }
2269 mlog(0, "lock %s, successful return from dlmunlock\n",
2270 lockres->l_name);
2271
2272 ocfs2_wait_on_busy_lock(lockres);
2273out:
2274 mlog_exit(0);
2275 return 0;
2276}
2277
2278/* Mark the lockres as being dropped. It will no longer be
2279 * queued if blocking, but we still may have to wait on it
2280 * being dequeued from the vote thread before we can consider
2281 * it safe to drop.
2282 *
2283 * You can *not* attempt to call cluster_lock on this lockres anymore. */
2284void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
2285{
2286 int status;
2287 struct ocfs2_mask_waiter mw;
2288 unsigned long flags;
2289
2290 ocfs2_init_mask_waiter(&mw);
2291
2292 spin_lock_irqsave(&lockres->l_lock, flags);
2293 lockres->l_flags |= OCFS2_LOCK_FREEING;
2294 while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
2295 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
2296 spin_unlock_irqrestore(&lockres->l_lock, flags);
2297
2298 mlog(0, "Waiting on lockres %s\n", lockres->l_name);
2299
2300 status = ocfs2_wait_for_mask(&mw);
2301 if (status)
2302 mlog_errno(status);
2303
2304 spin_lock_irqsave(&lockres->l_lock, flags);
2305 }
2306 spin_unlock_irqrestore(&lockres->l_lock, flags);
2307}
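/* For the expected pairing -- mark the lockres freeing, then drop it --
 * see ocfs2_drop_osb_locks() directly below. */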
2308
2309static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2310{
2311 int status;
2312
2313 mlog_entry_void();
2314
2315 ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
2316
2317 status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
2318 if (status < 0)
2319 mlog_errno(status);
2320
2321 ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
2322
2323 status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
2324 if (status < 0)
2325 mlog_errno(status);
2326
2327 mlog_exit(status);
2328}
2329
2330static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
2331{
2332 struct inode *inode = data;
2333
2334 /* the metadata lock requires a bit more work as we have an
2335 * LVB to worry about. */
2336 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2337 lockres->l_level == LKM_EXMODE &&
2338 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2339 __ocfs2_stuff_meta_lvb(inode);
2340}
2341
2342int ocfs2_drop_inode_locks(struct inode *inode)
2343{
2344 int status, err;
2345 struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2346
2347 mlog_entry_void();
2348
2349 /* No need to call ocfs2_mark_lockres_freeing here -
2350 * ocfs2_clear_inode has done it for us. */
2351
2352 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2353 &OCFS2_I(inode)->ip_data_lockres,
2354 NULL);
2355 if (err < 0)
2356 mlog_errno(err);
2357
2358 status = err;
2359
2360 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2361 &OCFS2_I(inode)->ip_meta_lockres,
2362 &meta_dcb);
2363 if (err < 0)
2364 mlog_errno(err);
2365 if (err < 0 && !status)
2366 status = err;
2367
2368 err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2369 &OCFS2_I(inode)->ip_rw_lockres,
2370 NULL);
2371 if (err < 0)
2372 mlog_errno(err);
2373 if (err < 0 && !status)
2374 status = err;
2375
2376 mlog_exit(status);
2377 return status;
2378}
2379
2380static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2381 int new_level)
2382{
2383 assert_spin_locked(&lockres->l_lock);
2384
2385 BUG_ON(lockres->l_blocking <= LKM_NLMODE);
2386
2387 if (lockres->l_level <= new_level) {
2388 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
2389 lockres->l_level, new_level);
2390 BUG();
2391 }
2392
2393 mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
2394 lockres->l_name, new_level, lockres->l_blocking);
2395
2396 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2397 lockres->l_requested = new_level;
2398 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2399}
2400
2401static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2402 struct ocfs2_lock_res *lockres,
2403 int new_level,
2404 int lvb)
2405{
2406 int ret, dlm_flags = LKM_CONVERT;
2407 enum dlm_status status;
2408
2409 mlog_entry_void();
2410
2411 if (lvb)
2412 dlm_flags |= LKM_VALBLK;
2413
2414 status = dlmlock(osb->dlm,
2415 new_level,
2416 &lockres->l_lksb,
2417 dlm_flags,
2418 lockres->l_name,
2419 lockres->l_ops->ast,
2420 lockres,
2421 lockres->l_ops->bast);
2422 if (status != DLM_NORMAL) {
2423 ocfs2_log_dlm_error("dlmlock", status, lockres);
2424 ret = -EINVAL;
2425 ocfs2_recover_from_dlm_error(lockres, 1);
2426 goto bail;
2427 }
2428
2429 ret = 0;
2430bail:
2431 mlog_exit(ret);
2432 return ret;
2433}
2434
2435/* returns 1 when the caller should unlock and call dlmunlock */
2436static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2437 struct ocfs2_lock_res *lockres)
2438{
2439 assert_spin_locked(&lockres->l_lock);
2440
2441 mlog_entry_void();
2442 mlog(0, "lock %s\n", lockres->l_name);
2443
2444 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2445 /* If we're already trying to cancel a lock conversion
2446 * then just drop the spinlock and allow the caller to
2447 * requeue this lock. */
2448
2449 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
2450 return 0;
2451 }
2452
2453 /* were we in a convert when the bast fired? */
2454 BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
2455 lockres->l_action != OCFS2_AST_DOWNCONVERT);
2456 /* set things up for the unlockast to know to just
2457 * clear out the ast_action and unset busy, etc. */
2458 lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
2459
2460 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
2461 "lock %s, invalid flags: 0x%lx\n",
2462 lockres->l_name, lockres->l_flags);
2463
2464 return 1;
2465}
2466
2467static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2468 struct ocfs2_lock_res *lockres)
2469{
2470 int ret;
2471 enum dlm_status status;
2472
2473 mlog_entry_void();
2474 mlog(0, "lock %s\n", lockres->l_name);
2475
2476 ret = 0;
2477 status = dlmunlock(osb->dlm,
2478 &lockres->l_lksb,
2479 LKM_CANCEL,
2480 lockres->l_ops->unlock_ast,
2481 lockres);
2482 if (status != DLM_NORMAL) {
2483 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2484 ret = -EINVAL;
2485 ocfs2_recover_from_dlm_error(lockres, 0);
2486 }
2487
2488 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2489
2490 mlog_exit(ret);
2491 return ret;
2492}
2493
2494static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
2495 struct ocfs2_lock_res *lockres,
2496 int new_level)
2497{
2498 int ret;
2499
2500 mlog_entry_void();
2501
2502 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2503
2504 if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
2505 ret = 0;
2506 mlog(0, "lockres %s currently being refreshed -- backing "
2507 "off!\n", lockres->l_name);
2508 } else if (new_level == LKM_PRMODE)
2509 ret = !lockres->l_ex_holders &&
2510 ocfs2_inode_fully_checkpointed(inode);
2511 else /* Must be NLMODE we're converting to. */
2512 ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
2513 ocfs2_inode_fully_checkpointed(inode);
2514
2515 mlog_exit(ret);
2516 return ret;
2517}
2518
2519static int ocfs2_do_unblock_meta(struct inode *inode,
2520 int *requeue)
2521{
2522 int new_level;
2523 int set_lvb = 0;
2524 int ret = 0;
2525 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
2526 unsigned long flags;
2527
2528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2529
2530 mlog_entry_void();
2531
2532 spin_lock_irqsave(&lockres->l_lock, flags);
2533
2534 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2535
2536 mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
2537 lockres->l_blocking);
2538
2539 BUG_ON(lockres->l_level != LKM_EXMODE &&
2540 lockres->l_level != LKM_PRMODE);
2541
2542 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2543 *requeue = 1;
2544 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2545 spin_unlock_irqrestore(&lockres->l_lock, flags);
2546 if (ret) {
2547 ret = ocfs2_cancel_convert(osb, lockres);
2548 if (ret < 0)
2549 mlog_errno(ret);
2550 }
2551 goto leave;
2552 }
2553
2554 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2555
2556 mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
2557 lockres->l_level, lockres->l_blocking, new_level);
2558
2559 if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
2560 if (lockres->l_level == LKM_EXMODE)
2561 set_lvb = 1;
2562
2563 /* If the lock hasn't been refreshed yet (rare), then
2564 * our in-memory inode values are old and we skip
2565 * stuffing the lvb. There's no need to actually clear
2566 * out the lvb here as its value is still valid. */
2567 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
2568 if (set_lvb)
2569 __ocfs2_stuff_meta_lvb(inode);
2570 } else
2571 mlog(0, "lockres %s: downconverting stale lock!\n",
2572 lockres->l_name);
2573
2574 mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
2575 "l_blocking=%d, new_level=%d\n",
2576 lockres->l_level, lockres->l_blocking, new_level);
2577
2578 ocfs2_prepare_downconvert(lockres, new_level);
2579 spin_unlock_irqrestore(&lockres->l_lock, flags);
2580 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
2581 goto leave;
2582 }
2583 if (!ocfs2_inode_fully_checkpointed(inode))
2584 ocfs2_start_checkpoint(osb);
2585
2586 *requeue = 1;
2587 spin_unlock_irqrestore(&lockres->l_lock, flags);
2588 ret = 0;
2589leave:
2590 mlog_exit(ret);
2591 return ret;
2592}
2593
2594static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
2595 struct ocfs2_lock_res *lockres,
2596 int *requeue,
2597 ocfs2_convert_worker_t *worker)
2598{
2599 unsigned long flags;
2600 int blocking;
2601 int new_level;
2602 int ret = 0;
2603
2604 mlog_entry_void();
2605
2606 spin_lock_irqsave(&lockres->l_lock, flags);
2607
2608 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2609
2610recheck:
2611 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2612 *requeue = 1;
2613 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2614 spin_unlock_irqrestore(&lockres->l_lock, flags);
2615 if (ret) {
2616 ret = ocfs2_cancel_convert(osb, lockres);
2617 if (ret < 0)
2618 mlog_errno(ret);
2619 }
2620 goto leave;
2621 }
2622
2623 /* if we're blocking an exclusive and we have *any* holders,
2624 * then requeue. */
2625 if ((lockres->l_blocking == LKM_EXMODE)
2626 && (lockres->l_ex_holders || lockres->l_ro_holders)) {
2627 spin_unlock_irqrestore(&lockres->l_lock, flags);
2628 *requeue = 1;
2629 ret = 0;
2630 goto leave;
2631 }
2632
2633 /* If it's a PR we're blocking, then only
2634 * requeue if we've got any EX holders */
2635 if (lockres->l_blocking == LKM_PRMODE &&
2636 lockres->l_ex_holders) {
2637 spin_unlock_irqrestore(&lockres->l_lock, flags);
2638 *requeue = 1;
2639 ret = 0;
2640 goto leave;
2641 }
2642
2643 /* If we get here, then we know that there are no more
2644 * incompatible holders (and anyone asking for an incompatible
2645 * lock is blocked). We can now downconvert the lock */
2646 if (!worker)
2647 goto downconvert;
2648
2649 /* Some lockres types want to do a bit of work before
2650 * downconverting a lock. Allow that here. The worker function
2651 * may sleep, so we save off a copy of what we're blocking as
2652 * it may change while we're not holding the spin lock. */
2653 blocking = lockres->l_blocking;
2654 spin_unlock_irqrestore(&lockres->l_lock, flags);
2655
2656 worker(lockres, blocking);
2657
2658 spin_lock_irqsave(&lockres->l_lock, flags);
2659 if (blocking != lockres->l_blocking) {
2660 /* If this changed underneath us, then we can't drop
2661 * it just yet. */
2662 goto recheck;
2663 }
2664
2665downconvert:
2666 *requeue = 0;
2667 new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2668
2669 ocfs2_prepare_downconvert(lockres, new_level);
2670 spin_unlock_irqrestore(&lockres->l_lock, flags);
2671 ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
2672leave:
2673 mlog_exit(ret);
2674 return ret;
2675}
2676
2677static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2678 int blocking)
2679{
2680 struct inode *inode;
2681 struct address_space *mapping;
2682
2683 mlog_entry_void();
2684
2685 inode = ocfs2_lock_res_inode(lockres);
2686 mapping = inode->i_mapping;
2687
2688 if (filemap_fdatawrite(mapping)) {
2689 mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!\n",
2690 OCFS2_I(inode)->ip_blkno);
2691 }
2692 sync_mapping_buffers(mapping);
2693 if (blocking == LKM_EXMODE) {
2694 truncate_inode_pages(mapping, 0);
2695 unmap_mapping_range(mapping, 0, 0, 0);
2696 } else {
2697 /* We only need to wait on the I/O if we're not also
2698 * truncating pages because truncate_inode_pages waits
2699 * for us above. We don't truncate pages if we're
2700 * blocking anything < EXMODE because we want to keep
2701 * them around in that case. */
2702 filemap_fdatawait(mapping);
2703 }
2704
2705 mlog_exit_void();
2706}
2707
2708int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
2709 int *requeue)
2710{
2711 int status;
2712 struct inode *inode;
2713 struct ocfs2_super *osb;
2714
2715 mlog_entry_void();
2716
2717 inode = ocfs2_lock_res_inode(lockres);
2718 osb = OCFS2_SB(inode->i_sb);
2719
2720 mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
2721
2722 status = ocfs2_generic_unblock_lock(osb,
2723 lockres,
2724 requeue,
2725 ocfs2_data_convert_worker);
2726 if (status < 0)
2727 mlog_errno(status);
2728
2729 mlog(0, "inode %"MLFu64", requeue = %d\n",
2730 OCFS2_I(inode)->ip_blkno, *requeue);
2731
2732 mlog_exit(status);
2733 return status;
2734}
2735
2736static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
2737 int *requeue)
2738{
2739 int status;
2740 struct inode *inode;
2741
2742 mlog_entry_void();
2743
2744 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2745
2746 inode = ocfs2_lock_res_inode(lockres);
2747
2748 status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
2749 lockres,
2750 requeue,
2751 NULL);
2752 if (status < 0)
2753 mlog_errno(status);
2754
2755 mlog_exit(status);
2756 return status;
2757}
2758
2759
2760int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
2761 int *requeue)
2762{
2763 int status;
2764 struct inode *inode;
2765
2766 mlog_entry_void();
2767
2768 inode = ocfs2_lock_res_inode(lockres);
2769
2770 mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
2771
2772 status = ocfs2_do_unblock_meta(inode, requeue);
2773 if (status < 0)
2774 mlog_errno(status);
2775
2776 mlog(0, "inode %"MLFu64", requeue = %d\n",
2777 OCFS2_I(inode)->ip_blkno, *requeue);
2778
2779 mlog_exit(status);
2780 return status;
2781}
2782
2783/* Generic unblock function for any lockres whose private data is an
2784 * ocfs2_super pointer. */
2785static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
2786 int *requeue)
2787{
2788 int status;
2789 struct ocfs2_super *osb;
2790
2791 mlog_entry_void();
2792
2793 mlog(0, "Unblock lockres %s\n", lockres->l_name);
2794
2795 osb = ocfs2_lock_res_super(lockres);
2796
2797 status = ocfs2_generic_unblock_lock(osb,
2798 lockres,
2799 requeue,
2800 NULL);
2801 if (status < 0)
2802 mlog_errno(status);
2803
2804 mlog_exit(status);
2805 return status;
2806}
2807
2808void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
2809 struct ocfs2_lock_res *lockres)
2810{
2811 int status;
2812 int requeue = 0;
2813 unsigned long flags;
2814
2815 /* Our reference to the lockres in this function can be
2816 * considered valid until we remove the OCFS2_LOCK_QUEUED
2817 * flag. */
2818
2819 mlog_entry_void();
2820
2821 BUG_ON(!lockres);
2822 BUG_ON(!lockres->l_ops);
2823 BUG_ON(!lockres->l_ops->unblock);
2824
2825 mlog(0, "lockres %s blocked.\n", lockres->l_name);
2826
2827 /* Detect whether a lock has been marked as going away while
2828 * the vote thread was processing other things. A lock can
2829 * still be marked with OCFS2_LOCK_FREEING after this check,
2830 * but short-circuiting here still saves us some
2831 * unnecessary work. */
2832 spin_lock_irqsave(&lockres->l_lock, flags);
2833 if (lockres->l_flags & OCFS2_LOCK_FREEING)
2834 goto unqueue;
2835 spin_unlock_irqrestore(&lockres->l_lock, flags);
2836
2837 status = lockres->l_ops->unblock(lockres, &requeue);
2838 if (status < 0)
2839 mlog_errno(status);
2840
2841 spin_lock_irqsave(&lockres->l_lock, flags);
2842unqueue:
2843 if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
2844 lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
2845 } else
2846 ocfs2_schedule_blocked_lock(osb, lockres);
2847
2848 mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
2849 requeue ? "yes" : "no");
2850 spin_unlock_irqrestore(&lockres->l_lock, flags);
2851
2852 mlog_exit_void();
2853}
2854
2855static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
2856 struct ocfs2_lock_res *lockres)
2857{
2858 mlog_entry_void();
2859
2860 assert_spin_locked(&lockres->l_lock);
2861
2862 if (lockres->l_flags & OCFS2_LOCK_FREEING) {
2863 /* Do not schedule a lock for downconvert when it's on
2864 * the way to destruction - any nodes wanting access
2865 * to the resource will get it soon. */
2866 mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
2867 lockres->l_name, lockres->l_flags);
2868 return;
2869 }
2870
2871 lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
2872
2873 spin_lock(&osb->vote_task_lock);
2874 if (list_empty(&lockres->l_blocked_list)) {
2875 list_add_tail(&lockres->l_blocked_list,
2876 &osb->blocked_lock_list);
2877 osb->blocked_lock_count++;
2878 }
2879 spin_unlock(&osb->vote_task_lock);
2880
2881 mlog_exit_void();
2882}
2883
2884/* This aids in debugging situations where a bad LVB might be involved. */
2885void ocfs2_dump_meta_lvb_info(u64 level,
2886 const char *function,
2887 unsigned int line,
2888 struct ocfs2_lock_res *lockres)
2889{
2890 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
2891
2892 mlog(level, "LVB information for %s (called from %s:%u):\n",
2893 lockres->l_name, function, line);
2894 mlog(level, "version: %u, clusters: %u\n",
2895 be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
2896 mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n",
2897 be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid),
2898 be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode));
2899 mlog(level, "nlink %u, atime_packed 0x%"MLFx64", "
2900 "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n",
2901 be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed),
2902 be64_to_cpu(lvb->lvb_ictime_packed),
2903 be64_to_cpu(lvb->lvb_imtime_packed));
2904}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
new file mode 100644
index 000000000000..8f2d1db2d9ea
--- /dev/null
+++ b/fs/ocfs2/dlmglue.h
@@ -0,0 +1,111 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.h
5 *
6 * Declarations for the OCFS2 DLM glue layer (cluster locking API)
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26
27#ifndef DLMGLUE_H
28#define DLMGLUE_H
29
30#define OCFS2_LVB_VERSION 2
31
32struct ocfs2_meta_lvb {
33 __be32 lvb_version;
34 __be32 lvb_iclusters;
35 __be32 lvb_iuid;
36 __be32 lvb_igid;
37 __be64 lvb_iatime_packed;
38 __be64 lvb_ictime_packed;
39 __be64 lvb_imtime_packed;
40 __be64 lvb_isize;
41 __be16 lvb_imode;
42 __be16 lvb_inlink;
43 __be32 lvb_reserved[3];
44};
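/* The fields above total 64 bytes, which matches the fixed size of the
 * DLM's lock value block; lvb_reserved pads the struct out to that size. */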
45
46/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
47/* don't wait on recovery. */
48#define OCFS2_META_LOCK_RECOVERY (0x01)
49/* Instruct the dlm not to queue ourselves on the other node. */
50#define OCFS2_META_LOCK_NOQUEUE (0x02)
51/* don't block waiting for the vote thread, instead return -EAGAIN */
52#define OCFS2_LOCK_NONBLOCK (0x04)
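/* A minimal caller sketch (hypothetical, not from this patch): a
 * best-effort exclusive metadata lock that refuses to queue behind
 * another node, using the declarations below:
 *
 *	status = ocfs2_meta_lock_full(inode, NULL, NULL, 1,
 *				      OCFS2_META_LOCK_NOQUEUE);
 *	if (status < 0)
 *		return status;	(lock was contended or an error occurred)
 *	...
 *	ocfs2_meta_unlock(inode, 1);
 */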
53
54int ocfs2_dlm_init(struct ocfs2_super *osb);
55void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
56void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
57void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
58 enum ocfs2_lock_type type,
59 struct inode *inode);
60void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
61int ocfs2_create_new_inode_locks(struct inode *inode);
62int ocfs2_drop_inode_locks(struct inode *inode);
63int ocfs2_data_lock_full(struct inode *inode,
64 int write,
65 int arg_flags);
66#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
67int ocfs2_data_lock_with_page(struct inode *inode,
68 int write,
69 struct page *page);
70void ocfs2_data_unlock(struct inode *inode,
71 int write);
72int ocfs2_rw_lock(struct inode *inode, int write);
73void ocfs2_rw_unlock(struct inode *inode, int write);
74int ocfs2_meta_lock_full(struct inode *inode,
75 struct ocfs2_journal_handle *handle,
76 struct buffer_head **ret_bh,
77 int ex,
78 int arg_flags);
79int ocfs2_meta_lock_with_page(struct inode *inode,
80 struct ocfs2_journal_handle *handle,
81 struct buffer_head **ret_bh,
82 int ex,
83 struct page *page);
84/* 99% of the time we don't want to supply any additional flags --
85 * those are for very specific cases only. */
86#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
87void ocfs2_meta_unlock(struct inode *inode,
88 int ex);
89int ocfs2_super_lock(struct ocfs2_super *osb,
90 int ex);
91void ocfs2_super_unlock(struct ocfs2_super *osb,
92 int ex);
93int ocfs2_rename_lock(struct ocfs2_super *osb);
94void ocfs2_rename_unlock(struct ocfs2_super *osb);
95void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
96
97/* for the vote thread */
98void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
99 struct ocfs2_lock_res *lockres);
100
101struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
102void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
103
104/* aids in debugging and tracking lvbs */
105void ocfs2_dump_meta_lvb_info(u64 level,
106 const char *function,
107 unsigned int line,
108 struct ocfs2_lock_res *lockres);
109#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
110
111#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
new file mode 100644
index 000000000000..f226b2207628
--- /dev/null
+++ b/fs/ocfs2/endian.h
@@ -0,0 +1,45 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * Copyright (C) 2005 Oracle. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 021110-1307, USA.
20 */
21
22#ifndef OCFS2_ENDIAN_H
23#define OCFS2_ENDIAN_H
24
25static inline void le16_add_cpu(__le16 *var, u16 val)
26{
27 *var = cpu_to_le16(le16_to_cpu(*var) + val);
28}
29
30static inline void le32_add_cpu(__le32 *var, u32 val)
31{
32 *var = cpu_to_le32(le32_to_cpu(*var) + val);
33}
34
35static inline void le32_and_cpu(__le32 *var, u32 val)
36{
37 *var = cpu_to_le32(le32_to_cpu(*var) & val);
38}
39
40static inline void be32_add_cpu(__be32 *var, u32 val)
41{
42 *var = cpu_to_be32(be32_to_cpu(*var) + val);
43}
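
/* A usage sketch (assumed caller, not from this patch): these helpers
 * fold the endian read-modify-write into one call when adjusting
 * on-disk, fixed-endian counters:
 *
 *	__le32 count = cpu_to_le32(5);
 *	le32_add_cpu(&count, 3);	count now holds cpu_to_le32(8)
 *
 * extent_map.c below uses le32_add_cpu() to grow e_clusters in place. */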
44
45#endif /* OCFS2_ENDIAN_H */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
new file mode 100644
index 000000000000..5810160d92a8
--- /dev/null
+++ b/fs/ocfs2/export.c
@@ -0,0 +1,248 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * export.c
5 *
6 * Functions to facilitate NFS exporting
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28
29#define MLOG_MASK_PREFIX ML_EXPORT
30#include <cluster/masklog.h>
31
32#include "ocfs2.h"
33
34#include "dir.h"
35#include "dlmglue.h"
36#include "export.h"
37#include "inode.h"
38
39#include "buffer_head_io.h"
40
41struct ocfs2_inode_handle
42{
43 u64 ih_blkno;
44 u32 ih_generation;
45};
46
47static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
48{
49 struct ocfs2_inode_handle *handle = vobjp;
50 struct inode *inode;
51 struct dentry *result;
52
53 mlog_entry("(0x%p, 0x%p)\n", sb, handle);
54
55 if (handle->ih_blkno == 0) {
56 mlog_errno(-ESTALE);
57 return ERR_PTR(-ESTALE);
58 }
59
60 inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
61
62 if (IS_ERR(inode)) {
63 mlog_errno(PTR_ERR(inode));
64 return (void *)inode;
65 }
66
67 if (handle->ih_generation != inode->i_generation) {
68 iput(inode);
69 mlog_errno(-ESTALE);
70 return ERR_PTR(-ESTALE);
71 }
72
73 result = d_alloc_anon(inode);
74
75 if (!result) {
76 iput(inode);
77 mlog_errno(-ENOMEM);
78 return ERR_PTR(-ENOMEM);
79 }
80
81 mlog_exit_ptr(result);
82 return result;
83}
84
85static struct dentry *ocfs2_get_parent(struct dentry *child)
86{
87 int status;
88 u64 blkno;
89 struct dentry *parent;
90 struct inode *inode;
91 struct inode *dir = child->d_inode;
92 struct buffer_head *dirent_bh = NULL;
93 struct ocfs2_dir_entry *dirent;
94
95 mlog_entry("(0x%p, '%.*s')\n", child,
96 child->d_name.len, child->d_name.name);
97
98 mlog(0, "find parent of directory %"MLFu64"\n",
99 OCFS2_I(dir)->ip_blkno);
100
101 status = ocfs2_meta_lock(dir, NULL, NULL, 0);
102 if (status < 0) {
103 if (status != -ENOENT)
104 mlog_errno(status);
105 parent = ERR_PTR(status);
106 goto bail;
107 }
108
109 status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
110 &dirent);
111 if (status < 0) {
112 parent = ERR_PTR(-ENOENT);
113 goto bail_unlock;
114 }
115
116 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
117 if (IS_ERR(inode)) {
118 mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
119 parent = ERR_PTR(-EACCES);
120 goto bail_unlock;
121 }
122
123 parent = d_alloc_anon(inode);
124 if (!parent) {
125 iput(inode);
126 parent = ERR_PTR(-ENOMEM);
127 }
128
129bail_unlock:
130 ocfs2_meta_unlock(dir, 0);
131
132 if (dirent_bh)
133 brelse(dirent_bh);
134
135bail:
136 mlog_exit_ptr(parent);
137
138 return parent;
139}
140
141static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
142 int connectable)
143{
144 struct inode *inode = dentry->d_inode;
145 int len = *max_len;
146 int type = 1;
147 u64 blkno;
148 u32 generation;
149
150 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
151 dentry->d_name.len, dentry->d_name.name,
152 fh, len, connectable);
153
154 if (len < 3 || (connectable && len < 6)) {
155 mlog(ML_ERROR, "fh buffer is too small for encoding\n");
156 type = 255;
157 goto bail;
158 }
159
160 blkno = OCFS2_I(inode)->ip_blkno;
161 generation = inode->i_generation;
162
163 mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
164 blkno, generation);
165
166 len = 3;
167 fh[0] = cpu_to_le32((u32)(blkno >> 32));
168 fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
169 fh[2] = cpu_to_le32(generation);
170
171 if (connectable && !S_ISDIR(inode->i_mode)) {
172 struct inode *parent;
173
174 spin_lock(&dentry->d_lock);
175
176 parent = dentry->d_parent->d_inode;
177 blkno = OCFS2_I(parent)->ip_blkno;
178 generation = parent->i_generation;
179
180 fh[3] = cpu_to_le32((u32)(blkno >> 32));
181 fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
182 fh[5] = cpu_to_le32(generation);
183
184 spin_unlock(&dentry->d_lock);
185
186 len = 6;
187 type = 2;
188
189 mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
190 blkno, generation);
191 }
192
193 *max_len = len;
194
195bail:
196 mlog_exit(type);
197 return type;
198}
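/* Resulting file handle layout, as implied by the code above:
 *
 *	fh[0] = blkno >> 32		fh[3] = parent blkno >> 32
 *	fh[1] = blkno & 0xffffffff	fh[4] = parent blkno & 0xffffffff
 *	fh[2] = i_generation		fh[5] = parent i_generation
 *
 * Type 1 handles carry only fh[0..2]; type 2 (connectable, non-directory)
 * handles append the parent in fh[3..5]. */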
199
200static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
201 int fh_len, int fileid_type,
202 int (*acceptable)(void *context,
203 struct dentry *de),
204 void *context)
205{
206 struct ocfs2_inode_handle handle, parent;
207 struct dentry *ret = NULL;
208
209 mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
210 sb, fh, fh_len, fileid_type, acceptable, context);
211
212 if (fh_len < 3 || fileid_type > 2)
213 goto bail;
214
215 if (fileid_type == 2) {
216 if (fh_len < 6)
217 goto bail;
218
219 parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
220 parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
221 parent.ih_generation = le32_to_cpu(fh[5]);
222
223 mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
224 parent.ih_blkno, parent.ih_generation);
225 }
226
227 handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
228 handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
229 handle.ih_generation = le32_to_cpu(fh[2]);
230
231 mlog(0, "Decoding fh: blkno: %"MLFu64", generation: %u\n",
232 handle.ih_blkno, handle.ih_generation);
233
234 ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
235 acceptable, context);
236
237bail:
238 mlog_exit_ptr(ret);
239 return ret;
240}
241
242struct export_operations ocfs2_export_ops = {
243 .decode_fh = ocfs2_decode_fh,
244 .encode_fh = ocfs2_encode_fh,
245
246 .get_parent = ocfs2_get_parent,
247 .get_dentry = ocfs2_get_dentry,
248};
diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h
new file mode 100644
index 000000000000..5b77ee7866ef
--- /dev/null
+++ b/fs/ocfs2/export.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * export.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_EXPORT_H
27#define OCFS2_EXPORT_H
28
29extern struct export_operations ocfs2_export_ops;
30
31#endif /* OCFS2_EXPORT_H */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
new file mode 100644
index 000000000000..f2fb40cd296a
--- /dev/null
+++ b/fs/ocfs2/extent_map.c
@@ -0,0 +1,994 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.c
5 *
6 * In-memory extent map for OCFS2. Man, this code was prettier in
7 * the library.
8 *
9 * Copyright (C) 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License, version 2, as published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/init.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/rbtree.h>
31
32#define MLOG_MASK_PREFIX ML_EXTENT_MAP
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "extent_map.h"
38#include "inode.h"
39#include "super.h"
40
41#include "buffer_head_io.h"
42
43
44/*
45 * SUCK SUCK SUCK
46 * Our headers are so bad that struct ocfs2_extent_map is in ocfs2.h
47 */
48
49struct ocfs2_extent_map_entry {
50 struct rb_node e_node;
51 int e_tree_depth;
52 struct ocfs2_extent_rec e_rec;
53};
54
55struct ocfs2_em_insert_context {
56 int need_left;
57 int need_right;
58 struct ocfs2_extent_map_entry *new_ent;
59 struct ocfs2_extent_map_entry *old_ent;
60 struct ocfs2_extent_map_entry *left_ent;
61 struct ocfs2_extent_map_entry *right_ent;
62};
63
64static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
65
66
67static struct ocfs2_extent_map_entry *
68ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
69 u32 cpos, u32 clusters,
70 struct rb_node ***ret_p,
71 struct rb_node **ret_parent);
72static int ocfs2_extent_map_insert(struct inode *inode,
73 struct ocfs2_extent_rec *rec,
74 int tree_depth);
75static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
76 struct ocfs2_extent_map_entry *ent);
77static int ocfs2_extent_map_find_leaf(struct inode *inode,
78 u32 cpos, u32 clusters,
79 struct ocfs2_extent_list *el);
80static int ocfs2_extent_map_lookup_read(struct inode *inode,
81 u32 cpos, u32 clusters,
82 struct ocfs2_extent_map_entry **ret_ent);
83static int ocfs2_extent_map_try_insert(struct inode *inode,
84 struct ocfs2_extent_rec *rec,
85 int tree_depth,
86 struct ocfs2_em_insert_context *ctxt);
87
88/* returns 1 only if the rec contains all the given clusters -- that is,
89 * the rec's cpos is <= the given cpos and the rec's endpoint (cpos +
90 * clusters) is >= the argument's endpoint */
91static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
92 u32 cpos, u32 clusters)
93{
94 if (le32_to_cpu(rec->e_cpos) > cpos)
95 return 0;
96 if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
97 le32_to_cpu(rec->e_clusters))
98 return 0;
99 return 1;
100}
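/* Worked example: a rec with e_cpos = 10 and e_clusters = 10 covers the
 * half-open range [10, 20). A query of cpos = 12, clusters = 5 (range
 * [12, 17)) is contained and returns 1; cpos = 15, clusters = 10 (range
 * [15, 25)) straddles the right edge and returns 0. */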
101
102
103/*
104 * Find an entry in the tree that intersects the region passed in.
105 * Note that this will find straddled intervals, it is up to the
106 * callers to enforce any boundary conditions.
107 *
108 * Callers must hold ip_lock. This lookup is not guaranteed to return
109 * a tree_depth 0 match, and as such can race inserts if the lock
110 * were not held.
111 *
112 * The rb_node garbage lets insertion share the search. Trivial
113 * callers pass NULL.
114 */
115static struct ocfs2_extent_map_entry *
116ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
117 u32 cpos, u32 clusters,
118 struct rb_node ***ret_p,
119 struct rb_node **ret_parent)
120{
121 struct rb_node **p = &em->em_extents.rb_node;
122 struct rb_node *parent = NULL;
123 struct ocfs2_extent_map_entry *ent = NULL;
124
125 while (*p)
126 {
127 parent = *p;
128 ent = rb_entry(parent, struct ocfs2_extent_map_entry,
129 e_node);
130 if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
131 p = &(*p)->rb_left;
132 ent = NULL;
133 } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
134 le32_to_cpu(ent->e_rec.e_clusters))) {
135 p = &(*p)->rb_right;
136 ent = NULL;
137 } else
138 break;
139 }
140
141 if (ret_p != NULL)
142 *ret_p = p;
143 if (ret_parent != NULL)
144 *ret_parent = parent;
145 return ent;
146}
147
148/*
149 * Find the leaf containing the interval we want. While we're on our
150 * way down the tree, fill in every record we see at any depth, because
151 * we might want it later.
152 *
153 * Note that this code is run without ip_lock. That's because it
154 * sleeps while reading. If someone is also filling the extent list at
155 * the same time we are, we might have to restart.
156 */
157static int ocfs2_extent_map_find_leaf(struct inode *inode,
158 u32 cpos, u32 clusters,
159 struct ocfs2_extent_list *el)
160{
161 int i, ret;
162 struct buffer_head *eb_bh = NULL;
163 u64 blkno;
164 u32 rec_end;
165 struct ocfs2_extent_block *eb;
166 struct ocfs2_extent_rec *rec;
167
168 /*
169 * The bh data containing the el cannot change here, because
170 * we hold alloc_sem. So we can do this without other
171 * locks.
172 */
173 while (el->l_tree_depth)
174 {
175 blkno = 0;
176 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
177 rec = &el->l_recs[i];
178 rec_end = (le32_to_cpu(rec->e_cpos) +
179 le32_to_cpu(rec->e_clusters));
180
181 ret = -EBADR;
182 if (rec_end > OCFS2_I(inode)->ip_clusters) {
183 mlog_errno(ret);
184 goto out_free;
185 }
186
187 if (rec_end <= cpos) {
188 ret = ocfs2_extent_map_insert(inode, rec,
189 le16_to_cpu(el->l_tree_depth));
190 if (ret && (ret != -EEXIST)) {
191 mlog_errno(ret);
192 goto out_free;
193 }
194 continue;
195 }
196 if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
197 ret = ocfs2_extent_map_insert(inode, rec,
198 le16_to_cpu(el->l_tree_depth));
199 if (ret && (ret != -EEXIST)) {
200 mlog_errno(ret);
201 goto out_free;
202 }
203 continue;
204 }
205
206 /*
207 * We've found a record that matches our
208 * interval. We don't insert it because we're
209 * about to traverse it.
210 */
211
212 /* Check to see if we're straddling */
213 ret = -ESRCH;
214 if (!ocfs2_extent_rec_contains_clusters(rec,
215 cpos,
216 clusters)) {
217 mlog_errno(ret);
218 goto out_free;
219 }
220
221 /*
222 * If we've already found a record, the el has
223 * two records covering the same interval.
224 * EEEK!
225 */
226 ret = -EBADR;
227 if (blkno) {
228 mlog_errno(ret);
229 goto out_free;
230 }
231
232 blkno = le64_to_cpu(rec->e_blkno);
233 }
234
235 /*
236 * We don't support holes, and we're still up
237 * in the branches, so we'd better have found someone
238 */
239 ret = -EBADR;
240 if (!blkno) {
241 mlog_errno(ret);
242 goto out_free;
243 }
244
245 if (eb_bh) {
246 brelse(eb_bh);
247 eb_bh = NULL;
248 }
249 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
250 blkno, &eb_bh, OCFS2_BH_CACHED,
251 inode);
252 if (ret) {
253 mlog_errno(ret);
254 goto out_free;
255 }
256 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
257 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
258 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
259 ret = -EIO;
260 goto out_free;
261 }
262 el = &eb->h_list;
263 }
264
265 if (el->l_tree_depth)
266 BUG();
267
268 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
269 rec = &el->l_recs[i];
270 ret = ocfs2_extent_map_insert(inode, rec,
271 le16_to_cpu(el->l_tree_depth));
272 if (ret) {
273 mlog_errno(ret);
274 goto out_free;
275 }
276 }
277
278 ret = 0;
279
280out_free:
281 if (eb_bh)
282 brelse(eb_bh);
283
284 return ret;
285}
286
287/*
288 * This lookup actually will read from disk. It has one invariant:
289 * It will never re-traverse blocks. This means that all inserts should
290 * be new regions or more granular regions (both allowed by insert).
291 */
292static int ocfs2_extent_map_lookup_read(struct inode *inode,
293 u32 cpos,
294 u32 clusters,
295 struct ocfs2_extent_map_entry **ret_ent)
296{
297 int ret;
298 u64 blkno;
299 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
300 struct ocfs2_extent_map_entry *ent;
301 struct buffer_head *bh = NULL;
302 struct ocfs2_extent_block *eb;
303 struct ocfs2_dinode *di;
304 struct ocfs2_extent_list *el;
305
306 spin_lock(&OCFS2_I(inode)->ip_lock);
307 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
308 if (ent) {
309 if (!ent->e_tree_depth) {
310 spin_unlock(&OCFS2_I(inode)->ip_lock);
311 *ret_ent = ent;
312 return 0;
313 }
314 blkno = le64_to_cpu(ent->e_rec.e_blkno);
315 spin_unlock(&OCFS2_I(inode)->ip_lock);
316
317 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
318 OCFS2_BH_CACHED, inode);
319 if (ret) {
320 mlog_errno(ret);
321 if (bh)
322 brelse(bh);
323 return ret;
324 }
325 eb = (struct ocfs2_extent_block *)bh->b_data;
326 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
327 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
328 brelse(bh);
329 return -EIO;
330 }
331 el = &eb->h_list;
332 } else {
333 spin_unlock(&OCFS2_I(inode)->ip_lock);
334
335 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
336 OCFS2_I(inode)->ip_blkno, &bh,
337 OCFS2_BH_CACHED, inode);
338 if (ret) {
339 mlog_errno(ret);
340 if (bh)
341 brelse(bh);
342 return ret;
343 }
344 di = (struct ocfs2_dinode *)bh->b_data;
345 if (!OCFS2_IS_VALID_DINODE(di)) {
346 brelse(bh);
347 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
348 return -EIO;
349 }
350 el = &di->id2.i_list;
351 }
352
353 ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
354 brelse(bh);
355 if (ret) {
356 mlog_errno(ret);
357 return ret;
358 }
359
360 ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
361 if (!ent) {
362 ret = -ESRCH;
363 mlog_errno(ret);
364 return ret;
365 }
366
367 if (ent->e_tree_depth)
368 BUG(); /* FIXME: Make sure this isn't a corruption */
369
370 *ret_ent = ent;
371
372 return 0;
373}
374
375/*
376 * Callers must hold ip_lock. This can insert pieces of the tree,
377 * thus racing lookup if the lock weren't held.
378 */
379static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
380 struct ocfs2_extent_map_entry *ent)
381{
382 struct rb_node **p, *parent;
383 struct ocfs2_extent_map_entry *old_ent;
384
385 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
386 le32_to_cpu(ent->e_rec.e_clusters),
387 &p, &parent);
388 if (old_ent)
389 return -EEXIST;
390
391 rb_link_node(&ent->e_node, parent, p);
392 rb_insert_color(&ent->e_node, &em->em_extents);
393
394 return 0;
395}
396
397
398/*
399 * Simple rule: on any return code other than -EAGAIN, anything left
400 * in the insert_context will be freed.
401 */
402static int ocfs2_extent_map_try_insert(struct inode *inode,
403 struct ocfs2_extent_rec *rec,
404 int tree_depth,
405 struct ocfs2_em_insert_context *ctxt)
406{
407 int ret;
408 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
409 struct ocfs2_extent_map_entry *old_ent;
410
411 ctxt->need_left = 0;
412 ctxt->need_right = 0;
413 ctxt->old_ent = NULL;
414
415 spin_lock(&OCFS2_I(inode)->ip_lock);
416 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
417 if (!ret) {
418 ctxt->new_ent = NULL;
419 goto out_unlock;
420 }
421
422 old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
423 le32_to_cpu(rec->e_clusters), NULL,
424 NULL);
425
426 if (!old_ent)
427 BUG();
428
429 ret = -EEXIST;
430 if (old_ent->e_tree_depth < tree_depth)
431 goto out_unlock;
432
433 if (old_ent->e_tree_depth == tree_depth) {
434 if (!memcmp(rec, &old_ent->e_rec,
435 sizeof(struct ocfs2_extent_rec)))
436 ret = 0;
437
438 /* FIXME: Should this be ESRCH/EBADR??? */
439 goto out_unlock;
440 }
441
442 /*
443 * We do it in this order specifically so that no actual tree
444 * changes occur until we have all the pieces we need. We
445 * don't want malloc failures to leave an inconsistent tree.
446 * Whenever we drop the lock, another process could be
447 * inserting. Also note that, if another process just beat us
448 * to an insert, we might not need the same pieces we needed
449 * the first go round. In the end, the pieces we need will
450 * be used, and the pieces we don't will be freed.
451 */
452 ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
453 le32_to_cpu(old_ent->e_rec.e_cpos));
454 ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
455 le32_to_cpu(old_ent->e_rec.e_clusters)) >
456 (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
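	/* Worked example: if old_ent covers [0, 20) and the new rec covers
	 * [5, 10), need_left and need_right are both set; left_ent is
	 * clipped to [0, 5), right_ent to [10, 20), and the new entry
	 * fills [5, 10). */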
457 ret = -EAGAIN;
458 if (ctxt->need_left) {
459 if (!ctxt->left_ent)
460 goto out_unlock;
461 *(ctxt->left_ent) = *old_ent;
462 ctxt->left_ent->e_rec.e_clusters =
463 cpu_to_le32(le32_to_cpu(rec->e_cpos) -
464 le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
465 }
466 if (ctxt->need_right) {
467 if (!ctxt->right_ent)
468 goto out_unlock;
469 *(ctxt->right_ent) = *old_ent;
470 ctxt->right_ent->e_rec.e_cpos =
471 cpu_to_le32(le32_to_cpu(rec->e_cpos) +
472 le32_to_cpu(rec->e_clusters));
473 ctxt->right_ent->e_rec.e_clusters =
474 cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
475 le32_to_cpu(old_ent->e_rec.e_clusters)) -
476 le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
477 }
478
479 rb_erase(&old_ent->e_node, &em->em_extents);
480 /* Now that it's erased, set it up for deletion */
481 ctxt->old_ent = old_ent;
482
483 if (ctxt->need_left) {
484 ret = ocfs2_extent_map_insert_entry(em,
485 ctxt->left_ent);
486 if (ret)
487 goto out_unlock;
488 ctxt->left_ent = NULL;
489 }
490
491 if (ctxt->need_right) {
492 ret = ocfs2_extent_map_insert_entry(em,
493 ctxt->right_ent);
494 if (ret)
495 goto out_unlock;
496 ctxt->right_ent = NULL;
497 }
498
499 ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
500
501 if (!ret)
502 ctxt->new_ent = NULL;
503
504out_unlock:
505 spin_unlock(&OCFS2_I(inode)->ip_lock);
506
507 return ret;
508}
509
510
511static int ocfs2_extent_map_insert(struct inode *inode,
512 struct ocfs2_extent_rec *rec,
513 int tree_depth)
514{
515 int ret;
516 struct ocfs2_em_insert_context ctxt = {0, };
517
518 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
519 OCFS2_I(inode)->ip_map.em_clusters) {
520 ret = -EBADR;
521 mlog_errno(ret);
522 return ret;
523 }
524
525 /* Zero e_clusters means a truncated tail record. It better be EOF */
526 if (!rec->e_clusters) {
527 if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
528 OCFS2_I(inode)->ip_map.em_clusters) {
529 ret = -EBADR;
530 mlog_errno(ret);
531 return ret;
532 }
533
534 /* Ignore the truncated tail */
535 return 0;
536 }
537
538 ret = -ENOMEM;
539 ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
540 GFP_KERNEL);
541 if (!ctxt.new_ent) {
542 mlog_errno(ret);
543 return ret;
544 }
545
546 ctxt.new_ent->e_rec = *rec;
547 ctxt.new_ent->e_tree_depth = tree_depth;
548
549 do {
550 ret = -ENOMEM;
551 if (ctxt.need_left && !ctxt.left_ent) {
552 ctxt.left_ent =
553 kmem_cache_alloc(ocfs2_em_ent_cachep,
554 GFP_KERNEL);
555 if (!ctxt.left_ent)
556 break;
557 }
558 if (ctxt.need_right && !ctxt.right_ent) {
559 ctxt.right_ent =
560 kmem_cache_alloc(ocfs2_em_ent_cachep,
561 GFP_KERNEL);
562 if (!ctxt.right_ent)
563 break;
564 }
565
566 ret = ocfs2_extent_map_try_insert(inode, rec,
567 tree_depth, &ctxt);
568 } while (ret == -EAGAIN);
569
570 if (ret < 0)
571 mlog_errno(ret);
572
573 if (ctxt.left_ent)
574 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
575 if (ctxt.right_ent)
576 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
577 if (ctxt.old_ent)
578 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
579 if (ctxt.new_ent)
580 kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
581
582 return ret;
583}
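/* The -EAGAIN loop above is an allocate-outside-the-lock pattern:
 * ocfs2_extent_map_try_insert() decides under ip_lock which split pieces
 * it is missing (need_left/need_right) and returns -EAGAIN, letting this
 * function do the GFP_KERNEL allocations before retrying. */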
584
585/*
586 * Append this record to the tail of the extent map. It must be
587 * tree_depth 0. The record might be an extension of an existing
588 * record, and that case needs to be handled, e.g.:
589 *
590 * Existing record in the extent map:
591 *
592 * cpos = 10, len = 10
593 * |---------|
594 *
595 * New Record:
596 *
597 * cpos = 10, len = 20
598 * |------------------|
599 *
600 * The passed record is the new on-disk record. The new_clusters value
601 * is how many clusters were added to the file. If the append is a
602 * contiguous append, new_clusters has been added to
603 * rec->e_clusters. If the append is an entirely new extent, then
604 * rec->e_clusters == new_clusters.
605 */
606int ocfs2_extent_map_append(struct inode *inode,
607 struct ocfs2_extent_rec *rec,
608 u32 new_clusters)
609{
610 int ret;
611 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
612 struct ocfs2_extent_map_entry *ent;
613 struct ocfs2_extent_rec *old;
614
615 BUG_ON(!new_clusters);
616 BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
617
618 if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
619 /*
620 * Size changed underneath us on disk. Drop any
621 * straddling records and update our idea of
622 * i_clusters
623 */
624 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
625 em->em_clusters = OCFS2_I(inode)->ip_clusters;
626 }
627
628 mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
629 le32_to_cpu(rec->e_clusters)) !=
630 (em->em_clusters + new_clusters),
631 "Inode %"MLFu64":\n"
632 "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
633 "em->em_clusters = %u + new_clusters = %u = %u\n",
634 OCFS2_I(inode)->ip_blkno,
635 le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
636 le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
637 em->em_clusters, new_clusters,
638 em->em_clusters + new_clusters);
639
640 em->em_clusters += new_clusters;
641
642 ret = -ENOENT;
643 if (le32_to_cpu(rec->e_clusters) > new_clusters) {
644 /* This is a contiguous append */
645 ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
646 NULL, NULL);
647 if (ent) {
648 old = &ent->e_rec;
649 BUG_ON((le32_to_cpu(rec->e_cpos) +
650 le32_to_cpu(rec->e_clusters)) !=
651 (le32_to_cpu(old->e_cpos) +
652 le32_to_cpu(old->e_clusters) +
653 new_clusters));
654 if (ent->e_tree_depth == 0) {
655 BUG_ON(le32_to_cpu(old->e_cpos) !=
656 le32_to_cpu(rec->e_cpos));
657 BUG_ON(le64_to_cpu(old->e_blkno) !=
658 le64_to_cpu(rec->e_blkno));
659 ret = 0;
660 }
661 /*
662 * Let non-leafs fall through as -ENOENT to
663 * force insertion of the new leaf.
664 */
665 le32_add_cpu(&old->e_clusters, new_clusters);
666 }
667 }
668
669 if (ret == -ENOENT)
670 ret = ocfs2_extent_map_insert(inode, rec, 0);
671 if (ret < 0)
672 mlog_errno(ret);
673 return ret;
674}
675
676#if 0
677/* Code here is included but compiled out, as it completes the extent
678 * map API and may be used in the future. */
679
680/*
681 * Look up the record containing this cluster offset. This record is
682 * part of the extent map. Do not free it. Any changes you make to
683 * it will reflect in the extent map. So, if your last extent
684 * is (cpos = 10, clusters = 10) and you truncate the file by 5
685 * clusters, you can do:
686 *
687 * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
688 * rec->e_clusters -= 5;
689 *
690 * The lookup does not read from disk. If the map isn't filled in for
691 * an entry, you won't find it.
692 *
693 * Also note that the returned record is valid until alloc_sem is
694 * dropped. After that, truncate and extend can happen. Caveat Emptor.
695 */
696int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
697 struct ocfs2_extent_rec **rec,
698 int *tree_depth)
699{
700 int ret = -ENOENT;
701 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
702 struct ocfs2_extent_map_entry *ent;
703
704 *rec = NULL;
705
706 if (cpos >= OCFS2_I(inode)->ip_clusters)
707 return -EINVAL;
708
709 if (cpos >= em->em_clusters) {
710 /*
711 * Size changed underneath us on disk. Drop any
712 * straddling records and update our idea of
713 * i_clusters
714 */
715 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
716 em->em_clusters = OCFS2_I(inode)->ip_clusters;
717 }
718
719 ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
720 NULL, NULL);
721
722 if (ent) {
723 *rec = &ent->e_rec;
724 if (tree_depth)
725 *tree_depth = ent->e_tree_depth;
726 ret = 0;
727 }
728
729 return ret;
730}
731
732int ocfs2_extent_map_get_clusters(struct inode *inode,
733 u32 v_cpos, int count,
734 u32 *p_cpos, int *ret_count)
735{
736 int ret;
737 u32 coff, ccount;
738 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
739 struct ocfs2_extent_map_entry *ent = NULL;
740
741 *p_cpos = ccount = 0;
742
743 if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
744 return -EINVAL;
745
746 if ((v_cpos + count) > em->em_clusters) {
747 /*
748 * Size changed underneath us on disk. Drop any
749 * straddling records and update our idea of
750 * i_clusters
751 */
752 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
753 em->em_clusters = OCFS2_I(inode)->ip_clusters;
754 }
755
756
757 ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
758 if (ret)
759 return ret;
760
761 if (ent) {
762 /* We should never find ourselves straddling an interval */
763 if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
764 v_cpos,
765 count))
766 return -ESRCH;
767
768 coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
769 *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
770 le64_to_cpu(ent->e_rec.e_blkno)) +
771 coff;
772
773 if (ret_count)
774 *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
775
776 return 0;
777 }
778
779
780 return -ENOENT;
781}
782
783#endif /* 0 */
784
785int ocfs2_extent_map_get_blocks(struct inode *inode,
786 u64 v_blkno, int count,
787 u64 *p_blkno, int *ret_count)
788{
789 int ret;
790 u64 boff;
791 u32 cpos, clusters;
792 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
793 struct ocfs2_extent_map_entry *ent = NULL;
794 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
795 struct ocfs2_extent_rec *rec;
796
797 *p_blkno = 0;
798
799 cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
800 clusters = ocfs2_blocks_to_clusters(inode->i_sb,
801 (u64)count + bpc - 1);
802 if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
803 ret = -EINVAL;
804 mlog_errno(ret);
805 return ret;
806 }
807
808 if ((cpos + clusters) > em->em_clusters) {
809 /*
810 * Size changed underneath us on disk. Drop any
811 * straddling records and update our idea of
812 * i_clusters
813 */
814 ocfs2_extent_map_drop(inode, em->em_clusters - 1);
815 em->em_clusters = OCFS2_I(inode)->ip_clusters;
816 }
817
818 ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
819 if (ret) {
820 mlog_errno(ret);
821 return ret;
822 }
823
824 if (ent)
825 {
826 rec = &ent->e_rec;
827
828 /* We should never find ourselves straddling an interval */
829 if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
830 ret = -ESRCH;
831 mlog_errno(ret);
832 return ret;
833 }
834
835 boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
836 le32_to_cpu(rec->e_cpos));
837 boff += (v_blkno & (u64)(bpc - 1));
838 *p_blkno = le64_to_cpu(rec->e_blkno) + boff;
839
840 if (ret_count) {
841 *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
842 le32_to_cpu(rec->e_clusters)) - boff;
843 }
844
845 return 0;
846 }
847
848 return -ENOENT;
849}
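/* Worked example (hypothetical geometry, 8 blocks per cluster): for
 * v_blkno = 21, cpos = 21 / 8 = 2. If the matching rec has e_cpos = 0 and
 * e_blkno = B, then boff = clusters_to_blocks(2) + (21 & 7) = 16 + 5 = 21,
 * so *p_blkno = B + 21. */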
850
851int ocfs2_extent_map_init(struct inode *inode)
852{
853 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
854
855 em->em_extents = RB_ROOT;
856 em->em_clusters = 0;
857
858 return 0;
859}
860
861/* Needs the lock */
862static void __ocfs2_extent_map_drop(struct inode *inode,
863 u32 new_clusters,
864 struct rb_node **free_head,
865 struct ocfs2_extent_map_entry **tail_ent)
866{
867 struct rb_node *node, *next;
868 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
869 struct ocfs2_extent_map_entry *ent;
870
871 *free_head = NULL;
872
873 ent = NULL;
874 node = rb_last(&em->em_extents);
875 while (node)
876 {
877 next = rb_prev(node);
878
879 ent = rb_entry(node, struct ocfs2_extent_map_entry,
880 e_node);
881 if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
882 break;
883
884 rb_erase(&ent->e_node, &em->em_extents);
885
886 node->rb_right = *free_head;
887 *free_head = node;
888
889 ent = NULL;
890 node = next;
891 }
892
893 /* Do we have an entry straddling new_clusters? */
894 if (tail_ent) {
895 if (ent &&
896 ((le32_to_cpu(ent->e_rec.e_cpos) +
897 le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
898 *tail_ent = ent;
899 else
900 *tail_ent = NULL;
901 }
902}
903
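/*
 * The drop is done in two phases so that nothing is freed while ip_lock
 * is held: __ocfs2_extent_map_drop() unlinks entries under the lock and
 * threads them onto a singly linked list (through rb_right), and
 * __ocfs2_extent_map_drop_cleanup() walks that list afterwards and
 * returns the entries to the slab cache.
 */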
904static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
905{
906 struct rb_node *node;
907 struct ocfs2_extent_map_entry *ent;
908
909 while (free_head) {
910 node = free_head;
911 free_head = node->rb_right;
912
913 ent = rb_entry(node, struct ocfs2_extent_map_entry,
914 e_node);
915 kmem_cache_free(ocfs2_em_ent_cachep, ent);
916 }
917}
918
919/*
920 * Remove all entries past new_clusters, inclusive of an entry that
921 * contains new_clusters. This is effectively a cache forget.
922 *
923 * If you want to also clip the last extent by some number of clusters,
924 * you need to call ocfs2_extent_map_trunc().
925 * This code does not check or modify ip_clusters.
926 */
927int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
928{
929 struct rb_node *free_head = NULL;
930 struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
931 struct ocfs2_extent_map_entry *ent;
932
933 spin_lock(&OCFS2_I(inode)->ip_lock);
934
935 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
936
937 if (ent) {
938 rb_erase(&ent->e_node, &em->em_extents);
939 ent->e_node.rb_right = free_head;
940 free_head = &ent->e_node;
941 }
942
943 spin_unlock(&OCFS2_I(inode)->ip_lock);
944
945 if (free_head)
946 __ocfs2_extent_map_drop_cleanup(free_head);
947
948 return 0;
949}
950
951/*
952 * Remove all entries past new_clusters and also clip any extent
953 * straddling new_clusters, if there is one. This does not check
954 * or modify ip_clusters
955 */
956int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
957{
958 struct rb_node *free_head = NULL;
959 struct ocfs2_extent_map_entry *ent = NULL;
960
961 spin_lock(&OCFS2_I(inode)->ip_lock);
962
963 __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
964
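	/*
	 * Unlike ocfs2_extent_map_drop(), a straddling extent is kept
	 * here and merely clipped so that it ends exactly at
	 * new_clusters.
	 */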
965 if (ent)
966 ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
967 le32_to_cpu(ent->e_rec.e_cpos));
968
969 OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
970
971 spin_unlock(&OCFS2_I(inode)->ip_lock);
972
973 if (free_head)
974 __ocfs2_extent_map_drop_cleanup(free_head);
975
976 return 0;
977}
978
979int __init init_ocfs2_extent_maps(void)
980{
981 ocfs2_em_ent_cachep =
982 kmem_cache_create("ocfs2_em_ent",
983 sizeof(struct ocfs2_extent_map_entry),
984 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
985 if (!ocfs2_em_ent_cachep)
986 return -ENOMEM;
987
988 return 0;
989}
990
991void __exit exit_ocfs2_extent_maps(void)
992{
993 kmem_cache_destroy(ocfs2_em_ent_cachep);
994}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
new file mode 100644
index 000000000000..fa3745efa886
--- /dev/null
+++ b/fs/ocfs2/extent_map.h
@@ -0,0 +1,46 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * extent_map.h
5 *
6 * In-memory file extent mappings for OCFS2.
7 *
8 * Copyright (C) 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 021110-1307, USA.
23 */
24
25#ifndef _EXTENT_MAP_H
26#define _EXTENT_MAP_H
27
28int init_ocfs2_extent_maps(void);
29void exit_ocfs2_extent_maps(void);
30
31/*
32 * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
33 * to be held. The allocation cannot change at all while the map is
34 * in the process of being updated.
35 */
36int ocfs2_extent_map_init(struct inode *inode);
37int ocfs2_extent_map_append(struct inode *inode,
38 struct ocfs2_extent_rec *rec,
39 u32 new_clusters);
40int ocfs2_extent_map_get_blocks(struct inode *inode,
41 u64 v_blkno, int count,
42 u64 *p_blkno, int *ret_count);
43int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
44int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
45
46#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
new file mode 100644
index 000000000000..72ae9e3306f4
--- /dev/null
+++ b/fs/ocfs2/file.c
@@ -0,0 +1,1237 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.c
5 *
6 * File open, close, extend, truncate
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/uio.h>
32
33#define MLOG_MASK_PREFIX ML_INODE
34#include <cluster/masklog.h>
35
36#include "ocfs2.h"
37
38#include "alloc.h"
39#include "aops.h"
40#include "dir.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "file.h"
44#include "sysfile.h"
45#include "inode.h"
46#include "journal.h"
47#include "mmap.h"
48#include "suballoc.h"
49#include "super.h"
50
51#include "buffer_head_io.h"
52
53static int ocfs2_sync_inode(struct inode *inode)
54{
55 filemap_fdatawrite(inode->i_mapping);
56 return sync_mapping_buffers(inode->i_mapping);
57}
58
59static int ocfs2_file_open(struct inode *inode, struct file *file)
60{
61 int status;
62 int mode = file->f_flags;
63 struct ocfs2_inode_info *oi = OCFS2_I(inode);
64
65 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
66 file->f_dentry->d_name.len, file->f_dentry->d_name.name);
67
68 spin_lock(&oi->ip_lock);
69
70 /* Check that the inode hasn't been wiped from disk by another
71 * node. If it hasn't then we're safe as long as we hold the
72 * spin lock until our increment of open count. */
73 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
74 spin_unlock(&oi->ip_lock);
75
76 status = -ENOENT;
77 goto leave;
78 }
79
80 if (mode & O_DIRECT)
81 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
82
83 oi->ip_open_count++;
84 spin_unlock(&oi->ip_lock);
85 status = 0;
86leave:
87 mlog_exit(status);
88 return status;
89}
90
91static int ocfs2_file_release(struct inode *inode, struct file *file)
92{
93 struct ocfs2_inode_info *oi = OCFS2_I(inode);
94
95 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
96 file->f_dentry->d_name.len,
97 file->f_dentry->d_name.name);
98
99 spin_lock(&oi->ip_lock);
100 if (!--oi->ip_open_count)
101 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
102 spin_unlock(&oi->ip_lock);
103
104 mlog_exit(0);
105
106 return 0;
107}
108
109static int ocfs2_sync_file(struct file *file,
110 struct dentry *dentry,
111 int datasync)
112{
113 int err = 0;
114 journal_t *journal;
115 struct inode *inode = dentry->d_inode;
116 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
117
118 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
119 dentry->d_name.len, dentry->d_name.name);
120
121 err = ocfs2_sync_inode(dentry->d_inode);
122 if (err)
123 goto bail;
124
125 journal = osb->journal->j_journal;
126 err = journal_force_commit(journal);
127
128bail:
129 mlog_exit(err);
130
131 return (err < 0) ? -EIO : 0;
132}
133
134int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
135 struct inode *inode,
136 struct buffer_head *fe_bh,
137 u64 new_i_size)
138{
139 int status;
140
141 mlog_entry_void();
142 i_size_write(inode, new_i_size);
143 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
144 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
145
146 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
147 if (status < 0) {
148 mlog_errno(status);
149 goto bail;
150 }
151
152bail:
153 mlog_exit(status);
154 return status;
155}
156
157static int ocfs2_simple_size_update(struct inode *inode,
158 struct buffer_head *di_bh,
159 u64 new_i_size)
160{
161 int ret;
162 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
163 struct ocfs2_journal_handle *handle = NULL;
164
165 handle = ocfs2_start_trans(osb, NULL,
166 OCFS2_INODE_UPDATE_CREDITS);
167 if (handle == NULL) {
168 ret = -ENOMEM;
169 mlog_errno(ret);
170 goto out;
171 }
172
173 ret = ocfs2_set_inode_size(handle, inode, di_bh,
174 new_i_size);
175 if (ret < 0)
176 mlog_errno(ret);
177
178 ocfs2_commit_trans(handle);
179out:
180 return ret;
181}
182
183static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
184 struct inode *inode,
185 struct buffer_head *fe_bh,
186 u64 new_i_size)
187{
188 int status;
189 struct ocfs2_journal_handle *handle;
190
191 mlog_entry_void();
192
193 /* TODO: This needs to actually orphan the inode in this
194 * transaction. */
195
196 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
197 if (IS_ERR(handle)) {
198 status = PTR_ERR(handle);
199 mlog_errno(status);
200 goto out;
201 }
202
203 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
204 if (status < 0)
205 mlog_errno(status);
206
207 ocfs2_commit_trans(handle);
208out:
209 mlog_exit(status);
210 return status;
211}
212
213static int ocfs2_truncate_file(struct inode *inode,
214 struct buffer_head *di_bh,
215 u64 new_i_size)
216{
217 int status = 0;
218 struct ocfs2_dinode *fe = NULL;
219 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
220 struct ocfs2_truncate_context *tc = NULL;
221
 222	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64")\n",
223 OCFS2_I(inode)->ip_blkno, new_i_size);
224
225 truncate_inode_pages(inode->i_mapping, new_i_size);
226
227 fe = (struct ocfs2_dinode *) di_bh->b_data;
228 if (!OCFS2_IS_VALID_DINODE(fe)) {
229 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
230 status = -EIO;
231 goto bail;
232 }
233
234 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
235 "Inode %"MLFu64", inode i_size = %lld != di "
236 "i_size = %"MLFu64", i_flags = 0x%x\n",
237 OCFS2_I(inode)->ip_blkno,
238 i_size_read(inode),
239 le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
240
241 if (new_i_size > le64_to_cpu(fe->i_size)) {
242 mlog(0, "asked to truncate file with size (%"MLFu64") "
243 "to size (%"MLFu64")!\n",
244 le64_to_cpu(fe->i_size), new_i_size);
245 status = -EINVAL;
246 mlog_errno(status);
247 goto bail;
248 }
249
250 mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
251 le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
252
 253	/* let's handle the simple truncate cases before doing any more
254 * cluster locking. */
255 if (new_i_size == le64_to_cpu(fe->i_size))
256 goto bail;
257
258 if (le32_to_cpu(fe->i_clusters) ==
259 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
260 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
261 fe->i_clusters);
 262		/* No allocation change is required, so let's fast-path
263 * this truncate. */
264 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
265 if (status < 0)
266 mlog_errno(status);
267 goto bail;
268 }
269
270 /* This forces other nodes to sync and drop their pages */
271 status = ocfs2_data_lock(inode, 1);
272 if (status < 0) {
273 mlog_errno(status);
274 goto bail;
275 }
276 ocfs2_data_unlock(inode, 1);
277
 278	/* alright, we're going to need to do a full-blown alloc size
279 * change. Orphan the inode so that recovery can complete the
280 * truncate if necessary. This does the task of marking
281 * i_size. */
282 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
283 if (status < 0) {
284 mlog_errno(status);
285 goto bail;
286 }
287
288 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
289 if (status < 0) {
290 mlog_errno(status);
291 goto bail;
292 }
293
294 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
295 if (status < 0) {
296 mlog_errno(status);
297 goto bail;
298 }
299
300 /* TODO: orphan dir cleanup here. */
301bail:
302
303 mlog_exit(status);
304 return status;
305}
306
307/*
 308 * Extend the allocation only, here; we'll update all the on-disk
 309 * metadata and oip->alloc_size.
 310 *
 311 * Expects everything to be locked, a transaction started, and enough
 312 * data / metadata reservations in the contexts.
 313 *
 314 * Will return -EAGAIN, and a reason, if a restart is needed.
 315 * If passed in, *reason_ret will always be set, even on error.
316 */
317int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
318 struct inode *inode,
319 u32 clusters_to_add,
320 struct buffer_head *fe_bh,
321 struct ocfs2_journal_handle *handle,
322 struct ocfs2_alloc_context *data_ac,
323 struct ocfs2_alloc_context *meta_ac,
324 enum ocfs2_alloc_restarted *reason_ret)
325{
326 int status = 0;
327 int free_extents;
328 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
329 enum ocfs2_alloc_restarted reason = RESTART_NONE;
330 u32 bit_off, num_bits;
331 u64 block;
332
333 BUG_ON(!clusters_to_add);
334
335 free_extents = ocfs2_num_free_extents(osb, inode, fe);
336 if (free_extents < 0) {
337 status = free_extents;
338 mlog_errno(status);
339 goto leave;
340 }
341
 342	/* There are two cases which could cause us to return -EAGAIN in
 343	 * the we-need-more-metadata case:
 344	 * 1) we haven't reserved *any* metadata
 345	 * 2) we are so fragmented that we've needed to add metadata too
 346	 *    many times already. */
347 if (!free_extents && !meta_ac) {
348 mlog(0, "we haven't reserved any metadata!\n");
349 status = -EAGAIN;
350 reason = RESTART_META;
351 goto leave;
352 } else if ((!free_extents)
353 && (ocfs2_alloc_context_bits_left(meta_ac)
354 < ocfs2_extend_meta_needed(fe))) {
355 mlog(0, "filesystem is really fragmented...\n");
356 status = -EAGAIN;
357 reason = RESTART_META;
358 goto leave;
359 }
360
361 status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
362 &bit_off, &num_bits);
363 if (status < 0) {
364 if (status != -ENOSPC)
365 mlog_errno(status);
366 goto leave;
367 }
368
369 BUG_ON(num_bits > clusters_to_add);
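	/*
	 * ocfs2_claim_clusters() was asked for a minimum of one cluster
	 * above, so it may legitimately hand back fewer than
	 * clusters_to_add; the shortfall is handled by the
	 * RESTART_TRANS logic at the bottom.
	 */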
370
371 /* reserve our write early -- insert_extent may update the inode */
372 status = ocfs2_journal_access(handle, inode, fe_bh,
373 OCFS2_JOURNAL_ACCESS_WRITE);
374 if (status < 0) {
375 mlog_errno(status);
376 goto leave;
377 }
378
379 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
380 mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
381 num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
382 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
383 num_bits, meta_ac);
384 if (status < 0) {
385 mlog_errno(status);
386 goto leave;
387 }
388
389 le32_add_cpu(&fe->i_clusters, num_bits);
390 spin_lock(&OCFS2_I(inode)->ip_lock);
391 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
392 spin_unlock(&OCFS2_I(inode)->ip_lock);
393
394 status = ocfs2_journal_dirty(handle, fe_bh);
395 if (status < 0) {
396 mlog_errno(status);
397 goto leave;
398 }
399
400 clusters_to_add -= num_bits;
401
402 if (clusters_to_add) {
403 mlog(0, "need to alloc once more, clusters = %u, wanted = "
404 "%u\n", fe->i_clusters, clusters_to_add);
405 status = -EAGAIN;
406 reason = RESTART_TRANS;
407 }
408
409leave:
410 mlog_exit(status);
411 if (reason_ret)
412 *reason_ret = reason;
413 return status;
414}
415
416static int ocfs2_extend_allocation(struct inode *inode,
417 u32 clusters_to_add)
418{
419 int status = 0;
420 int restart_func = 0;
421 int drop_alloc_sem = 0;
422 int credits, num_free_extents;
423 u32 prev_clusters;
424 struct buffer_head *bh = NULL;
425 struct ocfs2_dinode *fe = NULL;
426 struct ocfs2_journal_handle *handle = NULL;
427 struct ocfs2_alloc_context *data_ac = NULL;
428 struct ocfs2_alloc_context *meta_ac = NULL;
429 enum ocfs2_alloc_restarted why;
430 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
431
432 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
433
434 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
435 OCFS2_BH_CACHED, inode);
436 if (status < 0) {
437 mlog_errno(status);
438 goto leave;
439 }
440
441 fe = (struct ocfs2_dinode *) bh->b_data;
442 if (!OCFS2_IS_VALID_DINODE(fe)) {
443 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
444 status = -EIO;
445 goto leave;
446 }
447
448restart_all:
449 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
450
451 mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
452 "clusters_to_add = %u\n",
453 OCFS2_I(inode)->ip_blkno, i_size_read(inode),
454 fe->i_clusters, clusters_to_add);
455
456 handle = ocfs2_alloc_handle(osb);
457 if (handle == NULL) {
458 status = -ENOMEM;
459 mlog_errno(status);
460 goto leave;
461 }
462
463 num_free_extents = ocfs2_num_free_extents(osb,
464 inode,
465 fe);
466 if (num_free_extents < 0) {
467 status = num_free_extents;
468 mlog_errno(status);
469 goto leave;
470 }
471
472 if (!num_free_extents) {
473 status = ocfs2_reserve_new_metadata(osb,
474 handle,
475 fe,
476 &meta_ac);
477 if (status < 0) {
478 if (status != -ENOSPC)
479 mlog_errno(status);
480 goto leave;
481 }
482 }
483
484 status = ocfs2_reserve_clusters(osb,
485 handle,
486 clusters_to_add,
487 &data_ac);
488 if (status < 0) {
489 if (status != -ENOSPC)
490 mlog_errno(status);
491 goto leave;
492 }
493
 494	/* blocks people in read/write from reading our allocation
495 * until we're done changing it. We depend on i_sem to block
496 * other extend/truncate calls while we're here. Ordering wrt
497 * start_trans is important here -- always do it before! */
498 down_write(&OCFS2_I(inode)->ip_alloc_sem);
499 drop_alloc_sem = 1;
500
501 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
502 handle = ocfs2_start_trans(osb, handle, credits);
503 if (IS_ERR(handle)) {
504 status = PTR_ERR(handle);
505 handle = NULL;
506 mlog_errno(status);
507 goto leave;
508 }
509
510restarted_transaction:
 511	/* reserve a write to the file entry early on - so that if we
512 * run out of credits in the allocation path, we can still
513 * update i_size. */
514 status = ocfs2_journal_access(handle, inode, bh,
515 OCFS2_JOURNAL_ACCESS_WRITE);
516 if (status < 0) {
517 mlog_errno(status);
518 goto leave;
519 }
520
521 prev_clusters = OCFS2_I(inode)->ip_clusters;
522
523 status = ocfs2_do_extend_allocation(osb,
524 inode,
525 clusters_to_add,
526 bh,
527 handle,
528 data_ac,
529 meta_ac,
530 &why);
531 if ((status < 0) && (status != -EAGAIN)) {
532 if (status != -ENOSPC)
533 mlog_errno(status);
534 goto leave;
535 }
536
537 status = ocfs2_journal_dirty(handle, bh);
538 if (status < 0) {
539 mlog_errno(status);
540 goto leave;
541 }
542
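	/*
	 * ocfs2_do_extend_allocation() bumps ip_clusters as it goes, so
	 * the delta from prev_clusters is what this pass actually
	 * allocated; anything still outstanding drives a restart below.
	 */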
543 spin_lock(&OCFS2_I(inode)->ip_lock);
544 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
545 spin_unlock(&OCFS2_I(inode)->ip_lock);
546
547 if (why != RESTART_NONE && clusters_to_add) {
548 if (why == RESTART_META) {
549 mlog(0, "restarting function.\n");
550 restart_func = 1;
551 } else {
552 BUG_ON(why != RESTART_TRANS);
553
554 mlog(0, "restarting transaction.\n");
555 /* TODO: This can be more intelligent. */
556 credits = ocfs2_calc_extend_credits(osb->sb,
557 fe,
558 clusters_to_add);
559 status = ocfs2_extend_trans(handle, credits);
560 if (status < 0) {
561 /* handle still has to be committed at
562 * this point. */
563 status = -ENOMEM;
564 mlog_errno(status);
565 goto leave;
566 }
567 goto restarted_transaction;
568 }
569 }
570
571 mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
572 fe->i_clusters, fe->i_size);
573 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
574 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
575
576leave:
577 if (drop_alloc_sem) {
578 up_write(&OCFS2_I(inode)->ip_alloc_sem);
579 drop_alloc_sem = 0;
580 }
581 if (handle) {
582 ocfs2_commit_trans(handle);
583 handle = NULL;
584 }
585 if (data_ac) {
586 ocfs2_free_alloc_context(data_ac);
587 data_ac = NULL;
588 }
589 if (meta_ac) {
590 ocfs2_free_alloc_context(meta_ac);
591 meta_ac = NULL;
592 }
593 if ((!status) && restart_func) {
594 restart_func = 0;
595 goto restart_all;
596 }
597 if (bh) {
598 brelse(bh);
599 bh = NULL;
600 }
601
602 mlog_exit(status);
603 return status;
604}
605
606/* Some parts of this taken from generic_cont_expand, which turned out
607 * to be too fragile to do exactly what we need without us having to
608 * worry about recursive locking in ->commit_write(). */
609static int ocfs2_write_zero_page(struct inode *inode,
610 u64 size)
611{
612 struct address_space *mapping = inode->i_mapping;
613 struct page *page;
614 unsigned long index;
615 unsigned int offset;
616 struct ocfs2_journal_handle *handle = NULL;
617 int ret;
618
619 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
620 /* ugh. in prepare/commit_write, if from==to==start of block, we
621 ** skip the prepare. make sure we never send an offset for the start
622 ** of a block
623 */
624 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
625 offset++;
626 }
627 index = size >> PAGE_CACHE_SHIFT;
628
629 page = grab_cache_page(mapping, index);
630 if (!page) {
631 ret = -ENOMEM;
632 mlog_errno(ret);
633 goto out;
634 }
635
636 ret = ocfs2_prepare_write(NULL, page, offset, offset);
637 if (ret < 0) {
638 mlog_errno(ret);
639 goto out_unlock;
640 }
641
642 if (ocfs2_should_order_data(inode)) {
643 handle = ocfs2_start_walk_page_trans(inode, page, offset,
644 offset);
645 if (IS_ERR(handle)) {
646 ret = PTR_ERR(handle);
647 handle = NULL;
648 goto out_unlock;
649 }
650 }
651
652 /* must not update i_size! */
653 ret = block_commit_write(page, offset, offset);
654 if (ret < 0)
655 mlog_errno(ret);
656 else
657 ret = 0;
658
659 if (handle)
660 ocfs2_commit_trans(handle);
661out_unlock:
662 unlock_page(page);
663 page_cache_release(page);
664out:
665 return ret;
666}
667
668static int ocfs2_zero_extend(struct inode *inode,
669 u64 zero_to_size)
670{
671 int ret = 0;
672 u64 start_off;
673 struct super_block *sb = inode->i_sb;
674
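	/*
	 * Zero from the current (block-aligned) i_size up to
	 * zero_to_size, one filesystem block at a time, going through
	 * the page cache so any mapped pages stay coherent.
	 */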
675 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
676 while (start_off < zero_to_size) {
677 ret = ocfs2_write_zero_page(inode, start_off);
678 if (ret < 0) {
679 mlog_errno(ret);
680 goto out;
681 }
682
683 start_off += sb->s_blocksize;
684 }
685
686out:
687 return ret;
688}
689
690static int ocfs2_extend_file(struct inode *inode,
691 struct buffer_head *di_bh,
692 u64 new_i_size)
693{
694 int ret = 0;
695 u32 clusters_to_add;
696
697 /* setattr sometimes calls us like this. */
698 if (new_i_size == 0)
699 goto out;
700
701 if (i_size_read(inode) == new_i_size)
702 goto out;
703 BUG_ON(new_i_size < i_size_read(inode));
704
705 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
706 OCFS2_I(inode)->ip_clusters;
707
708 if (clusters_to_add) {
709 ret = ocfs2_extend_allocation(inode, clusters_to_add);
710 if (ret < 0) {
711 mlog_errno(ret);
712 goto out;
713 }
714
715 ret = ocfs2_zero_extend(inode, new_i_size);
716 if (ret < 0) {
717 mlog_errno(ret);
718 goto out;
719 }
720 }
721
 722	/* The allocation, if any, is done by now -- we just use this
 723	 * helper for the trivial update of i_size. */
724 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
725 if (ret < 0) {
726 mlog_errno(ret);
727 goto out;
728 }
729
730out:
731 return ret;
732}
733
734int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
735{
736 int status = 0, size_change;
737 struct inode *inode = dentry->d_inode;
738 struct super_block *sb = inode->i_sb;
739 struct ocfs2_super *osb = OCFS2_SB(sb);
740 struct buffer_head *bh = NULL;
741 struct ocfs2_journal_handle *handle = NULL;
742
743 mlog_entry("(0x%p, '%.*s')\n", dentry,
744 dentry->d_name.len, dentry->d_name.name);
745
746 if (attr->ia_valid & ATTR_MODE)
747 mlog(0, "mode change: %d\n", attr->ia_mode);
748 if (attr->ia_valid & ATTR_UID)
749 mlog(0, "uid change: %d\n", attr->ia_uid);
750 if (attr->ia_valid & ATTR_GID)
751 mlog(0, "gid change: %d\n", attr->ia_gid);
752 if (attr->ia_valid & ATTR_SIZE)
753 mlog(0, "size change...\n");
754 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
755 mlog(0, "time change...\n");
756
757#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
758 | ATTR_GID | ATTR_UID | ATTR_MODE)
759 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
760 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
761 return 0;
762 }
763
764 status = inode_change_ok(inode, attr);
765 if (status)
766 return status;
767
768 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
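	/*
	 * A size change needs the cluster-wide rw lock as well, so that
	 * read/write on other nodes cannot race with the truncate or
	 * extend below.
	 */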
769 if (size_change) {
770 status = ocfs2_rw_lock(inode, 1);
771 if (status < 0) {
772 mlog_errno(status);
773 goto bail;
774 }
775 }
776
777 status = ocfs2_meta_lock(inode, NULL, &bh, 1);
778 if (status < 0) {
779 if (status != -ENOENT)
780 mlog_errno(status);
781 goto bail_unlock_rw;
782 }
783
784 if (size_change && attr->ia_size != i_size_read(inode)) {
785 if (i_size_read(inode) > attr->ia_size)
786 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
787 else
788 status = ocfs2_extend_file(inode, bh, attr->ia_size);
789 if (status < 0) {
790 if (status != -ENOSPC)
791 mlog_errno(status);
792 status = -ENOSPC;
793 goto bail_unlock;
794 }
795 }
796
797 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
798 if (IS_ERR(handle)) {
799 status = PTR_ERR(handle);
800 mlog_errno(status);
801 goto bail_unlock;
802 }
803
804 status = inode_setattr(inode, attr);
805 if (status < 0) {
806 mlog_errno(status);
807 goto bail_commit;
808 }
809
810 status = ocfs2_mark_inode_dirty(handle, inode, bh);
811 if (status < 0)
812 mlog_errno(status);
813
814bail_commit:
815 ocfs2_commit_trans(handle);
816bail_unlock:
817 ocfs2_meta_unlock(inode, 1);
818bail_unlock_rw:
819 if (size_change)
820 ocfs2_rw_unlock(inode, 1);
821bail:
822 if (bh)
823 brelse(bh);
824
825 mlog_exit(status);
826 return status;
827}
828
829int ocfs2_getattr(struct vfsmount *mnt,
830 struct dentry *dentry,
831 struct kstat *stat)
832{
833 struct inode *inode = dentry->d_inode;
834 struct super_block *sb = dentry->d_inode->i_sb;
835 struct ocfs2_super *osb = sb->s_fs_info;
836 int err;
837
838 mlog_entry_void();
839
840 err = ocfs2_inode_revalidate(dentry);
841 if (err) {
842 if (err != -ENOENT)
843 mlog_errno(err);
844 goto bail;
845 }
846
847 generic_fillattr(inode, stat);
848
849 /* We set the blksize from the cluster size for performance */
850 stat->blksize = osb->s_clustersize;
851
852bail:
853 mlog_exit(err);
854
855 return err;
856}
857
858static int ocfs2_write_remove_suid(struct inode *inode)
859{
860 int ret;
861 struct buffer_head *bh = NULL;
862 struct ocfs2_inode_info *oi = OCFS2_I(inode);
863 struct ocfs2_journal_handle *handle;
864 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
865 struct ocfs2_dinode *di;
866
867 mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
868 inode->i_mode);
869
870 handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
871 if (handle == NULL) {
872 ret = -ENOMEM;
873 mlog_errno(ret);
874 goto out;
875 }
876
877 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
878 if (ret < 0) {
879 mlog_errno(ret);
880 goto out_trans;
881 }
882
883 ret = ocfs2_journal_access(handle, inode, bh,
884 OCFS2_JOURNAL_ACCESS_WRITE);
885 if (ret < 0) {
886 mlog_errno(ret);
887 goto out_bh;
888 }
889
890 inode->i_mode &= ~S_ISUID;
891 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
892 inode->i_mode &= ~S_ISGID;
893
894 di = (struct ocfs2_dinode *) bh->b_data;
895 di->i_mode = cpu_to_le16(inode->i_mode);
896
897 ret = ocfs2_journal_dirty(handle, bh);
898 if (ret < 0)
899 mlog_errno(ret);
900out_bh:
901 brelse(bh);
902out_trans:
903 ocfs2_commit_trans(handle);
904out:
905 mlog_exit(ret);
906 return ret;
907}
908
909static inline int ocfs2_write_should_remove_suid(struct inode *inode)
910{
911 mode_t mode = inode->i_mode;
912
913 if (!capable(CAP_FSETID)) {
914 if (unlikely(mode & S_ISUID))
915 return 1;
916
917 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
918 return 1;
919 }
920 return 0;
921}
922
923static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
924 const char __user *buf,
925 size_t count,
926 loff_t pos)
927{
928 struct iovec local_iov = { .iov_base = (void __user *)buf,
929 .iov_len = count };
930 int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
931 u32 clusters;
932 struct file *filp = iocb->ki_filp;
933 struct inode *inode = filp->f_dentry->d_inode;
934 loff_t newsize, saved_pos;
935#ifdef OCFS2_ORACORE_WORKAROUNDS
936 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
937#endif
938
939 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
940 (unsigned int)count,
941 filp->f_dentry->d_name.len,
942 filp->f_dentry->d_name.name);
943
944 /* happy write of zero bytes */
945 if (count == 0)
946 return 0;
947
948 if (!inode) {
949 mlog(0, "bad inode\n");
950 return -EIO;
951 }
952
953#ifdef OCFS2_ORACORE_WORKAROUNDS
954 /* ugh, work around some applications which open everything O_DIRECT +
955 * O_APPEND and really don't mean to use O_DIRECT. */
956 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
957 (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
958 filp->f_flags &= ~O_DIRECT;
959#endif
960
961 down(&inode->i_sem);
962 /* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
963 if (filp->f_flags & O_DIRECT) {
964 have_alloc_sem = 1;
965 down_read(&inode->i_alloc_sem);
966 }
967
968 /* concurrent O_DIRECT writes are allowed */
969 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
970 ret = ocfs2_rw_lock(inode, rw_level);
971 if (ret < 0) {
972 rw_level = -1;
973 mlog_errno(ret);
974 goto out;
975 }
976
977 /*
978 * We sample i_size under a read level meta lock to see if our write
979 * is extending the file, if it is we back off and get a write level
980 * meta lock.
981 */
982 meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
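	/*
	 * O_APPEND always writes at i_size, so start straight off with
	 * a write level (1) meta lock; otherwise take a read level lock
	 * optimistically and upgrade inside the loop only if the write
	 * turns out to extend the file or needs to clear suid/sgid.
	 */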
 983	for (;;) {
984 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
985 if (ret < 0) {
986 meta_level = -1;
987 mlog_errno(ret);
988 goto out;
989 }
990
991 /* Clear suid / sgid if necessary. We do this here
992 * instead of later in the write path because
993 * remove_suid() calls ->setattr without any hint that
994 * we may have already done our cluster locking. Since
995 * ocfs2_setattr() *must* take cluster locks to
 996	 * proceed, this would lead us to recursively lock the
 997	 * inode. There's also the dinode i_size state which
 998	 * can be lost via setattr during extending writes (we
 999	 * set inode->i_size at the end of a write). */
1000 if (ocfs2_write_should_remove_suid(inode)) {
1001 if (meta_level == 0) {
1002 ocfs2_meta_unlock(inode, meta_level);
1003 meta_level = 1;
1004 continue;
1005 }
1006
1007 ret = ocfs2_write_remove_suid(inode);
1008 if (ret < 0) {
1009 mlog_errno(ret);
1010 goto out;
1011 }
1012 }
1013
1014 /* work on a copy of ppos until we're sure that we won't have
1015 * to recalculate it due to relocking. */
1016 if (filp->f_flags & O_APPEND) {
1017 saved_pos = i_size_read(inode);
1018 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1019 } else {
1020 saved_pos = iocb->ki_pos;
1021 }
1022 newsize = count + saved_pos;
1023
1024 mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
1025 saved_pos, newsize, i_size_read(inode));
1026
1027 /* No need for a higher level metadata lock if we're
1028 * never going past i_size. */
1029 if (newsize <= i_size_read(inode))
1030 break;
1031
1032 if (meta_level == 0) {
1033 ocfs2_meta_unlock(inode, meta_level);
1034 meta_level = 1;
1035 continue;
1036 }
1037
1038 spin_lock(&OCFS2_I(inode)->ip_lock);
1039 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
1040 OCFS2_I(inode)->ip_clusters;
1041 spin_unlock(&OCFS2_I(inode)->ip_lock);
1042
1043 mlog(0, "Writing at EOF, may need more allocation: "
1044 "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
1045 i_size_read(inode), newsize, clusters);
1046
1047 /* We only want to continue the rest of this loop if
1048 * our extend will actually require more
1049 * allocation. */
1050 if (!clusters)
1051 break;
1052
1053 ret = ocfs2_extend_allocation(inode, clusters);
1054 if (ret < 0) {
1055 if (ret != -ENOSPC)
1056 mlog_errno(ret);
1057 goto out;
1058 }
1059
1060 /* Fill any holes which would've been created by this
1061 * write. If we're O_APPEND, this will wind up
1062 * (correctly) being a noop. */
1063 ret = ocfs2_zero_extend(inode, (u64) newsize - count);
1064 if (ret < 0) {
1065 mlog_errno(ret);
1066 goto out;
1067 }
1068 break;
1069 }
1070
1071 /* ok, we're done with i_size and alloc work */
1072 iocb->ki_pos = saved_pos;
1073 ocfs2_meta_unlock(inode, meta_level);
1074 meta_level = -1;
1075
1076 /* communicate with ocfs2_dio_end_io */
1077 ocfs2_iocb_set_rw_locked(iocb);
1078
1079#ifdef OCFS2_ORACORE_WORKAROUNDS
1080 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
1081 filp->f_flags & O_DIRECT) {
1082 unsigned int saved_flags = filp->f_flags;
1083 int sector_size = 1 << osb->s_sectsize_bits;
1084
1085 if ((saved_pos & (sector_size - 1)) ||
1086 (count & (sector_size - 1)) ||
1087 ((unsigned long)buf & (sector_size - 1))) {
1088 filp->f_flags |= O_SYNC;
1089 filp->f_flags &= ~O_DIRECT;
1090 }
1091
1092 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1093 &iocb->ki_pos);
1094
1095 filp->f_flags = saved_flags;
1096 } else
1097#endif
1098 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1099 &iocb->ki_pos);
1100
1101 /* buffered aio wouldn't have proper lock coverage today */
1102 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1103
1104 /*
1105 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
1106 * function pointer which is called when o_direct io completes so that
1107 * it can unlock our rw lock. (it's the clustered equivalent of
1108 * i_alloc_sem; protects truncate from racing with pending ios).
1109 * Unfortunately there are error cases which call end_io and others
 1110	 * that don't, so we don't have to unlock the rw_lock if either an
1111 * async dio is going to do it in the future or an end_io after an
1112 * error has already done it.
1113 */
1114 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1115 rw_level = -1;
1116 have_alloc_sem = 0;
1117 }
1118
1119out:
1120 if (meta_level != -1)
1121 ocfs2_meta_unlock(inode, meta_level);
1122 if (have_alloc_sem)
1123 up_read(&inode->i_alloc_sem);
1124 if (rw_level != -1)
1125 ocfs2_rw_unlock(inode, rw_level);
1126 up(&inode->i_sem);
1127
1128 mlog_exit(ret);
1129 return ret;
1130}
1131
1132static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1133 char __user *buf,
1134 size_t count,
1135 loff_t pos)
1136{
1137 int ret = 0, rw_level = -1, have_alloc_sem = 0;
1138 struct file *filp = iocb->ki_filp;
1139 struct inode *inode = filp->f_dentry->d_inode;
1140#ifdef OCFS2_ORACORE_WORKAROUNDS
1141 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1142#endif
1143
1144 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
1145 (unsigned int)count,
1146 filp->f_dentry->d_name.len,
1147 filp->f_dentry->d_name.name);
1148
1149 if (!inode) {
1150 ret = -EINVAL;
1151 mlog_errno(ret);
1152 goto bail;
1153 }
1154
1155#ifdef OCFS2_ORACORE_WORKAROUNDS
1156 if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
1157 if (filp->f_flags & O_DIRECT) {
1158 int sector_size = 1 << osb->s_sectsize_bits;
1159
1160 if ((pos & (sector_size - 1)) ||
1161 (count & (sector_size - 1)) ||
1162 ((unsigned long)buf & (sector_size - 1)) ||
 1163			    (i_size_read(inode) & (sector_size - 1))) {
1164 filp->f_flags &= ~O_DIRECT;
1165 }
1166 }
1167 }
1168#endif
1169
1170 /*
1171 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1172 * need locks to protect pending reads from racing with truncate.
1173 */
1174 if (filp->f_flags & O_DIRECT) {
1175 down_read(&inode->i_alloc_sem);
1176 have_alloc_sem = 1;
1177
1178 ret = ocfs2_rw_lock(inode, 0);
1179 if (ret < 0) {
1180 mlog_errno(ret);
1181 goto bail;
1182 }
1183 rw_level = 0;
1184 /* communicate with ocfs2_dio_end_io */
1185 ocfs2_iocb_set_rw_locked(iocb);
1186 }
1187
1188 ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
1189 if (ret == -EINVAL)
1190 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1191
1192 /* buffered aio wouldn't have proper lock coverage today */
1193 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1194
1195 /* see ocfs2_file_aio_write */
1196 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1197 rw_level = -1;
1198 have_alloc_sem = 0;
1199 }
1200
1201bail:
1202 if (have_alloc_sem)
1203 up_read(&inode->i_alloc_sem);
1204 if (rw_level != -1)
1205 ocfs2_rw_unlock(inode, rw_level);
1206 mlog_exit(ret);
1207
1208 return ret;
1209}
1210
1211struct inode_operations ocfs2_file_iops = {
1212 .setattr = ocfs2_setattr,
1213 .getattr = ocfs2_getattr,
1214};
1215
1216struct inode_operations ocfs2_special_file_iops = {
1217 .setattr = ocfs2_setattr,
1218 .getattr = ocfs2_getattr,
1219};
1220
1221struct file_operations ocfs2_fops = {
1222 .read = do_sync_read,
1223 .write = do_sync_write,
1224 .sendfile = generic_file_sendfile,
1225 .mmap = ocfs2_mmap,
1226 .fsync = ocfs2_sync_file,
1227 .release = ocfs2_file_release,
1228 .open = ocfs2_file_open,
1229 .aio_read = ocfs2_file_aio_read,
1230 .aio_write = ocfs2_file_aio_write,
1231};
1232
1233struct file_operations ocfs2_dops = {
1234 .read = generic_read_dir,
1235 .readdir = ocfs2_readdir,
1236 .fsync = ocfs2_sync_file,
1237};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
new file mode 100644
index 000000000000..a5ea33b24060
--- /dev/null
+++ b/fs/ocfs2/file.h
@@ -0,0 +1,57 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * file.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_FILE_H
27#define OCFS2_FILE_H
28
29extern struct file_operations ocfs2_fops;
30extern struct file_operations ocfs2_dops;
31extern struct inode_operations ocfs2_file_iops;
32extern struct inode_operations ocfs2_special_file_iops;
33struct ocfs2_alloc_context;
34
35enum ocfs2_alloc_restarted {
36 RESTART_NONE = 0,
37 RESTART_TRANS,
38 RESTART_META
39};
40int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
41 struct inode *inode,
42 u32 clusters_to_add,
43 struct buffer_head *fe_bh,
44 struct ocfs2_journal_handle *handle,
45 struct ocfs2_alloc_context *data_ac,
46 struct ocfs2_alloc_context *meta_ac,
47 enum ocfs2_alloc_restarted *reason);
48int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
49int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
50 struct kstat *stat);
51
52int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
53 struct inode *inode,
54 struct buffer_head *fe_bh,
55 u64 new_i_size);
56
57#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
new file mode 100644
index 000000000000..0bbd22f46c80
--- /dev/null
+++ b/fs/ocfs2/heartbeat.c
@@ -0,0 +1,378 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.c
5 *
 6 * Register ourselves with the heartbeat service, keep our node maps
7 * up to date, and fire off recovery when needed.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/kmod.h>
32
33#include <cluster/heartbeat.h>
34#include <cluster/nodemanager.h>
35
36#include <dlm/dlmapi.h>
37
38#define MLOG_MASK_PREFIX ML_SUPER
39#include <cluster/masklog.h>
40
41#include "ocfs2.h"
42
43#include "alloc.h"
44#include "heartbeat.h"
45#include "inode.h"
46#include "journal.h"
47#include "vote.h"
48
49#include "buffer_head_io.h"
50
51#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
52#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
53
54static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
55 int bit);
56static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
57 int bit);
58static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
59static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
60 struct ocfs2_node_map *from);
61static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
62 struct ocfs2_node_map *from);
63
64void ocfs2_init_node_maps(struct ocfs2_super *osb)
65{
66 spin_lock_init(&osb->node_map_lock);
67 ocfs2_node_map_init(&osb->mounted_map);
68 ocfs2_node_map_init(&osb->recovery_map);
69 ocfs2_node_map_init(&osb->umount_map);
70}
71
72static void ocfs2_do_node_down(int node_num,
73 struct ocfs2_super *osb)
74{
75 BUG_ON(osb->node_num == node_num);
76
77 mlog(0, "ocfs2: node down event for %d\n", node_num);
78
79 if (!osb->dlm) {
80 /*
81 * No DLM means we're not even ready to participate yet.
82 * We check the slots after the DLM comes up, so we will
83 * notice the node death then. We can safely ignore it
84 * here.
85 */
86 return;
87 }
88
89 if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
90 /* If a node is in the umount map, then we've been
91 * expecting him to go down and we know ahead of time
92 * that recovery is not necessary. */
93 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
94 return;
95 }
96
97 ocfs2_recovery_thread(osb, node_num);
98
99 ocfs2_remove_node_from_vote_queues(osb, node_num);
100}
101
102static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
103 int node_num,
104 void *data)
105{
106 ocfs2_do_node_down(node_num, (struct ocfs2_super *) data);
107}
108
109/* Called from the dlm when it's about to evict a node. We may also
110 * get a heartbeat callback later. */
111static void ocfs2_dlm_eviction_cb(int node_num,
112 void *data)
113{
114 struct ocfs2_super *osb = (struct ocfs2_super *) data;
115 struct super_block *sb = osb->sb;
116
117 mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
118 MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
119
120 ocfs2_do_node_down(node_num, osb);
121}
122
123static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
124 int node_num,
125 void *data)
126{
127 struct ocfs2_super *osb = data;
128
129 BUG_ON(osb->node_num == node_num);
130
131 mlog(0, "node up event for %d\n", node_num);
132 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
133}
134
135void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
136{
137 o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
138 ocfs2_hb_node_down_cb, osb,
139 OCFS2_HB_NODE_DOWN_PRI);
140
141 o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
142 ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
143
144 /* Not exactly a heartbeat callback, but leads to essentially
145 * the same path so we set it up here. */
146 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
147 ocfs2_dlm_eviction_cb,
148 osb);
149}
150
151/* Most functions here are just stubs for now... */
152int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
153{
154 int status;
155
156 status = o2hb_register_callback(&osb->osb_hb_down);
157 if (status < 0) {
158 mlog_errno(status);
159 goto bail;
160 }
161
162 status = o2hb_register_callback(&osb->osb_hb_up);
163 if (status < 0)
164 mlog_errno(status);
165
166bail:
167 return status;
168}
169
170void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
171{
172 int status;
173
174 status = o2hb_unregister_callback(&osb->osb_hb_down);
175 if (status < 0)
176 mlog_errno(status);
177
178 status = o2hb_unregister_callback(&osb->osb_hb_up);
179 if (status < 0)
180 mlog_errno(status);
181}
182
183void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
184{
185 int ret;
186 char *argv[5], *envp[3];
187
188 if (!osb->uuid_str) {
189 /* This can happen if we don't get far enough in mount... */
 190		mlog(0, "No UUID with which to stop heartbeat!\n");
191 return;
192 }
193
194 argv[0] = (char *)o2nm_get_hb_ctl_path();
195 argv[1] = "-K";
196 argv[2] = "-u";
197 argv[3] = osb->uuid_str;
198 argv[4] = NULL;
199
200 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
201
202 /* minimal command environment taken from cpu_run_sbin_hotplug */
203 envp[0] = "HOME=/";
204 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
205 envp[2] = NULL;
206
207 ret = call_usermodehelper(argv[0], argv, envp, 1);
208 if (ret < 0)
209 mlog_errno(ret);
210}
211
212/* special case -1 for now
213 * TODO: should *really* make sure the calling func never passes -1!! */
214void ocfs2_node_map_init(struct ocfs2_node_map *map)
215{
216 map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
217 memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
218 sizeof(unsigned long));
219}
220
221static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
222 int bit)
223{
224 set_bit(bit, map->map);
225}
226
227void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
228 struct ocfs2_node_map *map,
229 int bit)
230{
 231	if (bit == -1)
232 return;
233 BUG_ON(bit >= map->num_nodes);
234 spin_lock(&osb->node_map_lock);
235 __ocfs2_node_map_set_bit(map, bit);
236 spin_unlock(&osb->node_map_lock);
237}
238
239static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
240 int bit)
241{
242 clear_bit(bit, map->map);
243}
244
245void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
246 struct ocfs2_node_map *map,
247 int bit)
248{
 249	if (bit == -1)
250 return;
251 BUG_ON(bit >= map->num_nodes);
252 spin_lock(&osb->node_map_lock);
253 __ocfs2_node_map_clear_bit(map, bit);
254 spin_unlock(&osb->node_map_lock);
255}
256
257int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
258 struct ocfs2_node_map *map,
259 int bit)
260{
261 int ret;
262 if (bit >= map->num_nodes) {
263 mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
264 BUG();
265 }
266 spin_lock(&osb->node_map_lock);
267 ret = test_bit(bit, map->map);
268 spin_unlock(&osb->node_map_lock);
269 return ret;
270}
271
272static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
273{
274 int bit;
275 bit = find_next_bit(map->map, map->num_nodes, 0);
276 if (bit < map->num_nodes)
277 return 0;
278 return 1;
279}
280
281int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
282 struct ocfs2_node_map *map)
283{
284 int ret;
285 BUG_ON(map->num_nodes == 0);
286 spin_lock(&osb->node_map_lock);
287 ret = __ocfs2_node_map_is_empty(map);
288 spin_unlock(&osb->node_map_lock);
289 return ret;
290}
291
292static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
293 struct ocfs2_node_map *from)
294{
295 BUG_ON(from->num_nodes == 0);
296 ocfs2_node_map_init(target);
297 __ocfs2_node_map_set(target, from);
298}
299
300/* returns 1 if bit is the only bit set in target, 0 otherwise */
301int ocfs2_node_map_is_only(struct ocfs2_super *osb,
302 struct ocfs2_node_map *target,
303 int bit)
304{
305 struct ocfs2_node_map temp;
306 int ret;
307
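	/* Copy the map under the lock, clear our bit in the copy, and
	 * see whether anything else remains set. */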
308 spin_lock(&osb->node_map_lock);
309 __ocfs2_node_map_dup(&temp, target);
310 __ocfs2_node_map_clear_bit(&temp, bit);
311 ret = __ocfs2_node_map_is_empty(&temp);
312 spin_unlock(&osb->node_map_lock);
313
314 return ret;
315}
316
317static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
318 struct ocfs2_node_map *from)
319{
320 int num_longs, i;
321
322 BUG_ON(target->num_nodes != from->num_nodes);
323 BUG_ON(target->num_nodes == 0);
324
325 num_longs = BITS_TO_LONGS(target->num_nodes);
326 for (i = 0; i < num_longs; i++)
327 target->map[i] = from->map[i];
328}
329
330/* Returns whether the recovery bit was actually set - it may not be
331 * if a node is still marked as needing recovery */
332int ocfs2_recovery_map_set(struct ocfs2_super *osb,
333 int num)
334{
335 int set = 0;
336
337 spin_lock(&osb->node_map_lock);
338
339 __ocfs2_node_map_clear_bit(&osb->mounted_map, num);
340
341 if (!test_bit(num, osb->recovery_map.map)) {
342 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
343 set = 1;
344 }
345
346 spin_unlock(&osb->node_map_lock);
347
348 return set;
349}
350
351void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
352 int num)
353{
354 ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
355}
356
357int ocfs2_node_map_iterate(struct ocfs2_super *osb,
358 struct ocfs2_node_map *map,
359 int idx)
360{
361 int i = idx;
362
363 idx = O2NM_INVALID_NODE_NUM;
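	/*
	 * Scan forward from the caller's idx for the next set bit;
	 * O2NM_INVALID_NODE_NUM is returned if nothing is set at or
	 * after idx.
	 */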
364 spin_lock(&osb->node_map_lock);
365 if ((i != O2NM_INVALID_NODE_NUM) &&
366 (i >= 0) &&
367 (i < map->num_nodes)) {
368 while(i < map->num_nodes) {
369 if (test_bit(i, map->map)) {
370 idx = i;
371 break;
372 }
373 i++;
374 }
375 }
376 spin_unlock(&osb->node_map_lock);
377 return idx;
378}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
new file mode 100644
index 000000000000..e8fb079122e4
--- /dev/null
+++ b/fs/ocfs2/heartbeat.h
@@ -0,0 +1,67 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * heartbeat.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_HEARTBEAT_H
27#define OCFS2_HEARTBEAT_H
28
29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
32int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
33void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
34void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
35
36/* node map functions - used to keep track of mounted and in-recovery
37 * nodes. */
38void ocfs2_node_map_init(struct ocfs2_node_map *map);
39int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
40 struct ocfs2_node_map *map);
41void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
42 struct ocfs2_node_map *map,
43 int bit);
44void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
45 struct ocfs2_node_map *map,
46 int bit);
47int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
48 struct ocfs2_node_map *map,
49 int bit);
50int ocfs2_node_map_iterate(struct ocfs2_super *osb,
51 struct ocfs2_node_map *map,
52 int idx);
53static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
54 struct ocfs2_node_map *map)
55{
56 return ocfs2_node_map_iterate(osb, map, 0);
57}
58int ocfs2_recovery_map_set(struct ocfs2_super *osb,
59 int num);
60void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
61 int num);
62/* returns 1 if bit is the only bit set in target, 0 otherwise */
63int ocfs2_node_map_is_only(struct ocfs2_super *osb,
64 struct ocfs2_node_map *target,
65 int bit);
66
67#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
new file mode 100644
index 000000000000..a91ba4dec936
--- /dev/null
+++ b/fs/ocfs2/inode.c
@@ -0,0 +1,1140 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.c
5 *
6 * vfs' aops, fops, dops and iops
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/smp_lock.h>
32
33#include <asm/byteorder.h>
34
35#define MLOG_MASK_PREFIX ML_INODE
36#include <cluster/masklog.h>
37
38#include "ocfs2.h"
39
40#include "alloc.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "file.h"
44#include "inode.h"
45#include "journal.h"
46#include "namei.h"
47#include "suballoc.h"
48#include "super.h"
49#include "symlink.h"
50#include "sysfile.h"
51#include "uptodate.h"
52#include "vote.h"
53
54#include "buffer_head_io.h"
55
56#define OCFS2_FI_FLAG_NOWAIT 0x1
57#define OCFS2_FI_FLAG_DELETE 0x2
58struct ocfs2_find_inode_args
59{
60 u64 fi_blkno;
61 unsigned long fi_ino;
62 unsigned int fi_flags;
63};
64
65static int ocfs2_read_locked_inode(struct inode *inode,
66 struct ocfs2_find_inode_args *args);
67static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
68static int ocfs2_find_actor(struct inode *inode, void *opaque);
69static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
70 struct inode *inode,
71 struct buffer_head *fe_bh);
72
73struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
74 u64 blkno,
75 int delete_vote)
76{
77 struct ocfs2_find_inode_args args;
78
79 /* ocfs2_ilookup_for_vote should *only* be called from the
80 * vote thread */
81 BUG_ON(current != osb->vote_task);
82
83 args.fi_blkno = blkno;
84 args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
85 if (delete_vote)
86 args.fi_flags |= OCFS2_FI_FLAG_DELETE;
87 args.fi_ino = ino_from_blkno(osb->sb, blkno);
88 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
89}
90
91struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno)
92{
93 struct inode *inode = NULL;
94 struct super_block *sb = osb->sb;
95 struct ocfs2_find_inode_args args;
96
97 mlog_entry("(blkno = %"MLFu64")\n", blkno);
98
99 /* Ok. By now we've either got the offsets passed to us by the
 100	 * caller, or we just pulled them off the bh. Let's do some
101 * sanity checks to make sure they're OK. */
102 if (blkno == 0) {
103 inode = ERR_PTR(-EINVAL);
104 mlog_errno(PTR_ERR(inode));
105 goto bail;
106 }
107
108 args.fi_blkno = blkno;
109 args.fi_flags = 0;
110 args.fi_ino = ino_from_blkno(sb, blkno);
111
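	/*
	 * i_ino here is effectively just a hash key for the inode
	 * cache; the real identity is the disk block number, which
	 * ocfs2_find_actor() compares against ip_blkno.
	 */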
112 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor,
113 ocfs2_init_locked_inode, &args);
114 /* inode was *not* in the inode cache. 2.6.x requires
115 * us to do our own read_inode call and unlock it
116 * afterwards. */
117 if (inode && inode->i_state & I_NEW) {
118 mlog(0, "Inode was not in inode cache, reading it.\n");
119 ocfs2_read_locked_inode(inode, &args);
120 unlock_new_inode(inode);
121 }
122 if (inode == NULL) {
123 inode = ERR_PTR(-ENOMEM);
124 mlog_errno(PTR_ERR(inode));
125 goto bail;
126 }
127 if (is_bad_inode(inode)) {
128 iput(inode);
129 inode = ERR_PTR(-ESTALE);
130 mlog_errno(PTR_ERR(inode));
131 goto bail;
132 }
133
134bail:
135 if (!IS_ERR(inode)) {
136 mlog(0, "returning inode with number %"MLFu64"\n",
137 OCFS2_I(inode)->ip_blkno);
138 mlog_exit_ptr(inode);
139 } else
140 mlog_errno(PTR_ERR(inode));
141
142 return inode;
143}
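/* An illustrative caller sketch (hypothetical, not part of this
 * patch; it simply follows ocfs2_iget's return conventions):
 *
 *	inode = ocfs2_iget(osb, blkno);
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);
 *	...use the inode...
 *	iput(inode);
 */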
144
145
146/*
147 * here's how inodes get read from disk:
148 * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR
149 * found? : return the in-memory inode
150 * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE
151 */
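/* Put differently (a sketch of how this file uses the 2.6
 * iget5_locked() contract): ocfs2_find_actor() matches on ip_blkno
 * rather than i_ino, presumably because ino_from_blkno() can map
 * distinct block numbers to the same inode number, and
 * ocfs2_init_locked_inode() only records the block number so that the
 * actual disk read can happen later in ocfs2_read_locked_inode(). */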
152
153static int ocfs2_find_actor(struct inode *inode, void *opaque)
154{
155 struct ocfs2_find_inode_args *args = NULL;
156 struct ocfs2_inode_info *oi = OCFS2_I(inode);
157 int ret = 0;
158
159 mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
160
161 args = opaque;
162
163 mlog_bug_on_msg(!inode, "No inode in find actor!\n");
164
165 if (oi->ip_blkno != args->fi_blkno)
166 goto bail;
167
168 /* OCFS2_FI_FLAG_NOWAIT is *only* set from
169 * ocfs2_ilookup_for_vote which won't create an inode for one
170 * that isn't found. The vote thread which doesn't want to get
171 * an inode which is in the process of going away - otherwise
172 * the call to __wait_on_freeing_inode in find_inode_fast will
173 * cause it to deadlock on an inode which may be waiting on a
174 * vote (or lock release) in delete_inode */
175 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
176 (inode->i_state & (I_FREEING|I_CLEAR))) {
177 /* As stated above, we're not going to return an
178 * inode. In the case of a delete vote, the voting
179 * code is going to signal the other node to go
180 * ahead. Mark that state here, so this freeing inode
181 * has the state when it gets to delete_inode. */
182 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
183 spin_lock(&oi->ip_lock);
184 ocfs2_mark_inode_remotely_deleted(inode);
185 spin_unlock(&oi->ip_lock);
186 }
187 goto bail;
188 }
189
190 ret = 1;
191bail:
192 mlog_exit(ret);
193 return ret;
194}
195
196/*
197 * initialize the new inode, but don't do anything that would cause
198 * us to sleep.
199 * return 0 on success, 1 on failure
200 */
201static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
202{
203 struct ocfs2_find_inode_args *args = opaque;
204
205 mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
206
207 inode->i_ino = args->fi_ino;
208 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
209
210 mlog_exit(0);
211 return 0;
212}
213
214int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
215 int create_ino)
216{
217 struct super_block *sb;
218 struct ocfs2_super *osb;
219 int status = -EINVAL;
220
221 mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size);
222
223 sb = inode->i_sb;
224 osb = OCFS2_SB(sb);
225
226 /* this means that read_inode cannot create a superblock inode
227 * today. Change if needed. */
228 if (!OCFS2_IS_VALID_DINODE(fe) ||
229 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
230 mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", "
231 "signature = %.*s, flags = 0x%x\n",
232 inode->i_ino, le64_to_cpu(fe->i_blkno), 7,
233 fe->i_signature, le32_to_cpu(fe->i_flags));
234 goto bail;
235 }
236
237 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
238 mlog(ML_ERROR, "file entry generation does not match "
239 "superblock! osb->fs_generation=%x, "
240 "fe->i_fs_generation=%x\n",
241 osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
242 goto bail;
243 }
244
245 inode->i_version = 1;
246 inode->i_generation = le32_to_cpu(fe->i_generation);
247 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
248 inode->i_mode = le16_to_cpu(fe->i_mode);
249 inode->i_uid = le32_to_cpu(fe->i_uid);
250 inode->i_gid = le32_to_cpu(fe->i_gid);
251 inode->i_blksize = (u32)osb->s_clustersize;
252
253 /* Fast symlinks will have i_size but no allocated clusters. */
254 if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
255 inode->i_blocks = 0;
256 else
257 inode->i_blocks =
258 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
259 inode->i_mapping->a_ops = &ocfs2_aops;
260 inode->i_flags |= S_NOATIME;
261 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
262 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
263 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
264 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
265 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
266 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
267
268 if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
269 mlog(ML_ERROR,
270 "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n",
271 OCFS2_I(inode)->ip_blkno, fe->i_blkno);
272
273 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
274 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
275
276 if (create_ino)
277 inode->i_ino = ino_from_blkno(inode->i_sb,
278 le64_to_cpu(fe->i_blkno));
279
280 mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n",
281 fe->i_blkno, inode->i_ino, create_ino ? "true" : "false");
282
283 inode->i_nlink = le16_to_cpu(fe->i_links_count);
284
285 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
286 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
287 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
288 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
289 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
290 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
291 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
292 /* we can't actually hit this as read_inode can't
293 * handle superblocks today ;-) */
294 BUG();
295 }
296
297 switch (inode->i_mode & S_IFMT) {
298 case S_IFREG:
299 inode->i_fop = &ocfs2_fops;
300 inode->i_op = &ocfs2_file_iops;
301 i_size_write(inode, le64_to_cpu(fe->i_size));
302 break;
303 case S_IFDIR:
304 inode->i_op = &ocfs2_dir_iops;
305 inode->i_fop = &ocfs2_dops;
306 i_size_write(inode, le64_to_cpu(fe->i_size));
307 break;
308 case S_IFLNK:
309 if (ocfs2_inode_is_fast_symlink(inode))
310 inode->i_op = &ocfs2_fast_symlink_inode_operations;
311 else
312 inode->i_op = &ocfs2_symlink_inode_operations;
313 i_size_write(inode, le64_to_cpu(fe->i_size));
314 break;
315 default:
316 inode->i_op = &ocfs2_special_file_iops;
317 init_special_inode(inode, inode->i_mode,
318 inode->i_rdev);
319 break;
320 }
321
322 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
323 OCFS2_LOCK_TYPE_RW, inode);
324 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
325 OCFS2_LOCK_TYPE_META, inode);
326 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
327 OCFS2_LOCK_TYPE_DATA, inode);
328
329 status = 0;
330bail:
331 mlog_exit(status);
332 return status;
333}
334
335static int ocfs2_read_locked_inode(struct inode *inode,
336 struct ocfs2_find_inode_args *args)
337{
338 struct super_block *sb;
339 struct ocfs2_super *osb;
340 struct ocfs2_dinode *fe;
341 struct buffer_head *bh = NULL;
342 int status;
343 int sysfile = 0;
344
345 mlog_entry("(0x%p, 0x%p)\n", inode, args);
346
347 status = -EINVAL;
348 if (inode == NULL || inode->i_sb == NULL) {
349 mlog(ML_ERROR, "bad inode\n");
350 goto bail;
351 }
352 sb = inode->i_sb;
353 osb = OCFS2_SB(sb);
354
355 if (!args) {
356 mlog(ML_ERROR, "bad inode args\n");
357 make_bad_inode(inode);
358 goto bail;
359 }
360
361 /* Read the FE off disk. This is safe because the kernel only
362 * does one read_inode2 for a new inode, and if it doesn't
363 * exist yet then nobody can be working on it! */
364 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL);
365 if (status < 0) {
366 mlog_errno(status);
367 make_bad_inode(inode);
368 goto bail;
369 }
370
371 fe = (struct ocfs2_dinode *) bh->b_data;
372 if (!OCFS2_IS_VALID_DINODE(fe)) {
373 mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
374 fe->i_blkno, 7, fe->i_signature);
375 make_bad_inode(inode);
376 goto bail;
377 }
378
379 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
380 sysfile = 1;
381
382 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
383 S_ISBLK(le16_to_cpu(fe->i_mode)))
384 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
385
386 status = -EINVAL;
387 if (ocfs2_populate_inode(inode, fe, 0) < 0) {
388 mlog(ML_ERROR, "populate inode failed! i_blkno=%"MLFu64", "
389 "i_ino=%lu\n", fe->i_blkno, inode->i_ino);
390 make_bad_inode(inode);
391 goto bail;
392 }
393
394 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
395
396 if (sysfile)
397 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
398
399 status = 0;
400
401bail:
402 if (args && bh)
403 brelse(bh);
404
405 mlog_exit(status);
406 return status;
407}
408
409void ocfs2_sync_blockdev(struct super_block *sb)
410{
411 sync_blockdev(sb->s_bdev);
412}
413
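/* A note on the shape of the function below: the truncate for delete
 * runs in stages. A small transaction first sets i_size to zero and
 * is committed; ocfs2_prepare_truncate() and ocfs2_commit_truncate()
 * then free the actual allocation, presumably so that no single
 * transaction needs credits proportional to the file size. */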
414static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
415 struct inode *inode,
416 struct buffer_head *fe_bh)
417{
418 int status = 0;
419 struct ocfs2_journal_handle *handle = NULL;
420 struct ocfs2_truncate_context *tc = NULL;
421 struct ocfs2_dinode *fe;
422
423 mlog_entry_void();
424
425 fe = (struct ocfs2_dinode *) fe_bh->b_data;
426
427 /* zero allocation, zero truncate :) */
428 if (!fe->i_clusters)
429 goto bail;
430
431 handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
432 if (IS_ERR(handle)) {
433 status = PTR_ERR(handle);
434 handle = NULL;
435 mlog_errno(status);
436 goto bail;
437 }
438
439 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
440 if (status < 0) {
441 mlog_errno(status);
442 goto bail;
443 }
444
445 ocfs2_commit_trans(handle);
446 handle = NULL;
447
448 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
449 if (status < 0) {
450 mlog_errno(status);
451 goto bail;
452 }
453
454 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
455 if (status < 0) {
456 mlog_errno(status);
457 goto bail;
458 }
459bail:
460 if (handle)
461 ocfs2_commit_trans(handle);
462
463 mlog_exit(status);
464 return status;
465}
466
467static int ocfs2_remove_inode(struct inode *inode,
468 struct buffer_head *di_bh,
469 struct inode *orphan_dir_inode,
470 struct buffer_head *orphan_dir_bh)
471{
472 int status;
473 struct inode *inode_alloc_inode = NULL;
474 struct buffer_head *inode_alloc_bh = NULL;
475 struct ocfs2_journal_handle *handle;
476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
477 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
478
479 inode_alloc_inode =
480 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
481 le16_to_cpu(di->i_suballoc_slot));
482 if (!inode_alloc_inode) {
483 status = -EEXIST;
484 mlog_errno(status);
485 goto bail;
486 }
487
488 down(&inode_alloc_inode->i_sem);
489 status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1);
490 if (status < 0) {
491 up(&inode_alloc_inode->i_sem);
492
493 mlog_errno(status);
494 goto bail;
495 }
496
497 handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS);
498 if (IS_ERR(handle)) {
499 status = PTR_ERR(handle);
500 mlog_errno(status);
501 goto bail_unlock;
502 }
503
504 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
505 orphan_dir_bh);
506 if (status < 0) {
507 mlog_errno(status);
508 goto bail_commit;
509 }
510
511 /* set the inode's dtime */
512 status = ocfs2_journal_access(handle, inode, di_bh,
513 OCFS2_JOURNAL_ACCESS_WRITE);
514 if (status < 0) {
515 mlog_errno(status);
516 goto bail_commit;
517 }
518
519 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec);
520 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
521
522 status = ocfs2_journal_dirty(handle, di_bh);
523 if (status < 0) {
524 mlog_errno(status);
525 goto bail_commit;
526 }
527
528 ocfs2_remove_from_cache(inode, di_bh);
529
530 status = ocfs2_free_dinode(handle, inode_alloc_inode,
531 inode_alloc_bh, di);
532 if (status < 0)
533 mlog_errno(status);
534
535bail_commit:
536 ocfs2_commit_trans(handle);
537bail_unlock:
538 ocfs2_meta_unlock(inode_alloc_inode, 1);
539 up(&inode_alloc_inode->i_sem);
540 brelse(inode_alloc_bh);
541bail:
542 iput(inode_alloc_inode);
543
544 return status;
545}
546
547static int ocfs2_wipe_inode(struct inode *inode,
548 struct buffer_head *di_bh)
549{
550 int status, orphaned_slot;
551 struct inode *orphan_dir_inode = NULL;
552 struct buffer_head *orphan_dir_bh = NULL;
553 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
554
555 /* We've already voted on this so it should be readonly - no
556 * spinlock needed. */
557 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
558 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
559 ORPHAN_DIR_SYSTEM_INODE,
560 orphaned_slot);
561 if (!orphan_dir_inode) {
562 status = -EEXIST;
563 mlog_errno(status);
564 goto bail;
565 }
566
567 /* Lock the orphan dir. The lock will be held for the entire
568 * delete_inode operation. We do this now to avoid races with
569 * recovery completion on other nodes. */
570 down(&orphan_dir_inode->i_sem);
571 status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1);
572 if (status < 0) {
573 up(&orphan_dir_inode->i_sem);
574
575 mlog_errno(status);
576 goto bail;
577 }
578
579 /* we do this while holding the orphan dir lock because we
580 * don't want recovery being run from another node to vote for
581 * an inode delete on us -- this will result in two nodes
582 * truncating the same file! */
583 status = ocfs2_truncate_for_delete(osb, inode, di_bh);
584 if (status < 0) {
585 mlog_errno(status);
586 goto bail_unlock_dir;
587 }
588
589 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
590 orphan_dir_bh);
591 if (status < 0)
592 mlog_errno(status);
593
594bail_unlock_dir:
595 ocfs2_meta_unlock(orphan_dir_inode, 1);
596 up(&orphan_dir_inode->i_sem);
597 brelse(orphan_dir_bh);
598bail:
599 iput(orphan_dir_inode);
600
601 return status;
602}
603
604/* There is a series of simple checks that should be done before a
605 * vote is even considered. Encapsulate those in this function. */
606static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
607{
608 int ret = 0;
609 struct ocfs2_inode_info *oi = OCFS2_I(inode);
610 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
611
612 /* We shouldn't be getting here for the root directory
613 * inode. */
614 if (inode == osb->root_inode) {
615 mlog(ML_ERROR, "Skipping delete of root inode.\n");
616 goto bail;
617 }
618
619 /* If we're coming from process_vote we can't go into our own
620 * voting [hello, deadlock city!], so unfortunately we just
621 * have to skip deleting this guy. That's OK though because
622 * the node that's doing the actual deleting should handle it
623 * anyway. */
624 if (current == osb->vote_task) {
625 mlog(0, "Skipping delete of %lu because we're currently "
626 "in process_vote\n", inode->i_ino);
627 goto bail;
628 }
629
630 spin_lock(&oi->ip_lock);
631 /* OCFS2 *never* deletes system files. This should technically
632 * never get here as system file inodes should always have a
633 * positive link count. */
634 if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
635 mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n",
636 oi->ip_blkno);
637 goto bail_unlock;
638 }
639
640 /* If we have voted "yes" on the wipe of this inode for
641 * another node, it will be marked here so we can safely skip
642 * it. Recovery will clean up any inodes we might inadvertently
643 * skip here. */
644 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
645 mlog(0, "Skipping delete of %lu because another node "
646 "has done this for us.\n", inode->i_ino);
647 goto bail_unlock;
648 }
649
650 ret = 1;
651bail_unlock:
652 spin_unlock(&oi->ip_lock);
653bail:
654 return ret;
655}
656
657/* Query the cluster to determine whether we should wipe an inode from
658 * disk or not.
659 *
660 * Requires the inode to have the cluster lock. */
661static int ocfs2_query_inode_wipe(struct inode *inode,
662 struct buffer_head *di_bh,
663 int *wipe)
664{
665 int status = 0;
666 struct ocfs2_inode_info *oi = OCFS2_I(inode);
667 struct ocfs2_dinode *di;
668
669 *wipe = 0;
670
671 /* While we were waiting for the cluster lock in
672 * ocfs2_delete_inode, another node might have asked to delete
673 * the inode. Recheck our flags to catch this. */
674 if (!ocfs2_inode_is_valid_to_delete(inode)) {
675 mlog(0, "Skipping delete of %"MLFu64" because flags changed\n",
676 oi->ip_blkno);
677 goto bail;
678 }
679
680 /* Now that we have an up to date inode, we can double check
681 * the link count. */
682 if (inode->i_nlink) {
683 mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n",
684 oi->ip_blkno, inode->i_nlink);
685 goto bail;
686 }
687
688 /* Do some basic inode verification... */
689 di = (struct ocfs2_dinode *) di_bh->b_data;
690 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
691 /* for lack of a better error? */
692 status = -EEXIST;
693 mlog(ML_ERROR,
694 "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! "
695 "Disk flags 0x%x, inode flags 0x%x\n",
696 oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags);
697 goto bail;
698 }
699
700 /* has someone already deleted us?! baaad... */
701 if (di->i_dtime) {
702 status = -EEXIST;
703 mlog_errno(status);
704 goto bail;
705 }
706
707 status = ocfs2_request_delete_vote(inode);
708 /* -EBUSY means that other nodes are still using the
709 * inode. We're done here though, so avoid doing anything on
710 * disk and let them worry about deleting it. */
711 if (status == -EBUSY) {
712 status = 0;
713 mlog(0, "Skipping delete of %"MLFu64" because it is in use on "
714 "other nodes\n", oi->ip_blkno);
715 goto bail;
716 }
717 if (status < 0) {
718 mlog_errno(status);
719 goto bail;
720 }
721
722 spin_lock(&oi->ip_lock);
723 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
724 /* Nobody knew which slot this inode was orphaned
725 * into. This may happen during node death, and
726 * recovery knows how to clean it up, so we can safely
727 * ignore this inode from now on. */
728 mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n",
729 oi->ip_blkno);
730 } else {
731 *wipe = 1;
732
733 mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n",
734 oi->ip_blkno, oi->ip_orphaned_slot);
735 }
736 spin_unlock(&oi->ip_lock);
737
738bail:
739 return status;
740}
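/* Summarizing the checks above: *wipe is set only when the inode is
 * still valid to delete, has a zero link count, carries
 * OCFS2_ORPHANED_FL with no dtime recorded, no other node answered
 * the delete vote with -EBUSY, and we know which slot's orphan dir
 * holds the inode. */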
741
742/* Support function for ocfs2_delete_inode. Will help us keep the
743 * inode data in a consistent state for clear_inode. Always truncates
744 * pages, optionally syncing them first. */
745static void ocfs2_cleanup_delete_inode(struct inode *inode,
746 int sync_data)
747{
748 mlog(0, "Cleanup inode %"MLFu64", sync = %d\n",
749 OCFS2_I(inode)->ip_blkno, sync_data);
750 if (sync_data)
751 write_inode_now(inode, 1);
752 truncate_inode_pages(&inode->i_data, 0);
753}
754
755void ocfs2_delete_inode(struct inode *inode)
756{
757 int wipe, status;
758 sigset_t blocked, oldset;
759 struct buffer_head *di_bh = NULL;
760
761 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
762
763 if (is_bad_inode(inode)) {
764 mlog(0, "Skipping delete of bad inode\n");
765 goto bail;
766 }
767
768 if (!ocfs2_inode_is_valid_to_delete(inode)) {
769 /* It's probably not necessary to truncate_inode_pages
770 * here but we do it for safety anyway (it will most
771 * likely be a no-op anyway) */
772 ocfs2_cleanup_delete_inode(inode, 0);
773 goto bail;
774 }
775
776 /* We want to block signals in delete_inode as the lock and
777 * messaging paths may return us -ERESTARTSYS, which would
778 * cause us to exit early, resulting in inodes being orphaned
779 * forever. */
780 sigfillset(&blocked);
781 status = sigprocmask(SIG_BLOCK, &blocked, &oldset);
782 if (status < 0) {
783 mlog_errno(status);
784 ocfs2_cleanup_delete_inode(inode, 1);
785 goto bail;
786 }
787
788 /* Lock down the inode. This gives us an up to date view of
789 * its metadata (for verification), and allows us to
790 * serialize delete_inode votes.
791 *
792 * Even though we might be doing a truncate, we don't take the
793 * allocation lock here as it won't be needed - nobody will
794 * have the file open.
795 */
796 status = ocfs2_meta_lock(inode, NULL, &di_bh, 1);
797 if (status < 0) {
798 if (status != -ENOENT)
799 mlog_errno(status);
800 ocfs2_cleanup_delete_inode(inode, 0);
801 goto bail_unblock;
802 }
803
804 /* Query the cluster. This will be the final decision made
805 * before we go ahead and wipe the inode. */
806 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
807 if (!wipe || status < 0) {
808 /* Error and inode busy vote both mean we won't be
809 * removing the inode, so they take almost the same
810 * path. */
811 if (status < 0)
812 mlog_errno(status);
813
814 /* Someone in the cluster has voted to not wipe this
815 * inode, or it was never completely orphaned. Write
816 * out the pages and exit now. */
817 ocfs2_cleanup_delete_inode(inode, 1);
818 goto bail_unlock_inode;
819 }
820
821 ocfs2_cleanup_delete_inode(inode, 0);
822
823 status = ocfs2_wipe_inode(inode, di_bh);
824 if (status < 0) {
825 mlog_errno(status);
826 goto bail_unlock_inode;
827 }
828
829 /* Mark the inode as successfully deleted. This is important
830 * for ocfs2_clear_inode as it will check this flag and skip
831 * any checkpointing work. */
832 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED;
833
834bail_unlock_inode:
835 ocfs2_meta_unlock(inode, 1);
836 brelse(di_bh);
837bail_unblock:
838 status = sigprocmask(SIG_SETMASK, &oldset, NULL);
839 if (status < 0)
840 mlog_errno(status);
841bail:
842 clear_inode(inode);
843 mlog_exit_void();
844}
845
846void ocfs2_clear_inode(struct inode *inode)
847{
848 int status;
849 struct ocfs2_inode_info *oi = OCFS2_I(inode);
850
851 mlog_entry_void();
852
853 if (!inode)
854 goto bail;
855
856 mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n",
857 OCFS2_I(inode)->ip_blkno, inode->i_nlink);
858
859 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
860 "Inode=%lu\n", inode->i_ino);
861
862 /* Do these before all the other work so that we don't bounce
863 * the vote thread while waiting to destroy the locks. */
864 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
865 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
866 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
867
868 /* We may very well get a clear_inode before all of an inode's
869 * metadata has hit disk. Of course, we can't drop any cluster
870 * locks until the journal has finished with it. The only
871 * exception here are successfully wiped inodes - their
872 * metadata can now be considered to be part of the system
873 * inodes from which it came. */
874 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED))
875 ocfs2_checkpoint_inode(inode);
876
877 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
878 "Clear inode of %"MLFu64", inode has io markers\n",
879 oi->ip_blkno);
880
881 ocfs2_extent_map_drop(inode, 0);
882 ocfs2_extent_map_init(inode);
883
884 status = ocfs2_drop_inode_locks(inode);
885 if (status < 0)
886 mlog_errno(status);
887
888 ocfs2_lock_res_free(&oi->ip_rw_lockres);
889 ocfs2_lock_res_free(&oi->ip_meta_lockres);
890 ocfs2_lock_res_free(&oi->ip_data_lockres);
891
892 ocfs2_metadata_cache_purge(inode);
893
894 mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
895 "Clear inode of %"MLFu64", inode has %u cache items\n",
896 oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
897
898 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
899 "Clear inode of %"MLFu64", inode has a bad flag\n",
900 oi->ip_blkno);
901
902 mlog_bug_on_msg(spin_is_locked(&oi->ip_lock),
903 "Clear inode of %"MLFu64", inode is locked\n",
904 oi->ip_blkno);
905
906 mlog_bug_on_msg(down_trylock(&oi->ip_io_sem),
907 "Clear inode of %"MLFu64", io_sem is locked\n",
908 oi->ip_blkno);
909 up(&oi->ip_io_sem);
910
911 /*
912 * down_trylock() returns 0, down_write_trylock() returns 1
913 * kernel 1, world 0
914 */
915 mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem),
916 "Clear inode of %"MLFu64", alloc_sem is locked\n",
917 oi->ip_blkno);
918 up_write(&oi->ip_alloc_sem);
919
920 mlog_bug_on_msg(oi->ip_open_count,
921 "Clear inode of %"MLFu64" has open count %d\n",
922 oi->ip_blkno, oi->ip_open_count);
923 mlog_bug_on_msg(!list_empty(&oi->ip_handle_list),
924 "Clear inode of %"MLFu64" has non empty handle list\n",
925 oi->ip_blkno);
926 mlog_bug_on_msg(oi->ip_handle,
927 "Clear inode of %"MLFu64" has non empty handle pointer\n",
928 oi->ip_blkno);
929
930 /* Clear all other flags. */
931 oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
932 oi->ip_created_trans = 0;
933 oi->ip_last_trans = 0;
934 oi->ip_dir_start_lookup = 0;
935 oi->ip_blkno = 0ULL;
936
937bail:
938 mlog_exit_void();
939}
940
941/* Called under inode_lock, with no more references on the
942 * struct inode, so it's safe here to check the flags field
943 * and to manipulate i_nlink without any other locks. */
944void ocfs2_drop_inode(struct inode *inode)
945{
946 struct ocfs2_inode_info *oi = OCFS2_I(inode);
947
948 mlog_entry_void();
949
950 mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n",
951 oi->ip_blkno, inode->i_nlink, oi->ip_flags);
952
953 /* Testing ip_orphaned_slot here wouldn't work because we may
954 * not have gotten a delete_inode vote from any other nodes
955 * yet. */
956 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) {
957 mlog(0, "Inode was orphaned on another node, clearing nlink.\n");
958 inode->i_nlink = 0;
959 }
960
961 generic_drop_inode(inode);
962
963 mlog_exit_void();
964}
965
966/*
967 * TODO: this should probably be merged into ocfs2_get_block
968 *
969 * However, you now need to pay attention to the cont_prepare_write()
970 * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
971 * expects never to extend).
972 */
973struct buffer_head *ocfs2_bread(struct inode *inode,
974 int block, int *err, int reada)
975{
976 struct buffer_head *bh = NULL;
977 int tmperr;
978 u64 p_blkno;
979 int readflags = OCFS2_BH_CACHED;
980
981#if 0
982 /* only turn this on if we know we can deal with read_block
983 * returning nothing */
984 if (reada)
985 readflags |= OCFS2_BH_READAHEAD;
986#endif
987
988 if (((u64)block << inode->i_sb->s_blocksize_bits) >=
989 i_size_read(inode)) {
990 BUG_ON(!reada);
991 return NULL;
992 }
993
994 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
995 &p_blkno, NULL);
996 if (tmperr < 0) {
997 mlog_errno(tmperr);
998 goto fail;
999 }
1000
1001 tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
1002 readflags, inode);
1003 if (tmperr < 0)
1004 goto fail;
1005
1006 tmperr = 0;
1007
1008 *err = 0;
1009 return bh;
1010
1011fail:
1012 if (bh) {
1013 brelse(bh);
1014 bh = NULL;
1015 }
1016 *err = -EIO;
1017 return NULL;
1018}
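/* An illustrative (hypothetical) caller sketch for ocfs2_bread();
 * with reada set, a NULL return is not necessarily an error:
 *
 *	int err;
 *	struct buffer_head *bh = ocfs2_bread(inode, block, &err, 0);
 *	if (!bh)
 *		return err;
 *	...read bh->b_data...
 *	brelse(bh);
 */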
1019
1020/*
1021 * This is called from our getattr.
1022 */
1023int ocfs2_inode_revalidate(struct dentry *dentry)
1024{
1025 struct inode *inode = dentry->d_inode;
1026 int status = 0;
1027
1028 mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode,
1029 inode ? OCFS2_I(inode)->ip_blkno : 0ULL);
1030
1031 if (!inode) {
1032 mlog(0, "eep, no inode!\n");
1033 status = -ENOENT;
1034 goto bail;
1035 }
1036
1037 spin_lock(&OCFS2_I(inode)->ip_lock);
1038 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
1039 spin_unlock(&OCFS2_I(inode)->ip_lock);
1040 mlog(0, "inode deleted!\n");
1041 status = -ENOENT;
1042 goto bail;
1043 }
1044 spin_unlock(&OCFS2_I(inode)->ip_lock);
1045
1046 /* Let ocfs2_meta_lock do the work of updating our struct
1047 * inode for us. */
1048 status = ocfs2_meta_lock(inode, NULL, NULL, 0);
1049 if (status < 0) {
1050 if (status != -ENOENT)
1051 mlog_errno(status);
1052 goto bail;
1053 }
1054 ocfs2_meta_unlock(inode, 0);
1055bail:
1056 mlog_exit(status);
1057
1058 return status;
1059}
1060
1061/*
1062 * Updates a disk inode from a
1063 * struct inode.
1064 * Only takes ip_lock.
1065 */
1066int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
1067 struct inode *inode,
1068 struct buffer_head *bh)
1069{
1070 int status;
1071 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
1072
1073 mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno);
1074
1075 status = ocfs2_journal_access(handle, inode, bh,
1076 OCFS2_JOURNAL_ACCESS_WRITE);
1077 if (status < 0) {
1078 mlog_errno(status);
1079 goto leave;
1080 }
1081
1082 spin_lock(&OCFS2_I(inode)->ip_lock);
1083 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1084 spin_unlock(&OCFS2_I(inode)->ip_lock);
1085
1086 fe->i_size = cpu_to_le64(i_size_read(inode));
1087 fe->i_links_count = cpu_to_le16(inode->i_nlink);
1088 fe->i_uid = cpu_to_le32(inode->i_uid);
1089 fe->i_gid = cpu_to_le32(inode->i_gid);
1090 fe->i_mode = cpu_to_le16(inode->i_mode);
1091 fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
1092 fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
1093 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1094 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1095 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
1096 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
1097
1098 status = ocfs2_journal_dirty(handle, bh);
1099 if (status < 0)
1100 mlog_errno(status);
1101
1102 status = 0;
1103leave:
1104
1105 mlog_exit(status);
1106 return status;
1107}
1108
1109/*
1110 *
1111 * Updates a struct inode from a disk inode.
1112 * Does no I/O; only takes ip_lock.
1113 */
1114void ocfs2_refresh_inode(struct inode *inode,
1115 struct ocfs2_dinode *fe)
1116{
1117 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1118
1119 spin_lock(&OCFS2_I(inode)->ip_lock);
1120
1121 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1122 i_size_write(inode, le64_to_cpu(fe->i_size));
1123 inode->i_nlink = le16_to_cpu(fe->i_links_count);
1124 inode->i_uid = le32_to_cpu(fe->i_uid);
1125 inode->i_gid = le32_to_cpu(fe->i_gid);
1126 inode->i_mode = le16_to_cpu(fe->i_mode);
1127 inode->i_blksize = (u32) osb->s_clustersize;
1128 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
1129 inode->i_blocks = 0;
1130 else
1131 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
1132 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
1133 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
1134 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
1135 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
1136 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
1137 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
1138
1139 spin_unlock(&OCFS2_I(inode)->ip_lock);
1140}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
new file mode 100644
index 000000000000..9b0177433653
--- /dev/null
+++ b/fs/ocfs2/inode.h
@@ -0,0 +1,145 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * inode.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_INODE_H
27#define OCFS2_INODE_H
28
29/* OCFS2 Inode Private Data */
30struct ocfs2_inode_info
31{
32 u64 ip_blkno;
33
34 struct ocfs2_lock_res ip_rw_lockres;
35 struct ocfs2_lock_res ip_meta_lockres;
36 struct ocfs2_lock_res ip_data_lockres;
37
38 /* protects allocation changes on this inode. */
39 struct rw_semaphore ip_alloc_sem;
40
41 /* These fields are protected by ip_lock */
42 spinlock_t ip_lock;
43 u32 ip_open_count;
44 u32 ip_clusters;
45 struct ocfs2_extent_map ip_map;
46 struct list_head ip_io_markers;
47 int ip_orphaned_slot;
48
49 struct semaphore ip_io_sem;
50
51 /* Used by the journalling code to attach an inode to a
52 * handle. These are protected by ip_io_sem in order to lock
53 * out other I/O to the inode until we either commit or
54 * abort. */
55 struct list_head ip_handle_list;
56 struct ocfs2_journal_handle *ip_handle;
57
58 u32 ip_flags; /* see below */
59
60 /* protected by recovery_lock. */
61 struct inode *ip_next_orphan;
62
63 u32 ip_dir_start_lookup;
64
65 /* next two are protected by trans_inc_lock */
66 /* which transaction were we created on? Zero if none. */
67 unsigned long ip_created_trans;
68 /* last transaction we were a part of. */
69 unsigned long ip_last_trans;
70
71 struct ocfs2_caching_info ip_metadata_cache;
72
73 struct inode vfs_inode;
74};
75
76/*
77 * Flags for the ip_flags field
78 */
79/* System file inodes */
80#define OCFS2_INODE_SYSTEM_FILE 0x00000001
81#define OCFS2_INODE_JOURNAL 0x00000002
82#define OCFS2_INODE_BITMAP 0x00000004
83/* This inode has been wiped from disk */
84#define OCFS2_INODE_DELETED 0x00000008
85/* Another node is deleting, so our delete is a nop */
86#define OCFS2_INODE_SKIP_DELETE 0x00000010
87/* Has the inode been orphaned on another node?
88 *
89 * This hints to ocfs2_drop_inode that it should clear i_nlink before
90 * continuing.
91 *
92 * We *only* set this on an unlink vote from another node. If the inode
93 * was locally orphaned, then we're sure of the state and don't need
94 * to twiddle i_nlink later - it's either zero or not depending on
95 * whether our unlink succeeded. Otherwise we got this from a node
96 * whose intention was to orphan the inode; however, it may have
97 * crashed, failed, etc., so we let ocfs2_drop_inode zero the value and
98 * rely on ocfs2_delete_inode to sort things out under the proper
99 * cluster locks.
100 */
101#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
102/* Does someone have the file open O_DIRECT */
103#define OCFS2_INODE_OPEN_DIRECT 0x00000040
104/* Indicates that the metadata cache should be used as an array. */
105#define OCFS2_INODE_CACHE_INLINE 0x00000080
106
107static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
108{
109 return container_of(inode, struct ocfs2_inode_info, vfs_inode);
110}
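/* OCFS2_I() works because struct ocfs2_inode_info embeds its VFS
 * inode as the vfs_inode member; container_of() subtracts the member
 * offset to recover the enclosing private structure. */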
111
112#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
113#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
114
115extern kmem_cache_t *ocfs2_inode_cache;
116
117extern struct address_space_operations ocfs2_aops;
118
119struct buffer_head *ocfs2_bread(struct inode *inode, int block,
120 int *err, int reada);
121void ocfs2_clear_inode(struct inode *inode);
122void ocfs2_delete_inode(struct inode *inode);
123void ocfs2_drop_inode(struct inode *inode);
124struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
125struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
126 u64 blkno,
127 int delete_vote);
128int ocfs2_inode_init_private(struct inode *inode);
129int ocfs2_inode_revalidate(struct dentry *dentry);
130int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
131 int create_ino);
132void ocfs2_read_inode(struct inode *inode);
133void ocfs2_read_inode2(struct inode *inode, void *opaque);
134ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
135 size_t size, loff_t *offp);
136void ocfs2_sync_blockdev(struct super_block *sb);
137void ocfs2_refresh_inode(struct inode *inode,
138 struct ocfs2_dinode *fe);
139int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
140 struct inode *inode,
141 struct buffer_head *bh);
142int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
143int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
144
145#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
new file mode 100644
index 000000000000..04428042e5e5
--- /dev/null
+++ b/fs/ocfs2/journal.c
@@ -0,0 +1,1652 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * journal.c
5 *
6 * Defines functions of journalling api
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/kthread.h>
31
32#define MLOG_MASK_PREFIX ML_JOURNAL
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "extent_map.h"
40#include "heartbeat.h"
41#include "inode.h"
42#include "journal.h"
43#include "localalloc.h"
44#include "namei.h"
45#include "slot_map.h"
46#include "super.h"
47#include "vote.h"
48#include "sysfile.h"
49
50#include "buffer_head_io.h"
51
52spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
53
54static int ocfs2_force_read_journal(struct inode *inode);
55static int ocfs2_recover_node(struct ocfs2_super *osb,
56 int node_num);
57static int __ocfs2_recovery_thread(void *arg);
58static int ocfs2_commit_cache(struct ocfs2_super *osb);
59static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
60static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
61 struct ocfs2_journal_handle *handle);
62static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle);
63static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
64 int dirty);
65static int ocfs2_trylock_journal(struct ocfs2_super *osb,
66 int slot_num);
67static int ocfs2_recover_orphans(struct ocfs2_super *osb,
68 int slot);
69static int ocfs2_commit_thread(void *arg);
70
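/* A note on j_trans_barrier for the function below:
 * ocfs2_start_trans() takes it for read, so taking it for write here
 * drains all running transactions before the journal is flushed and
 * the transaction id is bumped. */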
71static int ocfs2_commit_cache(struct ocfs2_super *osb)
72{
73 int status = 0;
74 unsigned int flushed;
75 unsigned long old_id;
76 struct ocfs2_journal *journal = NULL;
77
78 mlog_entry_void();
79
80 journal = osb->journal;
81
82 /* Flush all pending commits and checkpoint the journal. */
83 down_write(&journal->j_trans_barrier);
84
85 if (atomic_read(&journal->j_num_trans) == 0) {
86 up_write(&journal->j_trans_barrier);
87 mlog(0, "No transactions for me to flush!\n");
88 goto finally;
89 }
90
91 journal_lock_updates(journal->j_journal);
92 status = journal_flush(journal->j_journal);
93 journal_unlock_updates(journal->j_journal);
94 if (status < 0) {
95 up_write(&journal->j_trans_barrier);
96 mlog_errno(status);
97 goto finally;
98 }
99
100 old_id = ocfs2_inc_trans_id(journal);
101
102 flushed = atomic_read(&journal->j_num_trans);
103 atomic_set(&journal->j_num_trans, 0);
104 up_write(&journal->j_trans_barrier);
105
106 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
107 journal->j_trans_id, flushed);
108
109 ocfs2_kick_vote_thread(osb);
110 wake_up(&journal->j_checkpointed);
111finally:
112 mlog_exit(status);
113 return status;
114}
115
116struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb)
117{
118 struct ocfs2_journal_handle *retval = NULL;
119
120 retval = kcalloc(1, sizeof(*retval), GFP_KERNEL);
121 if (!retval) {
122 mlog(ML_ERROR, "Failed to allocate memory for journal "
123 "handle!\n");
124 return NULL;
125 }
126
127 retval->max_buffs = 0;
128 retval->num_locks = 0;
129 retval->k_handle = NULL;
130
131 INIT_LIST_HEAD(&retval->locks);
132 INIT_LIST_HEAD(&retval->inode_list);
133 retval->journal = osb->journal;
134
135 return retval;
136}
137
138/* Pass it NULL and it will allocate a new handle object for you. If
139 * you pass it a handle, however, it may still return an error, in
140 * which case it has freed the passed handle for you. */
141struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
142 struct ocfs2_journal_handle *handle,
143 int max_buffs)
144{
145 int ret;
146 journal_t *journal = osb->journal->j_journal;
147
148 mlog_entry("(max_buffs = %d)\n", max_buffs);
149
150 if (!osb || !osb->journal->j_journal)
151 BUG();
152
153 if (ocfs2_is_hard_readonly(osb)) {
154 ret = -EROFS;
155 goto done_free;
156 }
157
158 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
159 BUG_ON(max_buffs <= 0);
160
161 /* JBD might support this, but our journalling code doesn't yet. */
162 if (journal_current_handle()) {
163 mlog(ML_ERROR, "Recursive transaction attempted!\n");
164 BUG();
165 }
166
167 if (!handle)
168 handle = ocfs2_alloc_handle(osb);
169 if (!handle) {
170 ret = -ENOMEM;
171 mlog(ML_ERROR, "Failed to allocate memory for journal "
172 "handle!\n");
173 goto done_free;
174 }
175
176 handle->max_buffs = max_buffs;
177
178 down_read(&osb->journal->j_trans_barrier);
179
180 /* actually start the transaction now */
181 handle->k_handle = journal_start(journal, max_buffs);
182 if (IS_ERR(handle->k_handle)) {
183 up_read(&osb->journal->j_trans_barrier);
184
185 ret = PTR_ERR(handle->k_handle);
186 handle->k_handle = NULL;
187 mlog_errno(ret);
188
189 if (is_journal_aborted(journal)) {
190 ocfs2_abort(osb->sb, "Detected aborted journal");
191 ret = -EROFS;
192 }
193 goto done_free;
194 }
195
196 atomic_inc(&(osb->journal->j_num_trans));
197 handle->flags |= OCFS2_HANDLE_STARTED;
198
199 mlog_exit_ptr(handle);
200 return handle;
201
202done_free:
203 if (handle)
204 ocfs2_commit_unstarted_handle(handle); /* will kfree handle */
205
206 mlog_exit(ret);
207 return ERR_PTR(ret);
208}
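/* The typical handle lifecycle, as seen in this patch's own callers
 * (e.g. ocfs2_remove_inode() and ocfs2_mark_inode_dirty()):
 *
 *	handle = ocfs2_start_trans(osb, NULL, credits);
 *	if (IS_ERR(handle))
 *		return PTR_ERR(handle);
 *	status = ocfs2_journal_access(handle, inode, bh,
 *				      OCFS2_JOURNAL_ACCESS_WRITE);
 *	...modify bh->b_data...
 *	status = ocfs2_journal_dirty(handle, bh);
 *	ocfs2_commit_trans(handle);
 */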
209
210void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
211 struct inode *inode)
212{
213 BUG_ON(!handle);
214 BUG_ON(!inode);
215
216 atomic_inc(&inode->i_count);
217
218 /* we're obviously changing it... */
219 down(&inode->i_sem);
220
221 /* sanity check */
222 BUG_ON(OCFS2_I(inode)->ip_handle);
223 BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
224
225 OCFS2_I(inode)->ip_handle = handle;
226 list_del(&(OCFS2_I(inode)->ip_handle_list));
227 list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
228}
229
230static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
231{
232 struct list_head *p, *n;
233 struct inode *inode;
234 struct ocfs2_inode_info *oi;
235
236 list_for_each_safe(p, n, &handle->inode_list) {
237 oi = list_entry(p, struct ocfs2_inode_info,
238 ip_handle_list);
239 inode = &oi->vfs_inode;
240
241 OCFS2_I(inode)->ip_handle = NULL;
242 list_del_init(&OCFS2_I(inode)->ip_handle_list);
243
244 up(&inode->i_sem);
245 iput(inode);
246 }
247}
248
249/* This is trivial so we do it out of the main commit
250 * paths. Beware, it can be called from start_trans too! */
251static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle)
252{
253 mlog_entry_void();
254
255 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
256
257 ocfs2_handle_unlock_inodes(handle);
258 /* You are allowed to add journal locks before the transaction
259 * has started. */
260 ocfs2_handle_cleanup_locks(handle->journal, handle);
261
262 kfree(handle);
263
264 mlog_exit_void();
265}
266
267void ocfs2_commit_trans(struct ocfs2_journal_handle *handle)
268{
269 handle_t *jbd_handle;
270 int retval;
271 struct ocfs2_journal *journal = handle->journal;
272
273 mlog_entry_void();
274
275 BUG_ON(!handle);
276
277 if (!(handle->flags & OCFS2_HANDLE_STARTED)) {
278 ocfs2_commit_unstarted_handle(handle);
279 mlog_exit_void();
280 return;
281 }
282
283 /* release inode semaphores we took during this transaction */
284 ocfs2_handle_unlock_inodes(handle);
285
286 /* ocfs2_extend_trans may have had to call journal_restart
287 * which will always commit the transaction, but may return
288 * an error for any number of reasons. If this is the case, we
289 * clear k_handle as it's not valid any more. */
290 if (handle->k_handle) {
291 jbd_handle = handle->k_handle;
292
293 if (handle->flags & OCFS2_HANDLE_SYNC)
294 jbd_handle->h_sync = 1;
295 else
296 jbd_handle->h_sync = 0;
297
298 /* actually stop the transaction. if we've set h_sync,
299 * it'll have been committed when we return */
300 retval = journal_stop(jbd_handle);
301 if (retval < 0) {
302 mlog_errno(retval);
303 mlog(ML_ERROR, "Could not commit transaction\n");
304 BUG();
305 }
306
307 handle->k_handle = NULL; /* it's been freed in journal_stop */
308 }
309
310 ocfs2_handle_cleanup_locks(journal, handle);
311
312 up_read(&journal->j_trans_barrier);
313
314 kfree(handle);
315 mlog_exit_void();
316}
317
318/*
319 * 'nblocks' is what you want to add to the current
320 * transaction. extend_trans will either extend the current handle by
321 * nblocks, or commit it and start a new one with nblocks credits.
322 *
323 * WARNING: This will not release any semaphores or disk locks taken
324 * during the transaction, so make sure they were taken *before*
325 * start_trans or we'll have ordering deadlocks.
326 *
327 * WARNING2: Note that we do *not* drop j_trans_barrier here. This is
328 * good because transaction ids haven't yet been recorded on the
329 * cluster locks associated with this handle.
330 */
331int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
332 int nblocks)
333{
334 int status;
335
336 BUG_ON(!handle);
337 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
338 BUG_ON(!nblocks);
339
340 mlog_entry_void();
341
342 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
343
344 status = journal_extend(handle->k_handle, nblocks);
345 if (status < 0) {
346 mlog_errno(status);
347 goto bail;
348 }
349
350 if (status > 0) {
351 mlog(0, "journal_extend failed, trying journal_restart\n");
352 status = journal_restart(handle->k_handle, nblocks);
353 if (status < 0) {
354 handle->k_handle = NULL;
355 mlog_errno(status);
356 goto bail;
357 }
358 handle->max_buffs = nblocks;
359 } else
360 handle->max_buffs += nblocks;
361
362 status = 0;
363bail:
364
365 mlog_exit(status);
366 return status;
367}
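/* A further caution (standard JBD behaviour, noted here rather than
 * taken from this patch): when journal_extend() fails and we fall
 * back to journal_restart(), the old transaction is committed, so any
 * buffer dirtied under the old handle must be re-declared with
 * ocfs2_journal_access() before it is modified again. */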
368
369int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
370 struct inode *inode,
371 struct buffer_head *bh,
372 int type)
373{
374 int status;
375
376 BUG_ON(!inode);
377 BUG_ON(!handle);
378 BUG_ON(!bh);
379 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
380
381 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n",
382 (unsigned long long)bh->b_blocknr, type,
383 (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
384 "OCFS2_JOURNAL_ACCESS_CREATE" :
385 "OCFS2_JOURNAL_ACCESS_WRITE",
386 bh->b_size);
387
388 /* we can safely remove this assertion after testing. */
389 if (!buffer_uptodate(bh)) {
390 mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
391 mlog(ML_ERROR, "b_blocknr=%llu\n",
392 (unsigned long long)bh->b_blocknr);
393 BUG();
394 }
395
396 /* Set the current transaction information on the inode so
397 * that the locking code knows whether it can drop its locks
398 * on this inode or not. We're protected from the commit
399 * thread updating the current transaction id until
400 * ocfs2_commit_trans() because ocfs2_start_trans() took
401 * j_trans_barrier for us. */
402 ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
403
404 down(&OCFS2_I(inode)->ip_io_sem);
405 switch (type) {
406 case OCFS2_JOURNAL_ACCESS_CREATE:
407 case OCFS2_JOURNAL_ACCESS_WRITE:
408 status = journal_get_write_access(handle->k_handle, bh);
409 break;
410
411 case OCFS2_JOURNAL_ACCESS_UNDO:
412 status = journal_get_undo_access(handle->k_handle, bh);
413 break;
414
415 default:
416 status = -EINVAL;
417 mlog(ML_ERROR, "Unknown access type!\n");
418 }
419 up(&OCFS2_I(inode)->ip_io_sem);
420
421 if (status < 0)
422 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
423 status, type);
424
425 mlog_exit(status);
426 return status;
427}
428
429int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
430 struct buffer_head *bh)
431{
432 int status;
433
434 BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED));
435
436 mlog_entry("(bh->b_blocknr=%llu)\n",
437 (unsigned long long)bh->b_blocknr);
438
439 status = journal_dirty_metadata(handle->k_handle, bh);
440 if (status < 0)
441 mlog(ML_ERROR, "Could not dirty metadata buffer. "
442 "(bh->b_blocknr=%llu)\n",
443 (unsigned long long)bh->b_blocknr);
444
445 mlog_exit(status);
446 return status;
447}
448
449int ocfs2_journal_dirty_data(handle_t *handle,
450 struct buffer_head *bh)
451{
452 int err = journal_dirty_data(handle, bh);
453 if (err)
454 mlog_errno(err);
455 /* TODO: When we can handle it, abort the handle and go RO on
456 * error here. */
457
458 return err;
459}
460
461/* We always assume you're adding a metadata lock at level 'ex' */
462int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
463 struct inode *inode)
464{
465 int status;
466 struct ocfs2_journal_lock *lock;
467
468 BUG_ON(!inode);
469
470 lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS);
471 if (!lock) {
472 status = -ENOMEM;
473 mlog_errno(-ENOMEM);
474 goto bail;
475 }
476
477 if (!igrab(inode))
478 BUG();
479 lock->jl_inode = inode;
480
481 list_add_tail(&(lock->jl_lock_list), &(handle->locks));
482 handle->num_locks++;
483
484 status = 0;
485bail:
486 mlog_exit(status);
487 return status;
488}
489
490static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal,
491 struct ocfs2_journal_handle *handle)
492{
493 struct list_head *p, *n;
494 struct ocfs2_journal_lock *lock;
495 struct inode *inode;
496
497 list_for_each_safe(p, n, &(handle->locks)) {
498 lock = list_entry(p, struct ocfs2_journal_lock,
499 jl_lock_list);
500 list_del(&lock->jl_lock_list);
501 handle->num_locks--;
502
503 inode = lock->jl_inode;
504 ocfs2_meta_unlock(inode, 1);
505 if (atomic_read(&inode->i_count) == 1)
506 mlog(ML_ERROR,
507 "Inode %"MLFu64", I'm doing a last iput for!",
508 OCFS2_I(inode)->ip_blkno);
509 iput(inode);
510 kmem_cache_free(ocfs2_lock_cache, lock);
511 }
512}
513
514#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5)
515
516void ocfs2_set_journal_params(struct ocfs2_super *osb)
517{
518 journal_t *journal = osb->journal->j_journal;
519
520 spin_lock(&journal->j_state_lock);
521 journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL;
522 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
523 journal->j_flags |= JFS_BARRIER;
524 else
525 journal->j_flags &= ~JFS_BARRIER;
526 spin_unlock(&journal->j_state_lock);
527}
528
529int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
530{
531 int status = -1;
532 struct inode *inode = NULL; /* the journal inode */
533 journal_t *j_journal = NULL;
534 struct ocfs2_dinode *di = NULL;
535 struct buffer_head *bh = NULL;
536 struct ocfs2_super *osb;
537 int meta_lock = 0;
538
539 mlog_entry_void();
540
541 BUG_ON(!journal);
542
543 osb = journal->j_osb;
544
545 /* already have the inode for our journal */
546 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
547 osb->slot_num);
548 if (inode == NULL) {
549 status = -EACCES;
550 mlog_errno(status);
551 goto done;
552 }
553 if (is_bad_inode(inode)) {
554 mlog(ML_ERROR, "access error (bad inode)\n");
555 iput(inode);
556 inode = NULL;
557 status = -EACCES;
558 goto done;
559 }
560
561 SET_INODE_JOURNAL(inode);
562 OCFS2_I(inode)->ip_open_count++;
563
564 status = ocfs2_meta_lock(inode, NULL, &bh, 1);
565 if (status < 0) {
566 if (status != -ERESTARTSYS)
567 mlog(ML_ERROR, "Could not get lock on journal!\n");
568 goto done;
569 }
570
571 meta_lock = 1;
572 di = (struct ocfs2_dinode *)bh->b_data;
573
574 if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) {
575 mlog(ML_ERROR, "Journal file size (%lld) is too small!\n",
576 inode->i_size);
577 status = -EINVAL;
578 goto done;
579 }
580
581 mlog(0, "inode->i_size = %lld\n", inode->i_size);
582 mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks);
583 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
584
585 /* call the kernels journal init function now */
586 j_journal = journal_init_inode(inode);
587 if (j_journal == NULL) {
588 mlog(ML_ERROR, "Linux journal layer error\n");
589 status = -EINVAL;
590 goto done;
591 }
592
593 mlog(0, "Returned from journal_init_inode\n");
594 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
595
596 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
597 OCFS2_JOURNAL_DIRTY_FL);
598
599 journal->j_journal = j_journal;
600 journal->j_inode = inode;
601 journal->j_bh = bh;
602
603 ocfs2_set_journal_params(osb);
604
605 journal->j_state = OCFS2_JOURNAL_LOADED;
606
607 status = 0;
608done:
609 if (status < 0) {
610 if (meta_lock)
611 ocfs2_meta_unlock(inode, 1);
612 if (bh != NULL)
613 brelse(bh);
614 if (inode) {
615 OCFS2_I(inode)->ip_open_count--;
616 iput(inode);
617 }
618 }
619
620 mlog_exit(status);
621 return status;
622}
623
624static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
625 int dirty)
626{
627 int status;
628 unsigned int flags;
629 struct ocfs2_journal *journal = osb->journal;
630 struct buffer_head *bh = journal->j_bh;
631 struct ocfs2_dinode *fe;
632
633 mlog_entry_void();
634
635 fe = (struct ocfs2_dinode *)bh->b_data;
636 if (!OCFS2_IS_VALID_DINODE(fe)) {
637 /* This is called from startup/shutdown which will
638 * handle the errors in a specific manner, so no need
639 * to call ocfs2_error() here. */
640 mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid "
641 "signature: %.*s", fe->i_blkno, 7, fe->i_signature);
642 status = -EIO;
643 goto out;
644 }
645
646 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
647 if (dirty)
648 flags |= OCFS2_JOURNAL_DIRTY_FL;
649 else
650 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
651 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
652
653 status = ocfs2_write_block(osb, bh, journal->j_inode);
654 if (status < 0)
655 mlog_errno(status);
656
657out:
658 mlog_exit(status);
659 return status;
660}
661
662/*
663 * If the journal has been kmalloc'd it needs to be freed after this
664 * call.
665 */
666void ocfs2_journal_shutdown(struct ocfs2_super *osb)
667{
668 struct ocfs2_journal *journal = NULL;
669 int status = 0;
670 struct inode *inode = NULL;
671 int num_running_trans = 0;
672
673 mlog_entry_void();
674
675 if (!osb)
676 BUG();
677
678 journal = osb->journal;
679 if (!journal)
680 goto done;
681
682 inode = journal->j_inode;
683
684 if (journal->j_state != OCFS2_JOURNAL_LOADED)
685 goto done;
686
687 /* need to inc inode use count as journal_destroy will iput. */
688 if (!igrab(inode))
689 BUG();
690
691 num_running_trans = atomic_read(&(osb->journal->j_num_trans));
692 if (num_running_trans > 0)
693 mlog(0, "Shutting down journal: must wait on %d "
694 "running transactions!\n",
695 num_running_trans);
696
697 /* Do a commit_cache here. It will flush our journal, *and*
698 * release any locks that are still held.
699 * Set the SHUTDOWN flag and release the trans lock;
700 * the commit thread will take the trans lock for us below. */
701 journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN;
702
703 /* The OCFS2_JOURNAL_IN_SHUTDOWN state will signal commit_cache not
704 * to drop the trans_lock (which we want to hold until we
705 * completely destroy the journal). */
706 if (osb->commit_task) {
707 /* Wait for the commit thread */
708 mlog(0, "Waiting for ocfs2commit to exit....\n");
709 kthread_stop(osb->commit_task);
710 osb->commit_task = NULL;
711 }
712
713 BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
714
715 status = ocfs2_journal_toggle_dirty(osb, 0);
716 if (status < 0)
717 mlog_errno(status);
718
719 /* Shutdown the kernel journal system */
720 journal_destroy(journal->j_journal);
721
722 OCFS2_I(inode)->ip_open_count--;
723
724 /* unlock our journal */
725 ocfs2_meta_unlock(inode, 1);
726
727 brelse(journal->j_bh);
728 journal->j_bh = NULL;
729
730 journal->j_state = OCFS2_JOURNAL_FREE;
731
732// up_write(&journal->j_trans_barrier);
733done:
734 if (inode)
735 iput(inode);
736 mlog_exit_void();
737}
738
739static void ocfs2_clear_journal_error(struct super_block *sb,
740 journal_t *journal,
741 int slot)
742{
743 int olderr;
744
745 olderr = journal_errno(journal);
746 if (olderr) {
747 mlog(ML_ERROR, "File system error %d recorded in "
748 "journal %u.\n", olderr, slot);
749 mlog(ML_ERROR, "File system on device %s needs checking.\n",
750 sb->s_id);
751
752 journal_ack_err(journal);
753 journal_clear_err(journal);
754 }
755}
756
757int ocfs2_journal_load(struct ocfs2_journal *journal)
758{
759 int status = 0;
760 struct ocfs2_super *osb;
761
762 mlog_entry_void();
763
764 if (!journal)
765 BUG();
766
767 osb = journal->j_osb;
768
769 status = journal_load(journal->j_journal);
770 if (status < 0) {
771 mlog(ML_ERROR, "Failed to load journal!\n");
772 goto done;
773 }
774
775 ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
776
777 status = ocfs2_journal_toggle_dirty(osb, 1);
778 if (status < 0) {
779 mlog_errno(status);
780 goto done;
781 }
782
783 /* Launch the commit thread */
784 osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
785 osb->osb_id);
786 if (IS_ERR(osb->commit_task)) {
787 status = PTR_ERR(osb->commit_task);
788 osb->commit_task = NULL;
789 mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d",
790 status);
791 goto done;
792 }
793
794done:
795 mlog_exit(status);
796 return status;
797}
798
799
800/* 'full' flag tells us whether we clear out all blocks or if we just
801 * mark the journal clean */
802int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
803{
804 int status;
805
806 mlog_entry_void();
807
808 if (!journal)
809 BUG();
810
811 status = journal_wipe(journal->j_journal, full);
812 if (status < 0) {
813 mlog_errno(status);
814 goto bail;
815 }
816
817 status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
818 if (status < 0)
819 mlog_errno(status);
820
821bail:
822 mlog_exit(status);
823 return status;
824}
825
826/*
827 * JBD might read a cached version of another node's journal file. We
828 * don't want this, as this file changes often and we get no
829 * notification of those changes. The only way to be sure we've got
830 * the most up-to-date version of those blocks is to force-read them
831 * off disk. Just searching through the buffer cache won't
832 * work as there may be pages backing this file which are still marked
833 * up to date. We know things can't change on this file underneath us
834 * as we have the lock by now :)
835 */
836static int ocfs2_force_read_journal(struct inode *inode)
837{
838 int status = 0;
839 int i, p_blocks;
840 u64 v_blkno, p_blkno;
841#define CONCURRENT_JOURNAL_FILL 32
842 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
843
844 mlog_entry_void();
845
846 BUG_ON(inode->i_blocks !=
847 ocfs2_align_bytes_to_sectors(i_size_read(inode)));
848
849 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
850
851 mlog(0, "Force reading %lu blocks\n",
852 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9)));
853
854 v_blkno = 0;
855 while (v_blkno <
856 (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
857
858 status = ocfs2_extent_map_get_blocks(inode, v_blkno,
859 1, &p_blkno,
860 &p_blocks);
861 if (status < 0) {
862 mlog_errno(status);
863 goto bail;
864 }
865
866 if (p_blocks > CONCURRENT_JOURNAL_FILL)
867 p_blocks = CONCURRENT_JOURNAL_FILL;
868
869 status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
870 p_blkno, p_blocks, bhs, 0,
871 inode);
872 if (status < 0) {
873 mlog_errno(status);
874 goto bail;
875 }
876
877 for(i = 0; i < p_blocks; i++) {
878 brelse(bhs[i]);
879 bhs[i] = NULL;
880 }
881
882 v_blkno += p_blocks;
883 }
884
885bail:
886 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
887 if (bhs[i])
888 brelse(bhs[i]);
889 mlog_exit(status);
890 return status;
891}
892
893struct ocfs2_la_recovery_item {
894 struct list_head lri_list;
895 int lri_slot;
896 struct ocfs2_dinode *lri_la_dinode;
897 struct ocfs2_dinode *lri_tl_dinode;
898};
899
900/* Does the second half of the recovery process. By this point, the
901 * node is marked clean and can actually be considered recovered,
902 * hence it's no longer in the recovery map, but there's still some
903 * cleanup we can do which shouldn't happen within the recovery thread
904 * as locking in that context becomes very difficult if we are to take
905 * recovering nodes into account.
906 *
907 * NOTE: This function can and will sleep on recovery of other nodes
908 * during cluster locking, just like any other ocfs2 process.
909 */
910void ocfs2_complete_recovery(void *data)
911{
912 int ret;
913 struct ocfs2_super *osb = data;
914 struct ocfs2_journal *journal = osb->journal;
915 struct ocfs2_dinode *la_dinode, *tl_dinode;
916 struct ocfs2_la_recovery_item *item;
917 struct list_head *p, *n;
918 LIST_HEAD(tmp_la_list);
919
920 mlog_entry_void();
921
922 mlog(0, "completing recovery from keventd\n");
923
924 spin_lock(&journal->j_lock);
925 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
926 spin_unlock(&journal->j_lock);
927
928 list_for_each_safe(p, n, &tmp_la_list) {
929 item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
930 list_del_init(&item->lri_list);
931
932 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
933
934 la_dinode = item->lri_la_dinode;
935 if (la_dinode) {
936 mlog(0, "Clean up local alloc %"MLFu64"\n",
937 la_dinode->i_blkno);
938
939 ret = ocfs2_complete_local_alloc_recovery(osb,
940 la_dinode);
941 if (ret < 0)
942 mlog_errno(ret);
943
944 kfree(la_dinode);
945 }
946
947 tl_dinode = item->lri_tl_dinode;
948 if (tl_dinode) {
949 mlog(0, "Clean up truncate log %"MLFu64"\n",
950 tl_dinode->i_blkno);
951
952 ret = ocfs2_complete_truncate_log_recovery(osb,
953 tl_dinode);
954 if (ret < 0)
955 mlog_errno(ret);
956
957 kfree(tl_dinode);
958 }
959
960 ret = ocfs2_recover_orphans(osb, item->lri_slot);
961 if (ret < 0)
962 mlog_errno(ret);
963
964 kfree(item);
965 }
966
967 mlog(0, "Recovery completion\n");
968 mlog_exit_void();
969}
970
971/* NOTE: This function always eats your references to la_dinode and
972 * tl_dinode, either manually on error, or by passing them to
973 * ocfs2_complete_recovery. */
974static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
975 int slot_num,
976 struct ocfs2_dinode *la_dinode,
977 struct ocfs2_dinode *tl_dinode)
978{
979 struct ocfs2_la_recovery_item *item;
980
981 item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL);
982 if (!item) {
983 /* Though we wish to avoid it, we are in fact safe in
984 * skipping local alloc cleanup as fsck.ocfs2 is more
985 * than capable of reclaiming unused space. */
986 if (la_dinode)
987 kfree(la_dinode);
988
989 if (tl_dinode)
990 kfree(tl_dinode);
991
992 mlog_errno(-ENOMEM);
993 return;
994 }
995
996 INIT_LIST_HEAD(&item->lri_list);
997 item->lri_la_dinode = la_dinode;
998 item->lri_slot = slot_num;
999 item->lri_tl_dinode = tl_dinode;
1000
1001 spin_lock(&journal->j_lock);
1002 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
1003 queue_work(ocfs2_wq, &journal->j_recovery_work);
1004 spin_unlock(&journal->j_lock);
1005}
1006
1007/* Called by the mount code to queue up the last part of recovery
1008 * for its own slot. */
1009void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1010{
1011 struct ocfs2_journal *journal = osb->journal;
1012
1013 if (osb->dirty) {
1014 /* No need to queue up our truncate_log as regular
1015 * cleanup will catch that. */
1016 ocfs2_queue_recovery_completion(journal,
1017 osb->slot_num,
1018 osb->local_alloc_copy,
1019 NULL);
1020 ocfs2_schedule_truncate_log_flush(osb, 0);
1021
1022 osb->local_alloc_copy = NULL;
1023 osb->dirty = 0;
1024 }
1025}
1026
1027static int __ocfs2_recovery_thread(void *arg)
1028{
1029 int status, node_num;
1030 struct ocfs2_super *osb = arg;
1031
1032 mlog_entry_void();
1033
1034 status = ocfs2_wait_on_mount(osb);
1035 if (status < 0) {
1036 goto bail;
1037 }
1038
1039restart:
1040 status = ocfs2_super_lock(osb, 1);
1041 if (status < 0) {
1042 mlog_errno(status);
1043 goto bail;
1044 }
1045
1046 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1047 node_num = ocfs2_node_map_first_set_bit(osb,
1048 &osb->recovery_map);
1049 if (node_num == O2NM_INVALID_NODE_NUM) {
1050 mlog(0, "Out of nodes to recover.\n");
1051 break;
1052 }
1053
1054 status = ocfs2_recover_node(osb, node_num);
1055 if (status < 0) {
1056 mlog(ML_ERROR,
1057 "Error %d recovering node %d on device (%u,%u)!\n",
1058 status, node_num,
1059 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1060 mlog(ML_ERROR, "Volume requires unmount.\n");
1061 continue;
1062 }
1063
1064 ocfs2_recovery_map_clear(osb, node_num);
1065 }
1066 ocfs2_super_unlock(osb, 1);
1067
1068 /* We always run recovery on our own orphan dir - the dead
1069 * node(s) may have voted "no" on an inode delete earlier. A
1070 * revote is therefore required. */
1071 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1072 NULL);
1073
1074bail:
1075 down(&osb->recovery_lock);
1076 if (!status &&
1077 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
1078 up(&osb->recovery_lock);
1079 goto restart;
1080 }
1081
1082 osb->recovery_thread_task = NULL;
1083 mb(); /* sync with ocfs2_recovery_thread_running */
1084 wake_up(&osb->recovery_event);
1085
1086 up(&osb->recovery_lock);
1087
1088 mlog_exit(status);
1089 /* no one is calling kthread_stop() for us so the kthread() api
1090 * requires that we call do_exit(). And it isn't exported, but
1091 * complete_and_exit() seems to be a minimal wrapper around it. */
1092 complete_and_exit(NULL, status);
1093 return status;
1094}
1095
1096void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1097{
1098 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1099 node_num, osb->node_num);
1100
1101 down(&osb->recovery_lock);
1102 if (osb->disable_recovery)
1103 goto out;
1104
1105 /* People waiting on recovery will wait on
1106 * the recovery map to empty. */
1107 if (!ocfs2_recovery_map_set(osb, node_num))
1108 mlog(0, "node %d already be in recovery.\n", node_num);
1109
1110 mlog(0, "starting recovery thread...\n");
1111
1112 if (osb->recovery_thread_task)
1113 goto out;
1114
1115 osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
1116 "ocfs2rec-%d", osb->osb_id);
1117 if (IS_ERR(osb->recovery_thread_task)) {
1118 mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
1119 osb->recovery_thread_task = NULL;
1120 }
1121
1122out:
1123 up(&osb->recovery_lock);
1124 wake_up(&osb->recovery_event);
1125
1126 mlog_exit_void();
1127}
1128
1129/* Does the actual journal replay and marks the journal inode as
1130 * clean. Will only replay if the journal inode is marked dirty. */
1131static int ocfs2_replay_journal(struct ocfs2_super *osb,
1132 int node_num,
1133 int slot_num)
1134{
1135 int status;
1136 int got_lock = 0;
1137 unsigned int flags;
1138 struct inode *inode = NULL;
1139 struct ocfs2_dinode *fe;
1140 journal_t *journal = NULL;
1141 struct buffer_head *bh = NULL;
1142
1143 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1144 slot_num);
1145 if (inode == NULL) {
1146 status = -EACCES;
1147 mlog_errno(status);
1148 goto done;
1149 }
1150 if (is_bad_inode(inode)) {
1151 status = -EACCES;
1152 iput(inode);
1153 inode = NULL;
1154 mlog_errno(status);
1155 goto done;
1156 }
1157 SET_INODE_JOURNAL(inode);
1158
1159 status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
1160 OCFS2_META_LOCK_RECOVERY);
1161 if (status < 0) {
1162 mlog(0, "status returned from ocfs2_meta_lock=%d\n", status);
1163 if (status != -ERESTARTSYS)
1164 mlog(ML_ERROR, "Could not lock journal!\n");
1165 goto done;
1166 }
1167 got_lock = 1;
1168
1169 fe = (struct ocfs2_dinode *) bh->b_data;
1170
1171 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1172
1173 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
1174 mlog(0, "No recovery required for node %d\n", node_num);
1175 goto done;
1176 }
1177
1178 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
1179 node_num, slot_num,
1180 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1181
1182 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1183
1184 status = ocfs2_force_read_journal(inode);
1185 if (status < 0) {
1186 mlog_errno(status);
1187 goto done;
1188 }
1189
1190 mlog(0, "calling journal_init_inode\n");
1191 journal = journal_init_inode(inode);
1192 if (journal == NULL) {
1193 mlog(ML_ERROR, "Linux journal layer error\n");
1194 status = -EIO;
1195 goto done;
1196 }
1197
1198 status = journal_load(journal);
1199 if (status < 0) {
1200 mlog_errno(status);
1201 if (!igrab(inode))
1202 BUG();
1203 journal_destroy(journal);
1204 goto done;
1205 }
1206
1207 ocfs2_clear_journal_error(osb->sb, journal, slot_num);
1208
1209 /* wipe the journal */
1210 mlog(0, "flushing the journal.\n");
1211 journal_lock_updates(journal);
1212 status = journal_flush(journal);
1213 journal_unlock_updates(journal);
1214 if (status < 0)
1215 mlog_errno(status);
1216
1217 /* This will mark the node clean */
1218 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
1219 flags &= ~OCFS2_JOURNAL_DIRTY_FL;
1220 fe->id1.journal1.ij_flags = cpu_to_le32(flags);
1221
1222 status = ocfs2_write_block(osb, bh, inode);
1223 if (status < 0)
1224 mlog_errno(status);
1225
1226 if (!igrab(inode))
1227 BUG();
1228
1229 journal_destroy(journal);
1230
1231done:
1232 /* drop the lock on this node's journal */
1233 if (got_lock)
1234 ocfs2_meta_unlock(inode, 1);
1235
1236 if (inode)
1237 iput(inode);
1238
1239 if (bh)
1240 brelse(bh);
1241
1242 mlog_exit(status);
1243 return status;
1244}
1245
1246/*
1247 * Do the most important parts of node recovery:
1248 * - Replay its journal
1249 * - Stamp a clean local allocator file
1250 * - Stamp a clean truncate log
1251 * - Mark the node clean
1252 *
1253 * If this function completes without error, a node in OCFS2 can be
1254 * said to have been safely recovered. As a result, failure during the
1255 * second part of a node's recovery process (local alloc recovery) is
1256 * far less concerning.
1257 */
1258static int ocfs2_recover_node(struct ocfs2_super *osb,
1259 int node_num)
1260{
1261 int status = 0;
1262 int slot_num;
1263 struct ocfs2_slot_info *si = osb->slot_info;
1264 struct ocfs2_dinode *la_copy = NULL;
1265 struct ocfs2_dinode *tl_copy = NULL;
1266
1267 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1268 node_num, osb->node_num);
1269
1270 mlog(0, "checking node %d\n", node_num);
1271
1272 /* Should not ever be called to recover ourselves -- in that
1273 * case we should've called ocfs2_journal_load instead. */
1274 if (osb->node_num == node_num)
1275 BUG();
1276
1277 slot_num = ocfs2_node_num_to_slot(si, node_num);
1278 if (slot_num == OCFS2_INVALID_SLOT) {
1279 status = 0;
1280 mlog(0, "no slot for this node, so no recovery required.\n");
1281 goto done;
1282 }
1283
1284 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1285
1286 status = ocfs2_replay_journal(osb, node_num, slot_num);
1287 if (status < 0) {
1288 mlog_errno(status);
1289 goto done;
1290 }
1291
1292 /* Stamp a clean local alloc file AFTER recovering the journal... */
1293 status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy);
1294 if (status < 0) {
1295 mlog_errno(status);
1296 goto done;
1297 }
1298
1299 /* An error from begin_truncate_log_recovery is not
1300 * serious enough to warrant halting the rest of
1301 * recovery. */
1302 status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy);
1303 if (status < 0)
1304 mlog_errno(status);
1305
1306 /* Likewise, this would be a strange but ultimately not so
1307 * harmful place to get an error... */
1308 ocfs2_clear_slot(si, slot_num);
1309 status = ocfs2_update_disk_slots(osb, si);
1310 if (status < 0)
1311 mlog_errno(status);
1312
1313 /* This will kfree the memory pointed to by la_copy and tl_copy */
1314 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1315 tl_copy);
1316
1317 status = 0;
1318done:
1319
1320 mlog_exit(status);
1321 return status;
1322}
1323
1324/* Test node liveness by trylocking its journal. If we get the lock,
1325 * we drop it here. Return 0 if we got the lock, -EAGAIN if the node is
1326 * still alive (we couldn't get the lock) and < 0 on error. */
1327static int ocfs2_trylock_journal(struct ocfs2_super *osb,
1328 int slot_num)
1329{
1330 int status, flags;
1331 struct inode *inode = NULL;
1332
1333 inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
1334 slot_num);
1335 if (inode == NULL) {
1336 mlog(ML_ERROR, "access error\n");
1337 status = -EACCES;
1338 goto bail;
1339 }
1340 if (is_bad_inode(inode)) {
1341 mlog(ML_ERROR, "access error (bad inode)\n");
1342 iput(inode);
1343 inode = NULL;
1344 status = -EACCES;
1345 goto bail;
1346 }
1347 SET_INODE_JOURNAL(inode);
1348
1349 flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE;
1350 status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags);
1351 if (status < 0) {
1352 if (status != -EAGAIN)
1353 mlog_errno(status);
1354 goto bail;
1355 }
1356
1357 ocfs2_meta_unlock(inode, 1);
1358bail:
1359 if (inode)
1360 iput(inode);
1361
1362 return status;
1363}
1364
1365/* Call this underneath ocfs2_super_lock. It also assumes that the
1366 * slot info struct has been updated from disk. */
1367int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1368{
1369 int status, i, node_num;
1370 struct ocfs2_slot_info *si = osb->slot_info;
1371
1372 /* This is called with the super block cluster lock, so we
1373 * know that the slot map can't change underneath us. */
1374
1375 spin_lock(&si->si_lock);
1376 for(i = 0; i < si->si_num_slots; i++) {
1377 if (i == osb->slot_num)
1378 continue;
1379 if (ocfs2_is_empty_slot(si, i))
1380 continue;
1381
1382 node_num = si->si_global_node_nums[i];
1383 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1384 continue;
1385 spin_unlock(&si->si_lock);
1386
1387 /* Ok, we have a slot occupied by another node which
1388 * is not in the recovery map. We trylock its journal
1389 * file here to test if it's alive. */
1390 status = ocfs2_trylock_journal(osb, i);
1391 if (!status) {
1392 /* Since we're called from mount, we know that
1393 * the recovery thread can't race us on
1394 * setting / checking the recovery bits. */
1395 ocfs2_recovery_thread(osb, node_num);
1396 } else if ((status < 0) && (status != -EAGAIN)) {
1397 mlog_errno(status);
1398 goto bail;
1399 }
1400
1401 spin_lock(&si->si_lock);
1402 }
1403 spin_unlock(&si->si_lock);
1404
1405 status = 0;
1406bail:
1407 mlog_exit(status);
1408 return status;
1409}
1410
1411static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1412 int slot)
1413{
1414 int status = 0;
1415 int have_disk_lock = 0;
1416 struct inode *inode = NULL;
1417 struct inode *iter;
1418 struct inode *orphan_dir_inode = NULL;
1419 unsigned long offset, blk, local;
1420 struct buffer_head *bh = NULL;
1421 struct ocfs2_dir_entry *de;
1422 struct super_block *sb = osb->sb;
1423 struct ocfs2_inode_info *oi;
1424
1425 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
1426
1427 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
1428 ORPHAN_DIR_SYSTEM_INODE,
1429 slot);
1430 if (!orphan_dir_inode) {
1431 status = -ENOENT;
1432 mlog_errno(status);
1433 goto out;
1434 }
1435
1436 down(&orphan_dir_inode->i_sem);
1437 status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
1438 if (status < 0) {
1439 up(&orphan_dir_inode->i_sem);
1440 mlog_errno(status);
1441 goto out;
1442 }
1443 have_disk_lock = 1;
1444
1445 offset = 0;
1446 iter = NULL;
1447 while(offset < i_size_read(orphan_dir_inode)) {
1448 blk = offset >> sb->s_blocksize_bits;
1449
1450 bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
1451 if (!bh)
1452 status = -EINVAL;
1453 if (status < 0) {
1454 up(&orphan_dir_inode->i_sem);
1455 if (bh)
1456 brelse(bh);
1457 mlog_errno(status);
1458 goto out;
1459 }
1460
1461 local = 0;
1462 while(offset < i_size_read(orphan_dir_inode)
1463 && local < sb->s_blocksize) {
1464 de = (struct ocfs2_dir_entry *) (bh->b_data + local);
1465
1466 if (!ocfs2_check_dir_entry(orphan_dir_inode,
1467 de, bh, local)) {
1468 up(&orphan_dir_inode->i_sem);
1469 status = -EINVAL;
1470 mlog_errno(status);
1471 brelse(bh);
1472 goto out;
1473 }
1474
1475 local += le16_to_cpu(de->rec_len);
1476 offset += le16_to_cpu(de->rec_len);
1477
1478 /* I guess we silently fail on no inode? */
1479 if (!le64_to_cpu(de->inode))
1480 continue;
1481 if (de->file_type > OCFS2_FT_MAX) {
1482 mlog(ML_ERROR,
1483 "block %llu contains invalid de: "
1484 "inode = %"MLFu64", rec_len = %u, "
1485 "name_len = %u, file_type = %u, "
1486 "name='%.*s'\n",
1487 (unsigned long long)bh->b_blocknr,
1488 le64_to_cpu(de->inode),
1489 le16_to_cpu(de->rec_len),
1490 de->name_len,
1491 de->file_type,
1492 de->name_len,
1493 de->name);
1494 continue;
1495 }
1496 if (de->name_len == 1 && !strncmp(".", de->name, 1))
1497 continue;
1498 if (de->name_len == 2 && !strncmp("..", de->name, 2))
1499 continue;
1500
1501 iter = ocfs2_iget(osb, le64_to_cpu(de->inode));
1502 if (IS_ERR(iter))
1503 continue;
1504
1505 mlog(0, "queue orphan %"MLFu64"\n",
1506 OCFS2_I(iter)->ip_blkno);
1507 OCFS2_I(iter)->ip_next_orphan = inode;
1508 inode = iter;
1509 }
1510 brelse(bh);
1511 }
1512 up(&orphan_dir_inode->i_sem);
1513
1514 ocfs2_meta_unlock(orphan_dir_inode, 0);
1515 have_disk_lock = 0;
1516
1517 iput(orphan_dir_inode);
1518 orphan_dir_inode = NULL;
1519
1520 while (inode) {
1521 oi = OCFS2_I(inode);
1522 mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno);
1523
1524 iter = oi->ip_next_orphan;
1525
1526 spin_lock(&oi->ip_lock);
1527 /* Delete voting may have set these on the assumption
1528 * that the other node would wipe them successfully.
1529 * If they are still in the node's orphan dir, we need
1530 * to reset that state. */
1531 oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
1532
1533 /* Set the proper information to get us going into
1534 * ocfs2_delete_inode. */
1535 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
1536 oi->ip_orphaned_slot = slot;
1537 spin_unlock(&oi->ip_lock);
1538
1539 iput(inode);
1540
1541 inode = iter;
1542 }
1543
1544out:
1545 if (have_disk_lock)
1546 ocfs2_meta_unlock(orphan_dir_inode, 0);
1547
1548 if (orphan_dir_inode)
1549 iput(orphan_dir_inode);
1550
1551 return status;
1552}
1553
1554static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
1555{
1556 /* This check is good because ocfs2 will wait on our recovery
1557 * thread before changing the volume state to something other than
1558 * MOUNTED or DISABLED. */
1559 wait_event(osb->osb_mount_event,
1560 atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
1561 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1562
1563 /* If there's an error on mount, then we may never get to the
1564 * MOUNTED flag, but DISABLED is set right before
1565 * dismount_volume(), so we can trust it. */
1566 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
1567 mlog(0, "mount error, exiting!\n");
1568 return -EBUSY;
1569 }
1570
1571 return 0;
1572}
1573
1574static int ocfs2_commit_thread(void *arg)
1575{
1576 int status;
1577 struct ocfs2_super *osb = arg;
1578 struct ocfs2_journal *journal = osb->journal;
1579
1580 /* we can trust j_num_trans here because _should_stop() is only set in
1581 * shutdown and nobody other than ourselves should be able to start
1582 * transactions. Committing on shutdown might take a few iterations
1583 * as final transactions put deleted inodes on the list. */
1584 while (!(kthread_should_stop() &&
1585 atomic_read(&journal->j_num_trans) == 0)) {
1586
1587 wait_event_interruptible_timeout(osb->checkpoint_event,
1588 atomic_read(&journal->j_num_trans)
1589 || kthread_should_stop(),
1590 OCFS2_CHECKPOINT_INTERVAL);
1591
1592 status = ocfs2_commit_cache(osb);
1593 if (status < 0)
1594 mlog_errno(status);
1595
1596 if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
1597 mlog(ML_KTHREAD,
1598 "commit_thread: %u transactions pending on "
1599 "shutdown\n",
1600 atomic_read(&journal->j_num_trans));
1601 }
1602 }
1603
1604 return 0;
1605}
1606
1607/* Look for a dirty journal without taking any cluster locks. Used for
1608 * hard readonly access to determine whether the file system journals
1609 * require recovery. */
1610int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
1611{
1612 int ret = 0;
1613 unsigned int slot;
1614 struct buffer_head *di_bh;
1615 struct ocfs2_dinode *di;
1616 struct inode *journal = NULL;
1617
1618 for(slot = 0; slot < osb->max_slots; slot++) {
1619 journal = ocfs2_get_system_file_inode(osb,
1620 JOURNAL_SYSTEM_INODE,
1621 slot);
1622 if (!journal || is_bad_inode(journal)) {
1623 ret = -EACCES;
1624 mlog_errno(ret);
1625 goto out;
1626 }
1627
1628 di_bh = NULL;
1629 ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
1630 0, journal);
1631 if (ret < 0) {
1632 mlog_errno(ret);
1633 goto out;
1634 }
1635
1636 di = (struct ocfs2_dinode *) di_bh->b_data;
1637
1638 if (le32_to_cpu(di->id1.journal1.ij_flags) &
1639 OCFS2_JOURNAL_DIRTY_FL)
1640 ret = -EROFS;
1641
1642 brelse(di_bh);
1643 if (ret)
1644 break;
1645 }
1646
1647out:
1648 if (journal)
1649 iput(journal);
1650
1651 return ret;
1652}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
new file mode 100644
index 000000000000..7d0a816184fa
--- /dev/null
+++ b/fs/ocfs2/journal.h
@@ -0,0 +1,457 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * journal.h
5 *
6 * Defines journalling api and structures.
7 *
8 * Copyright (C) 2003, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_JOURNAL_H
27#define OCFS2_JOURNAL_H
28
29#include <linux/fs.h>
30#include <linux/jbd.h>
31
32#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ)
33
34enum ocfs2_journal_state {
35 OCFS2_JOURNAL_FREE = 0,
36 OCFS2_JOURNAL_LOADED,
37 OCFS2_JOURNAL_IN_SHUTDOWN,
38};
39
40struct ocfs2_super;
41struct ocfs2_dinode;
42struct ocfs2_journal_handle;
43
44struct ocfs2_journal {
45 enum ocfs2_journal_state j_state; /* Journal's current state */
46
47 journal_t *j_journal; /* The kernel's journal type */
48 struct inode *j_inode; /* Kernel inode pointing to
49 * this journal */
50 struct ocfs2_super *j_osb; /* pointer to the super
51 * block for the node
52 * we're currently
53 * running on -- not
54 * necessarily the super
55 * block from the node
56 * which we usually run
57 * from (recovery,
58 * etc) */
59 struct buffer_head *j_bh; /* Journal disk inode block */
60 atomic_t j_num_trans; /* Number of transactions
61 * currently in the system. */
62 unsigned long j_trans_id;
63 struct rw_semaphore j_trans_barrier;
64 wait_queue_head_t j_checkpointed;
65
66 spinlock_t j_lock;
67 struct list_head j_la_cleanups;
68 struct work_struct j_recovery_work;
69};
70
71extern spinlock_t trans_inc_lock;
72
73/* wrap j_trans_id so we never have it equal to zero. */
74static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
75{
76 unsigned long old_id;
77 spin_lock(&trans_inc_lock);
78 old_id = j->j_trans_id++;
79 if (unlikely(!j->j_trans_id))
80 j->j_trans_id = 1;
81 spin_unlock(&trans_inc_lock);
82 return old_id;
83}
84
85static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
86 struct inode *inode)
87{
88 spin_lock(&trans_inc_lock);
89 OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
90 spin_unlock(&trans_inc_lock);
91}
92
93/* Used to figure out whether it's safe to drop a metadata lock on an
94 * inode. Returns true if all the inode's changes have been
95 * checkpointed to disk. You should be holding the spinlock on the
96 * metadata lock while calling this to be sure that nobody can take
97 * the lock and put it on another transaction. */
98static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
99{
100 int ret;
101 struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
102
103 spin_lock(&trans_inc_lock);
104 ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
105 spin_unlock(&trans_inc_lock);
106 return ret;
107}
108
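/*
 * Note: time_after() is the same wrap-safe comparison used for jiffies,
 * which is why ocfs2_inc_trans_id above only needs to keep j_trans_id
 * from ever being zero -- transaction ids may wrap and still compare
 * correctly so long as the two ids are less than half the counter
 * range apart.
 */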
109/* convenience function to check if an inode is still new (has never
110 * hit disk). Will do you a favor and set created_trans = 0 when you've
111 * been checkpointed. Returns '1' if the inode is still new. */
112static inline int ocfs2_inode_is_new(struct inode *inode)
113{
114 int ret;
115
116 /* System files are never "new" as they're written out by
117 * mkfs. This helps us early during mount, before we have the
118 * journal open and j_trans_id could be junk. */
119 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
120 return 0;
121 spin_lock(&trans_inc_lock);
122 ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
123 OCFS2_I(inode)->ip_created_trans));
124 if (!ret)
125 OCFS2_I(inode)->ip_created_trans = 0;
126 spin_unlock(&trans_inc_lock);
127 return ret;
128}
129
130static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
131 struct inode *inode)
132{
133 spin_lock(&trans_inc_lock);
134 OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
135 spin_unlock(&trans_inc_lock);
136}
137
138extern kmem_cache_t *ocfs2_lock_cache;
139
140struct ocfs2_journal_lock {
141 struct inode *jl_inode;
142 struct list_head jl_lock_list;
143};
144
145struct ocfs2_journal_handle {
146 handle_t *k_handle; /* kernel handle. */
147 struct ocfs2_journal *journal;
148 u32 flags; /* see flags below. */
149 int max_buffs; /* Buffs reserved by this handle */
150
151 /* The following two fields are for ocfs2_handle_add_lock */
152 int num_locks;
153 struct list_head locks; /* A bunch of locks to
154 * release on commit. This
155 * should be a list_head */
156
157 struct list_head inode_list;
158};
159
160#define OCFS2_HANDLE_STARTED 1
161/* should we sync-commit this handle? */
162#define OCFS2_HANDLE_SYNC 2
163static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
164{
165 return handle->flags & OCFS2_HANDLE_STARTED;
166}
167
168static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
169{
170 if (sync)
171 handle->flags |= OCFS2_HANDLE_SYNC;
172 else
173 handle->flags &= ~OCFS2_HANDLE_SYNC;
174}
175
176/* Exported only for the journal struct init code in super.c. Do not call. */
177void ocfs2_complete_recovery(void *data);
178
179/*
180 * Journal Control:
181 * Initialize, Load, Shutdown, Wipe a journal.
182 *
183 * ocfs2_journal_init - Initialize journal structures in the OSB.
184 * ocfs2_journal_load - Load the given journal off disk. Replay it if
185 * there are transactions still in there.
186 * ocfs2_journal_shutdown - Shut down a journal; this will flush all
187 * uncommitted, uncheckpointed transactions.
188 * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally
189 * zero out each block.
190 * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb.
191 * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
192 * event on.
193 * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
194 */
195void ocfs2_set_journal_params(struct ocfs2_super *osb);
196int ocfs2_journal_init(struct ocfs2_journal *journal,
197 int *dirty);
198void ocfs2_journal_shutdown(struct ocfs2_super *osb);
199int ocfs2_journal_wipe(struct ocfs2_journal *journal,
200 int full);
201int ocfs2_journal_load(struct ocfs2_journal *journal);
202int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
203void ocfs2_recovery_thread(struct ocfs2_super *osb,
204 int node_num);
205int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
206void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
207
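/*
 * A rough sketch of how these fit together over a mount/unmount cycle,
 * pieced together from this header and journal.c (the exact call sites
 * are assumed to live in super.c; error handling omitted):
 *
 *	ocfs2_journal_init(osb->journal, &dirty);  // set up structures
 *	ocfs2_journal_load(osb->journal);          // replay if needed,
 *	                                           // start ocfs2cmt
 *	...normal operation...
 *	ocfs2_journal_shutdown(osb);               // flush, stop ocfs2cmt,
 *	                                           // mark clean
 */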
208static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
209{
210 atomic_set(&osb->needs_checkpoint, 1);
211 wake_up(&osb->checkpoint_event);
212}
213
214static inline void ocfs2_checkpoint_inode(struct inode *inode)
215{
216 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
217
218 if (!ocfs2_inode_fully_checkpointed(inode)) {
219 /* WARNING: This only kicks off a single
220 * checkpoint. If someone races you and adds more
221 * metadata to the journal, you won't know, and will
222 * wind up waiting *a lot* longer than necessary. Right
223 * now we only use this in clear_inode so that's
224 * OK. */
225 ocfs2_start_checkpoint(osb);
226
227 wait_event(osb->journal->j_checkpointed,
228 ocfs2_inode_fully_checkpointed(inode));
229 }
230}
231
232/*
233 * Transaction Handling:
234 * Manage the lifetime of a transaction handle.
235 *
236 * ocfs2_alloc_handle - Only allocate a handle so we can start putting
237 * cluster locks on it. To actually change blocks,
238 * call ocfs2_start_trans with the handle returned
239 * from this function. You may call ocfs2_commit_trans
240 * at any time in the lifetime of a handle.
241 * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
242 * the number of blocks that will be changed during
243 * this handle.
244 * ocfs2_commit_trans - Complete a handle.
245 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
246 * commit the handle to disk in the process, but will
247 * not release any locks taken during the transaction.
248 * ocfs2_journal_access - Notify the handle that we want to journal this
249 * buffer. Will have to call ocfs2_journal_dirty once
250 * we've actually dirtied it. Type is one of the OCFS2_JOURNAL_ACCESS_* flags.
251 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
252 * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
253 * the current handle commits.
254 * ocfs2_handle_add_lock - Sometimes we need to delay lock release
255 * until after a transaction has been completed. Use
256 * ocfs2_handle_add_lock to indicate that a lock needs
257 * to be released at the end of that handle. Locks
258 * will be released in the order that they are added.
259 * ocfs2_handle_add_inode - Add a locked inode to a transaction.
260 */
261
262/* You must always start_trans with a number of buffs > 0, but it's
263 * perfectly legal to go through an entire transaction without having
264 * dirtied any buffers. */
265struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
266struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
267 struct ocfs2_journal_handle *handle,
268 int max_buffs);
269void ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
270int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
271 int nblocks);
272
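/*
 * A minimal sketch of a transaction, assuming the prototypes above and
 * their use in localalloc.c; 'inode' and 'bh' are illustrative names
 * and all error checking is omitted:
 *
 *	handle = ocfs2_alloc_handle(osb);
 *	...take cluster locks, ocfs2_handle_add_inode()/_add_lock()...
 *	handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS);
 *	ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE);
 *	...modify bh->b_data...
 *	ocfs2_journal_dirty(handle, bh);
 *	ocfs2_commit_trans(handle);
 */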
273/*
274 * Create access is for when we get a newly created buffer and we're
275 * not gonna read it off disk, but rather fill it ourselves. Right
276 * now, we don't do anything special with this (it turns into a write
277 * request), but this is a good placeholder in case we do...
278 *
279 * Write access is for when we read a block off disk and are going to
280 * modify it. This way the journalling layer knows it may need to make
281 * a copy of that block (if it's part of another, uncommitted
282 * transaction) before we do so.
283 */
284#define OCFS2_JOURNAL_ACCESS_CREATE 0
285#define OCFS2_JOURNAL_ACCESS_WRITE 1
286#define OCFS2_JOURNAL_ACCESS_UNDO 2
287
288int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
289 struct inode *inode,
290 struct buffer_head *bh,
291 int type);
292/*
293 * A word about the journal_access/journal_dirty "dance". It is
294 * entirely legal to journal_access a buffer more than once (as long
295 * as the access type is the same -- I'm not sure what will happen if
296 * access type is different, but this should never happen anyway). It is
297 * also legal to journal_dirty a buffer more than once. In fact, you
298 * can even journal_access a buffer after you've done a
299 * journal_access/journal_dirty pair. The only thing you cannot do
300 * however, is journal_dirty a buffer which you haven't yet passed to
301 * journal_access at least once.
302 *
303 * That said, 99% of the time this doesn't matter and this is what the
304 * path looks like:
305 *
306 * <read a bh>
307 * ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE);
308 * <modify the bh>
309 * ocfs2_journal_dirty(handle, bh);
310 */
311int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
312 struct buffer_head *bh);
313int ocfs2_journal_dirty_data(handle_t *handle,
314 struct buffer_head *bh);
315int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
316 struct inode *inode);
317/*
318 * Use this to protect from other processes reading buffer state while
319 * it's in flight.
320 */
321void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
322 struct inode *inode);
323
324/*
325 * Credit Macros:
326 * Convenience macros to calculate number of credits needed.
327 *
328 * For convenience's sake, I have a set of macros here which calculate
329 * the *maximum* number of sectors which will be changed for various
330 * metadata updates.
331 */
332
333/* simple file updates like chmod, etc. */
334#define OCFS2_INODE_UPDATE_CREDITS 1
335
336/* get one bit out of a suballocator: dinode + group descriptor +
337 * prev. group desc. if we relink. */
338#define OCFS2_SUBALLOC_ALLOC (3)
339
340/* dinode + group descriptor update. We don't relink on free yet. */
341#define OCFS2_SUBALLOC_FREE (2)
342
343#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
344#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
345 + OCFS2_TRUNCATE_LOG_UPDATE)
346
347/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
348 * bitmap block for the new bit) */
349#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
350
351/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
352 * group descriptor + mkdir/symlink blocks */
353#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
354 + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
355
356/* local alloc metadata change + main bitmap updates */
357#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
358 + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
359
360/* used when we don't need an allocation change for a dir extend. One
361 * for the dinode, one for the new block. */
362#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
363
364/* file update (nlink, etc) + dir entry block */
365#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
366
367/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
368 * dir inode link */
369#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
370 + OCFS2_LINK_CREDITS)
371
372/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
373 * inode alloc group descriptor */
374#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
375
376/* dinode update, old dir dinode update, new dir dinode update, old
377 * dir dir entry, new dir dir entry, dir entry update for renaming
378 * directory + target unlink */
379#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
380 + OCFS2_UNLINK_CREDITS)
381
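/*
 * Worked example: expanding the macros above with
 * OCFS2_INODE_UPDATE_CREDITS == 1 gives
 *
 *	OCFS2_LINK_CREDITS   = 1 + 1       = 2
 *	OCFS2_UNLINK_CREDITS = 2*1 + 1 + 2 = 5
 *	OCFS2_RENAME_CREDITS = 3*1 + 3 + 5 = 11
 *
 * so a rename reserves at most 11 credits, before any extend or
 * truncate credits are added on top.
 */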
382static inline int ocfs2_calc_extend_credits(struct super_block *sb,
383 struct ocfs2_dinode *fe,
384 u32 bits_wanted)
385{
386 int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
387
388 /* bitmap dinode, group desc. + relinked group. */
389 bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
390
391 /* we might need to shift tree depth, so let's assume an
392 * absolute worst case of complete fragmentation. Even with
393 * that, we only need one update for the dinode, and then
394 * however many metadata chunks needed * a remaining suballoc
395 * alloc. */
396 sysfile_bitmap_blocks = 1 +
397 (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
398
399 /* this does not include *new* metadata blocks, which are
400 * accounted for in sysfile_bitmap_blocks. fe +
401 * prev. last_eb_blk + blocks along edge of tree.
402 * calc_symlink_credits passes because we just need 1
403 * credit for the dinode there. */
404 dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
405
406 return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
407}
408
409static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
410{
411 int blocks = OCFS2_MKNOD_CREDITS;
412
413 /* links can be longer than one block so we may update many
414 * within our single allocated extent. */
415 blocks += ocfs2_clusters_to_blocks(sb, 1);
416
417 return blocks;
418}
419
420static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
421 unsigned int cpg)
422{
423 int blocks;
424 int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
425 /* parent inode update + new block group header + bitmap inode update
426 + bitmap blocks affected */
427 blocks = 1 + 1 + 1 + bitmap_blocks;
428 return blocks;
429}
430
431static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
432 unsigned int clusters_to_del,
433 struct ocfs2_dinode *fe,
434 struct ocfs2_extent_list *last_el)
435{
436 /* for dinode + all headers in this pass + update to next leaf */
437 u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
438 u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
439 int credits = 1 + tree_depth + 1;
440 int i;
441
442 i = next_free - 1;
443 BUG_ON(i < 0);
444
445 /* We may be deleting metadata blocks, so metadata alloc dinode +
446 one desc. block for each possible delete. */
447 if (tree_depth && next_free == 1 &&
448 le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
449 credits += 1 + tree_depth;
450
451 /* update to the truncate log. */
452 credits += OCFS2_TRUNCATE_LOG_UPDATE;
453
454 return credits;
455}
456
457#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
new file mode 100644
index 000000000000..fe373a2101d9
--- /dev/null
+++ b/fs/ocfs2/localalloc.c
@@ -0,0 +1,983 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * localalloc.c
5 *
6 * Node local data allocation
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/bitops.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45
46#include "buffer_head_io.h"
47
48#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
49
50static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
51
52static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
53
54static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
55 struct ocfs2_dinode *alloc,
56 u32 numbits);
57
58static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
59
60static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
61 struct ocfs2_journal_handle *handle,
62 struct ocfs2_dinode *alloc,
63 struct inode *main_bm_inode,
64 struct buffer_head *main_bm_bh);
65
66static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
67 struct ocfs2_journal_handle *handle,
68 struct ocfs2_alloc_context **ac,
69 struct inode **bitmap_inode,
70 struct buffer_head **bitmap_bh);
71
72static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
73 struct ocfs2_journal_handle *handle,
74 struct ocfs2_alloc_context *ac);
75
76static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
77 struct inode *local_alloc_inode);
78
79/*
80 * Determine how large our local alloc window should be, in bits.
81 *
82 * These values (and the behavior in ocfs2_alloc_should_use_local) have
83 * been chosen so that most allocations, including new block groups, go
84 * through local alloc.
85 */
86static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
87{
88 BUG_ON(osb->s_clustersize_bits < 12);
89
90 return 2048 >> (osb->s_clustersize_bits - 12);
91}
92
93/*
94 * Tell us whether a given allocation should use the local alloc
95 * file. Otherwise, it has to go to the main bitmap.
96 */
97int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
98{
99 int la_bits = ocfs2_local_alloc_window_bits(osb);
100
101 if (osb->local_alloc_state != OCFS2_LA_ENABLED)
102 return 0;
103
104 /* la_bits should be at least twice the size (in clusters) of
105 * a new block group. We want to be sure block group
106 * allocations go through the local alloc, so allow an
107 * allocation to take up to half the bitmap. */
108 if (bits > (la_bits / 2))
109 return 0;
110
111 return 1;
112}
113
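/*
 * Worked example: with 4K clusters (s_clustersize_bits == 12) the
 * window is 2048 >> 0 == 2048 bits, i.e. 2048 clusters == 8MB; with 8K
 * clusters it is 1024 bits, again 8MB. Under the half-window rule
 * above, allocations of up to 1024 clusters (4MB at a 4K cluster size)
 * are served from the local alloc and anything larger falls through to
 * the main bitmap.
 */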
114int ocfs2_load_local_alloc(struct ocfs2_super *osb)
115{
116 int status = 0;
117 struct ocfs2_dinode *alloc = NULL;
118 struct buffer_head *alloc_bh = NULL;
119 u32 num_used;
120 struct inode *inode = NULL;
121 struct ocfs2_local_alloc *la;
122
123 mlog_entry_void();
124
125 /* read the alloc off disk */
126 inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
127 osb->slot_num);
128 if (!inode) {
129 status = -EINVAL;
130 mlog_errno(status);
131 goto bail;
132 }
133
134 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
135 &alloc_bh, 0, inode);
136 if (status < 0) {
137 mlog_errno(status);
138 goto bail;
139 }
140
141 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
142 la = OCFS2_LOCAL_ALLOC(alloc);
143
144 if (!(le32_to_cpu(alloc->i_flags) &
145 (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
146 mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
147 OCFS2_I(inode)->ip_blkno);
148 status = -EINVAL;
149 goto bail;
150 }
151
152 if ((la->la_size == 0) ||
153 (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
154 mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
155 le16_to_cpu(la->la_size));
156 status = -EINVAL;
157 goto bail;
158 }
159
160 /* do a little verification. */
161 num_used = ocfs2_local_alloc_count_bits(alloc);
162
163 /* hopefully the local alloc has always been recovered before
164 * we load it. */
165 if (num_used
166 || alloc->id1.bitmap1.i_used
167 || alloc->id1.bitmap1.i_total
168 || la->la_bm_off)
169 mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
170 "found = %u, set = %u, taken = %u, off = %u\n",
171 num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
172 le32_to_cpu(alloc->id1.bitmap1.i_total),
173 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
174
175 osb->local_alloc_bh = alloc_bh;
176 osb->local_alloc_state = OCFS2_LA_ENABLED;
177
178bail:
179 if (status < 0)
180 if (alloc_bh)
181 brelse(alloc_bh);
182 if (inode)
183 iput(inode);
184
185 mlog_exit(status);
186 return status;
187}
188
189/*
190 * return any unused bits to the bitmap and write out a clean
191 * local_alloc.
192 *
193 * This uses the local alloc buffer off of osb; once done, that buffer
194 * will be brelse'd and osb->local_alloc_bh NULL'd out.
195 */
196void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
197{
198 int status;
199 struct ocfs2_journal_handle *handle = NULL;
200 struct inode *local_alloc_inode = NULL;
201 struct buffer_head *bh = NULL;
202 struct buffer_head *main_bm_bh = NULL;
203 struct inode *main_bm_inode = NULL;
204 struct ocfs2_dinode *alloc_copy = NULL;
205 struct ocfs2_dinode *alloc = NULL;
206
207 mlog_entry_void();
208
209 if (osb->local_alloc_state == OCFS2_LA_UNUSED)
210 goto bail;
211
212 local_alloc_inode =
213 ocfs2_get_system_file_inode(osb,
214 LOCAL_ALLOC_SYSTEM_INODE,
215 osb->slot_num);
216 if (!local_alloc_inode) {
217 status = -ENOENT;
218 mlog_errno(status);
219 goto bail;
220 }
221
222 osb->local_alloc_state = OCFS2_LA_DISABLED;
223
224 handle = ocfs2_alloc_handle(osb);
225 if (!handle) {
226 status = -ENOMEM;
227 mlog_errno(status);
228 goto bail;
229 }
230
231 main_bm_inode = ocfs2_get_system_file_inode(osb,
232 GLOBAL_BITMAP_SYSTEM_INODE,
233 OCFS2_INVALID_SLOT);
234 if (!main_bm_inode) {
235 status = -EINVAL;
236 mlog_errno(status);
237 goto bail;
238 }
239
240 ocfs2_handle_add_inode(handle, main_bm_inode);
241 status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
242 if (status < 0) {
243 mlog_errno(status);
244 goto bail;
245 }
246
247 /* WINDOW_MOVE_CREDITS is a bit heavy... */
248 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
249 if (IS_ERR(handle)) {
250 mlog_errno(PTR_ERR(handle));
251 handle = NULL;
252 goto bail;
253 }
254
255 bh = osb->local_alloc_bh;
256 alloc = (struct ocfs2_dinode *) bh->b_data;
257
258 alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
259 if (!alloc_copy) {
260 status = -ENOMEM;
261 goto bail;
262 }
263 memcpy(alloc_copy, alloc, bh->b_size);
264
265 status = ocfs2_journal_access(handle, local_alloc_inode, bh,
266 OCFS2_JOURNAL_ACCESS_WRITE);
267 if (status < 0) {
268 mlog_errno(status);
269 goto bail;
270 }
271
272 ocfs2_clear_local_alloc(alloc);
273
274 status = ocfs2_journal_dirty(handle, bh);
275 if (status < 0) {
276 mlog_errno(status);
277 goto bail;
278 }
279
280 brelse(bh);
281 osb->local_alloc_bh = NULL;
282 osb->local_alloc_state = OCFS2_LA_UNUSED;
283
284 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
285 main_bm_inode, main_bm_bh);
286 if (status < 0)
287 mlog_errno(status);
288
289bail:
290 if (handle)
291 ocfs2_commit_trans(handle);
292
293 if (main_bm_bh)
294 brelse(main_bm_bh);
295
296 if (main_bm_inode)
297 iput(main_bm_inode);
298
299 if (local_alloc_inode)
300 iput(local_alloc_inode);
301
302 if (alloc_copy)
303 kfree(alloc_copy);
304
305 mlog_exit_void();
306}
307
308/*
309 * We want to free the bitmap bits outside of any recovery context as
310 * we'll need a cluster lock to do so, but we must clear the local
311 * alloc before giving up the recovered node's journal. To solve this,
312 * we kmalloc a copy of the local alloc before it's changed, for the
313 * caller to process with ocfs2_complete_local_alloc_recovery.
314 */
315int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
316 int slot_num,
317 struct ocfs2_dinode **alloc_copy)
318{
319 int status = 0;
320 struct buffer_head *alloc_bh = NULL;
321 struct inode *inode = NULL;
322 struct ocfs2_dinode *alloc;
323
324 mlog_entry("(slot_num = %d)\n", slot_num);
325
326 *alloc_copy = NULL;
327
328 inode = ocfs2_get_system_file_inode(osb,
329 LOCAL_ALLOC_SYSTEM_INODE,
330 slot_num);
331 if (!inode) {
332 status = -EINVAL;
333 mlog_errno(status);
334 goto bail;
335 }
336
337 down(&inode->i_sem);
338
339 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
340 &alloc_bh, 0, inode);
341 if (status < 0) {
342 mlog_errno(status);
343 goto bail;
344 }
345
346 *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
347 if (!(*alloc_copy)) {
348 status = -ENOMEM;
349 goto bail;
350 }
351 memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
352
353 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
354 ocfs2_clear_local_alloc(alloc);
355
356 status = ocfs2_write_block(osb, alloc_bh, inode);
357 if (status < 0)
358 mlog_errno(status);
359
360bail:
361 if ((status < 0) && (*alloc_copy)) {
362 kfree(*alloc_copy);
363 *alloc_copy = NULL;
364 }
365
366 if (alloc_bh)
367 brelse(alloc_bh);
368
369 if (inode) {
370 up(&inode->i_sem);
371 iput(inode);
372 }
373
374 mlog_exit(status);
375 return status;
376}
377
378/*
379 * Step 2: By now, we've completed the journal recovery, we've stamped
380 * a clean local alloc on disk and dropped the node out of the
382 * recovery map. DLM locks will no longer stall, so let's clear out the
382 * main bitmap.
383 */
384int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
385 struct ocfs2_dinode *alloc)
386{
387 int status;
388 struct ocfs2_journal_handle *handle = NULL;
389 struct buffer_head *main_bm_bh = NULL;
390 struct inode *main_bm_inode = NULL;
391
392 mlog_entry_void();
393
394 handle = ocfs2_alloc_handle(osb);
395 if (!handle) {
396 status = -ENOMEM;
397 mlog_errno(status);
398 goto bail;
399 }
400
401 main_bm_inode = ocfs2_get_system_file_inode(osb,
402 GLOBAL_BITMAP_SYSTEM_INODE,
403 OCFS2_INVALID_SLOT);
404 if (!main_bm_inode) {
405 status = -EINVAL;
406 mlog_errno(status);
407 goto bail;
408 }
409
410 ocfs2_handle_add_inode(handle, main_bm_inode);
411 status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
412 if (status < 0) {
413 mlog_errno(status);
414 goto bail;
415 }
416
417 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
418 if (IS_ERR(handle)) {
419 status = PTR_ERR(handle);
420 handle = NULL;
421 mlog_errno(status);
422 goto bail;
423 }
424
425 /* we want the bitmap change to be recorded on disk asap */
426 ocfs2_handle_set_sync(handle, 1);
427
428 status = ocfs2_sync_local_to_main(osb, handle, alloc,
429 main_bm_inode, main_bm_bh);
430 if (status < 0)
431 mlog_errno(status);
432
433bail:
434 if (handle)
435 ocfs2_commit_trans(handle);
436
437 if (main_bm_bh)
438 brelse(main_bm_bh);
439
440 if (main_bm_inode)
441 iput(main_bm_inode);
442
443 mlog_exit(status);
444 return status;
445}
446
447/*
448 * make sure we've got at least bits_wanted contiguous bits in the
449 * local alloc. You lose them when you drop i_sem.
450 *
451 * We will add ourselves to the transaction passed in, but may start
452 * our own in order to shift windows.
453 */
454int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
455 struct ocfs2_journal_handle *passed_handle,
456 u32 bits_wanted,
457 struct ocfs2_alloc_context *ac)
458{
459 int status;
460 struct ocfs2_dinode *alloc;
461 struct inode *local_alloc_inode;
462 unsigned int free_bits;
463
464 mlog_entry_void();
465
466 BUG_ON(!passed_handle);
467 BUG_ON(!ac);
468 BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
469
470 local_alloc_inode =
471 ocfs2_get_system_file_inode(osb,
472 LOCAL_ALLOC_SYSTEM_INODE,
473 osb->slot_num);
474 if (!local_alloc_inode) {
475 status = -ENOENT;
476 mlog_errno(status);
477 goto bail;
478 }
479 ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
480
481 if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
482 status = -ENOSPC;
483 goto bail;
484 }
485
486 if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
487 mlog(0, "Asking for more than my max window size!\n");
488 status = -ENOSPC;
489 goto bail;
490 }
491
492 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
493
494 if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
495 ocfs2_local_alloc_count_bits(alloc)) {
496 ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
497 "%u free bits, but a count shows %u",
498 le64_to_cpu(alloc->i_blkno),
499 le32_to_cpu(alloc->id1.bitmap1.i_used),
500 ocfs2_local_alloc_count_bits(alloc));
501 status = -EIO;
502 goto bail;
503 }
504
505 free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
506 le32_to_cpu(alloc->id1.bitmap1.i_used);
507 if (bits_wanted > free_bits) {
508 /* uhoh, window change time. */
509 status =
510 ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
511 if (status < 0) {
512 if (status != -ENOSPC)
513 mlog_errno(status);
514 goto bail;
515 }
516 }
517
518 ac->ac_inode = igrab(local_alloc_inode);
519 get_bh(osb->local_alloc_bh);
520 ac->ac_bh = osb->local_alloc_bh;
521 ac->ac_which = OCFS2_AC_USE_LOCAL;
522 status = 0;
523bail:
524 if (local_alloc_inode)
525 iput(local_alloc_inode);
526
527 mlog_exit(status);
528 return status;
529}
530
531int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
532 struct ocfs2_journal_handle *handle,
533 struct ocfs2_alloc_context *ac,
534 u32 min_bits,
535 u32 *bit_off,
536 u32 *num_bits)
537{
538 int status, start;
539 struct inode *local_alloc_inode;
540 u32 bits_wanted;
541 void *bitmap;
542 struct ocfs2_dinode *alloc;
543 struct ocfs2_local_alloc *la;
544
545 mlog_entry_void();
546 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
547
548 bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
549 local_alloc_inode = ac->ac_inode;
550 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
551 la = OCFS2_LOCAL_ALLOC(alloc);
552
553 start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
554 if (start == -1) {
555 /* TODO: Shouldn't we just BUG here? */
556 status = -ENOSPC;
557 mlog_errno(status);
558 goto bail;
559 }
560
561 bitmap = la->la_bitmap;
562 *bit_off = le32_to_cpu(la->la_bm_off) + start;
563 /* local alloc is always contiguous by nature -- we never
564 * delete bits from it! */
565 *num_bits = bits_wanted;
566
567 status = ocfs2_journal_access(handle, local_alloc_inode,
568 osb->local_alloc_bh,
569 OCFS2_JOURNAL_ACCESS_WRITE);
570 if (status < 0) {
571 mlog_errno(status);
572 goto bail;
573 }
574
575 while(bits_wanted--)
576 ocfs2_set_bit(start++, bitmap);
577
578 alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
579 le32_to_cpu(alloc->id1.bitmap1.i_used));
580
581 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
582 if (status < 0) {
583 mlog_errno(status);
584 goto bail;
585 }
586
587 status = 0;
588bail:
589 mlog_exit(status);
590 return status;
591}
592
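Taken together, ocfs2_reserve_local_alloc_bits() and ocfs2_claim_local_alloc_bits() form a reserve-then-claim pair: the reservation must run before the journal handle is started (note the BUG_ON on OCFS2_HANDLE_STARTED above), while the claim runs inside the transaction. A rough sketch of a hypothetical caller; the helper name and the credit count are purely illustrative, not taken from this patch.

/*
 * Hypothetical caller (sketch only, not part of this patch): reserve
 * before the transaction starts, claim once it has. Error paths and
 * cleanup are trimmed; the credit count is illustrative.
 */
static int example_local_alloc(struct ocfs2_super *osb,
			       struct ocfs2_journal_handle *handle,
			       struct ocfs2_alloc_context *ac,
			       u32 wanted, u32 *bit_off, u32 *num_bits)
{
	int status;

	ac->ac_bits_wanted = wanted;	/* the claim sizes itself from this */

	/* must precede ocfs2_start_trans() -- see the BUG_ON above */
	status = ocfs2_reserve_local_alloc_bits(osb, handle, wanted, ac);
	if (status < 0)
		return status;

	handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	return ocfs2_claim_local_alloc_bits(osb, handle, ac, 1,
					    bit_off, num_bits);
}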
593static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
594{
595 int i;
596 u8 *buffer;
597 u32 count = 0;
598 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
599
600 mlog_entry_void();
601
602 buffer = la->la_bitmap;
603 for (i = 0; i < le16_to_cpu(la->la_size); i++)
604 count += hweight8(buffer[i]);
605
606 mlog_exit(count);
607 return count;
608}
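hweight8() above is simply a byte population count. A self-contained userspace sketch of the same counting loop, with hweight8_sketch() as a hypothetical stand-in for the kernel helper:

#include <stdint.h>
#include <stdio.h>

static unsigned int hweight8_sketch(uint8_t b)
{
	unsigned int n = 0;

	while (b) {
		n += b & 1;	/* count the low bit */
		b >>= 1;
	}
	return n;
}

int main(void)
{
	uint8_t bitmap[4] = { 0xff, 0x0f, 0x01, 0x00 };
	unsigned int i, count = 0;

	for (i = 0; i < sizeof(bitmap); i++)
		count += hweight8_sketch(bitmap[i]);

	printf("%u bits set\n", count);	/* prints "13 bits set" */
	return 0;
}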
609
610static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
611 struct ocfs2_dinode *alloc,
612 u32 numbits)
613{
614 int numfound, bitoff, left, startoff, lastzero;
615 void *bitmap = NULL;
616
617 mlog_entry("(numbits wanted = %u)\n", numbits);
618
619 if (!alloc->id1.bitmap1.i_total) {
620 mlog(0, "No bits in my window!\n");
621 bitoff = -1;
622 goto bail;
623 }
624
625 bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
626
627 numfound = bitoff = startoff = 0;
628 lastzero = -1;
629 left = le32_to_cpu(alloc->id1.bitmap1.i_total);
630 while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
631 if (bitoff == left) {
632 /* mlog(0, "bitoff (%d) == left", bitoff); */
633 break;
634 }
635 /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
636 "numfound = %d\n", bitoff, startoff, numfound);*/
637
638 /* Ok, we found a zero bit... is it contig. or do we
639 * start over?*/
640 if (bitoff == startoff) {
641 /* we found a zero */
642 numfound++;
643 startoff++;
644 } else {
645 /* got a zero after some ones */
646 numfound = 1;
647 startoff = bitoff+1;
648 }
649 /* we got everything we needed */
650 if (numfound == numbits) {
651 /* mlog(0, "Found it all!\n"); */
652 break;
653 }
654 }
655
656 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
657 numfound);
658
659 if (numfound == numbits)
660 bitoff = startoff - numfound;
661 else
662 bitoff = -1;
663
664bail:
665 mlog_exit(bitoff);
666 return bitoff;
667}
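The search above scans for a run of numbits contiguous zero bits, restarting the run whenever a set bit interrupts it. The same idea as a minimal userspace sketch (helper name hypothetical):

#include <stdio.h>

/* find the start of a run of 'want' zero bits in bits[0..total), or -1 */
static int find_zero_run(const unsigned char *map, int total, int want)
{
	int i, run = 0;

	for (i = 0; i < total; i++) {
		if (map[i / 8] & (1 << (i % 8)))
			run = 0;		/* hit a used bit, restart */
		else if (++run == want)
			return i - want + 1;	/* run complete */
	}
	return -1;
}

int main(void)
{
	/* bits 0-3 used, bits 4-11 free */
	unsigned char map[2] = { 0x0f, 0x00 };

	printf("%d\n", find_zero_run(map, 12, 5));	/* prints 4 */
	return 0;
}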
668
669static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
670{
671 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
672 int i;
673 mlog_entry_void();
674
675 alloc->id1.bitmap1.i_total = 0;
676 alloc->id1.bitmap1.i_used = 0;
677 la->la_bm_off = 0;
678 for (i = 0; i < le16_to_cpu(la->la_size); i++)
679 la->la_bitmap[i] = 0;
680
681 mlog_exit_void();
682}
683
684#if 0
685/* turn this on and uncomment below to aid debugging window shifts. */
686static void ocfs2_verify_zero_bits(unsigned long *bitmap,
687 unsigned int start,
688 unsigned int count)
689{
690 unsigned int tmp = count;
691 while (tmp--) {
692 if (ocfs2_test_bit(start + tmp, bitmap)) {
693 printk("ocfs2_verify_zero_bits: start = %u, count = "
694 "%u\n", start, count);
695 printk("ocfs2_verify_zero_bits: bit %u is set!",
696 start + tmp);
697 BUG();
698 }
699 }
700}
701#endif
702
703/*
704 * sync the local alloc to main bitmap.
705 *
706 * assumes you've already locked the main bitmap -- the bitmap inode
707 * passed is used for caching.
708 */
709static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
710 struct ocfs2_journal_handle *handle,
711 struct ocfs2_dinode *alloc,
712 struct inode *main_bm_inode,
713 struct buffer_head *main_bm_bh)
714{
715 int status = 0;
716 int bit_off, left, count, start;
717 u64 la_start_blk;
718 u64 blkno;
719 void *bitmap;
720 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
721
722 mlog_entry("total = %u, COUNT = %u, used = %u\n",
723 le32_to_cpu(alloc->id1.bitmap1.i_total),
724 ocfs2_local_alloc_count_bits(alloc),
725 le32_to_cpu(alloc->id1.bitmap1.i_used));
726
727 if (!alloc->id1.bitmap1.i_total) {
728 mlog(0, "nothing to sync!\n");
729 goto bail;
730 }
731
732 if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
733 le32_to_cpu(alloc->id1.bitmap1.i_total)) {
734 mlog(0, "all bits were taken!\n");
735 goto bail;
736 }
737
738 la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
739 le32_to_cpu(la->la_bm_off));
740 bitmap = la->la_bitmap;
741 start = count = bit_off = 0;
742 left = le32_to_cpu(alloc->id1.bitmap1.i_total);
743
744 while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
745 != -1) {
746 if ((bit_off < left) && (bit_off == start)) {
747 count++;
748 start++;
749 continue;
750 }
751 if (count) {
752 blkno = la_start_blk +
753 ocfs2_clusters_to_blocks(osb->sb,
754 start - count);
755
756 mlog(0, "freeing %u bits starting at local "
757 "alloc bit %u (la_start_blk = %"MLFu64", "
758 "blkno = %"MLFu64")\n", count, start - count,
759 la_start_blk, blkno);
760
761 status = ocfs2_free_clusters(handle, main_bm_inode,
762 main_bm_bh, blkno, count);
763 if (status < 0) {
764 mlog_errno(status);
765 goto bail;
766 }
767 }
768 if (bit_off >= left)
769 break;
770 count = 1;
771 start = bit_off + 1;
772 }
773
774bail:
775 mlog_exit(status);
776 return status;
777}
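The loop above batches consecutive free (zero) bits into runs so that each run costs a single ocfs2_free_clusters() call. The same run-batching shape, as a self-contained userspace sketch:

#include <stdio.h>

static int test_bit_sketch(const unsigned char *map, int i)
{
	return map[i / 8] & (1 << (i % 8));
}

int main(void)
{
	/* bits 0-1 free, 2 used, 3-5 free, 6-7 used */
	unsigned char map[1] = { 0xc4 };
	int i, count = 0, total = 8;

	for (i = 0; i <= total; i++) {
		if (i < total && !test_bit_sketch(map, i)) {
			count++;		/* extend the current run */
			continue;
		}
		if (count)			/* flush a completed run */
			printf("free %d bits at %d\n", count, i - count);
		count = 0;
	}
	return 0;	/* prints runs (2 at 0) and (3 at 3) */
}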
778
779static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
780 struct ocfs2_journal_handle *handle,
781 struct ocfs2_alloc_context **ac,
782 struct inode **bitmap_inode,
783 struct buffer_head **bitmap_bh)
784{
785 int status;
786
787 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
788 if (!(*ac)) {
789 status = -ENOMEM;
790 mlog_errno(status);
791 goto bail;
792 }
793
794 (*ac)->ac_handle = handle;
795 (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
796
797 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
798 if (status < 0) {
799 if (status != -ENOSPC)
800 mlog_errno(status);
801 goto bail;
802 }
803
804 *bitmap_inode = (*ac)->ac_inode;
805 igrab(*bitmap_inode);
806 *bitmap_bh = (*ac)->ac_bh;
807 get_bh(*bitmap_bh);
808 status = 0;
809bail:
810 if ((status < 0) && *ac) {
811 ocfs2_free_alloc_context(*ac);
812 *ac = NULL;
813 }
814
815 mlog_exit(status);
816 return status;
817}
818
819/*
820 * pass it the bitmap lock in lock_bh if you have it.
821 */
822static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
823 struct ocfs2_journal_handle *handle,
824 struct ocfs2_alloc_context *ac)
825{
826 int status = 0;
827 u32 cluster_off, cluster_count;
828 struct ocfs2_dinode *alloc = NULL;
829 struct ocfs2_local_alloc *la;
830
831 mlog_entry_void();
832
833 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
834 la = OCFS2_LOCAL_ALLOC(alloc);
835
836 if (alloc->id1.bitmap1.i_total)
837 mlog(0, "asking me to alloc a new window over a non-empty "
838 "one\n");
839
840 mlog(0, "Allocating %u clusters for a new window.\n",
841 ocfs2_local_alloc_window_bits(osb));
842 /* we used the generic suballoc reserve function, but we set
843 * everything up nicely, so there's no reason why we can't use
844 * the more specific cluster api to claim bits. */
845 status = ocfs2_claim_clusters(osb, handle, ac,
846 ocfs2_local_alloc_window_bits(osb),
847 &cluster_off, &cluster_count);
848 if (status < 0) {
849 if (status != -ENOSPC)
850 mlog_errno(status);
851 goto bail;
852 }
853
854 la->la_bm_off = cpu_to_le32(cluster_off);
855 alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
856 /* just in case... In the future when we find space ourselves,
857 * we don't have to get all contiguous -- but we'll have to
858 * set all previously used bits in bitmap and update
859 * la_bits_set before setting the bits in the main bitmap. */
860 alloc->id1.bitmap1.i_used = 0;
861 memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
862 le16_to_cpu(la->la_size));
863
864 mlog(0, "New window allocated:\n");
865 mlog(0, "window la_bm_off = %u\n",
866 le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off));
867 mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
868
869bail:
870 mlog_exit(status);
871 return status;
872}
873
874/* Note that we do *NOT* lock the local alloc inode here as
875 * it's been locked already for us. */
876static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
877 struct inode *local_alloc_inode)
878{
879 int status = 0;
880 struct buffer_head *main_bm_bh = NULL;
881 struct inode *main_bm_inode = NULL;
882 struct ocfs2_journal_handle *handle = NULL;
883 struct ocfs2_dinode *alloc;
884 struct ocfs2_dinode *alloc_copy = NULL;
885 struct ocfs2_alloc_context *ac = NULL;
886
887 mlog_entry_void();
888
889 handle = ocfs2_alloc_handle(osb);
890 if (!handle) {
891 status = -ENOMEM;
892 mlog_errno(status);
893 goto bail;
894 }
895
896 /* This will lock the main bitmap for us. */
897 status = ocfs2_local_alloc_reserve_for_window(osb,
898 handle,
899 &ac,
900 &main_bm_inode,
901 &main_bm_bh);
902 if (status < 0) {
903 if (status != -ENOSPC)
904 mlog_errno(status);
905 goto bail;
906 }
907
908 handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
909 if (IS_ERR(handle)) {
910 status = PTR_ERR(handle);
911 handle = NULL;
912 mlog_errno(status);
913 goto bail;
914 }
915
916 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
917
918 /* We want to clear the local alloc before doing anything
919 * else, so that if we error later during this operation,
920 * local alloc shutdown won't try to double free main bitmap
921 * bits. Make a copy so the sync function knows which bits to
922 * free. */
923 alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
924 if (!alloc_copy) {
925 status = -ENOMEM;
926 mlog_errno(status);
927 goto bail;
928 }
929 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
930
931 status = ocfs2_journal_access(handle, local_alloc_inode,
932 osb->local_alloc_bh,
933 OCFS2_JOURNAL_ACCESS_WRITE);
934 if (status < 0) {
935 mlog_errno(status);
936 goto bail;
937 }
938
939 ocfs2_clear_local_alloc(alloc);
940
941 status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
942 if (status < 0) {
943 mlog_errno(status);
944 goto bail;
945 }
946
947 status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
948 main_bm_inode, main_bm_bh);
949 if (status < 0) {
950 mlog_errno(status);
951 goto bail;
952 }
953
954 status = ocfs2_local_alloc_new_window(osb, handle, ac);
955 if (status < 0) {
956 if (status != -ENOSPC)
957 mlog_errno(status);
958 goto bail;
959 }
960
961 atomic_inc(&osb->alloc_stats.moves);
962
963 status = 0;
964bail:
965 if (handle)
966 ocfs2_commit_trans(handle);
967
968 if (main_bm_bh)
969 brelse(main_bm_bh);
970
971 if (main_bm_inode)
972 iput(main_bm_inode);
973
974 if (alloc_copy)
975 kfree(alloc_copy);
976
977 if (ac)
978 ocfs2_free_alloc_context(ac);
979
980 mlog_exit(status);
981 return status;
982}
983
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
new file mode 100644
index 000000000000..30f88ce14e46
--- /dev/null
+++ b/fs/ocfs2/localalloc.h
@@ -0,0 +1,56 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * localalloc.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_LOCALALLOC_H
27#define OCFS2_LOCALALLOC_H
28
29int ocfs2_load_local_alloc(struct ocfs2_super *osb);
30
31void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
32
33int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
34 int node_num,
35 struct ocfs2_dinode **alloc_copy);
36
37int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
38 struct ocfs2_dinode *alloc);
39
40int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
41 u64 bits);
42
43struct ocfs2_alloc_context;
44int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
45 struct ocfs2_journal_handle *passed_handle,
46 u32 bits_wanted,
47 struct ocfs2_alloc_context *ac);
48
49int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
50 struct ocfs2_journal_handle *handle,
51 struct ocfs2_alloc_context *ac,
52 u32 min_bits,
53 u32 *bit_off,
54 u32 *num_bits);
55
56#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
new file mode 100644
index 000000000000..afdeec4b0eef
--- /dev/null
+++ b/fs/ocfs2/mmap.c
@@ -0,0 +1,102 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * mmap.c
5 *
6 * Code to deal with the mess that is clustered mmap.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/pagemap.h>
31#include <linux/uio.h>
32#include <linux/signal.h>
33#include <linux/rbtree.h>
34
35#define MLOG_MASK_PREFIX ML_FILE_IO
36#include <cluster/masklog.h>
37
38#include "ocfs2.h"
39
40#include "dlmglue.h"
41#include "file.h"
42#include "inode.h"
43#include "mmap.h"
44
45static struct page *ocfs2_nopage(struct vm_area_struct * area,
46 unsigned long address,
47 int *type)
48{
49 struct inode *inode = area->vm_file->f_dentry->d_inode;
50 struct page *page = NOPAGE_SIGBUS;
51 sigset_t blocked, oldset;
52 int ret;
53
54 mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
55
56 /* The best way to deal with signals in this path is
57 * to block them upfront, rather than allowing the
58 * locking paths to return -ERESTARTSYS. */
59 sigfillset(&blocked);
60
61 /* We should technically never get a bad ret return
62 * from sigprocmask */
63 ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
64 if (ret < 0) {
65 mlog_errno(ret);
66 goto out;
67 }
68
69 page = filemap_nopage(area, address, type);
70
71 ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
72 if (ret < 0)
73 mlog_errno(ret);
74out:
75 mlog_exit_ptr(page);
76 return page;
77}
78
79static struct vm_operations_struct ocfs2_file_vm_ops = {
80 .nopage = ocfs2_nopage,
81};
82
83int ocfs2_mmap(struct file *file,
84 struct vm_area_struct *vma)
85{
86 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
87 struct inode *inode = mapping->host;
88
89 /* We don't want to support shared writable mappings yet. */
90 if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
91 && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
92 mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
93 /* This is -EINVAL because generic_file_readonly_mmap
94 * returns it in a similar situation. */
95 return -EINVAL;
96 }
97
98 update_atime(inode);
99 vma->vm_ops = &ocfs2_file_vm_ops;
100 return 0;
101}
102
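From userspace, the effect of the check in ocfs2_mmap() is that a shared writable mapping fails with EINVAL while private or read-only mappings succeed. An illustrative sketch (the mount path is hypothetical):

/*
 * Userspace sketch (illustrative only): a MAP_SHARED + PROT_WRITE
 * mapping of an ocfs2 file is rejected with EINVAL by ocfs2_mmap()
 * above, while a private or read-only mapping succeeds.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/ocfs2/file", O_RDWR);	/* path is hypothetical */
	void *p;

	if (fd < 0)
		return 1;

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap");		/* expect EINVAL on this ocfs2 */

	close(fd);
	return 0;
}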
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
new file mode 100644
index 000000000000..1274ee0f1fe2
--- /dev/null
+++ b/fs/ocfs2/mmap.h
@@ -0,0 +1,6 @@
1#ifndef OCFS2_MMAP_H
2#define OCFS2_MMAP_H
3
4int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
5
6#endif /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
new file mode 100644
index 000000000000..f6b77ff1d2bf
--- /dev/null
+++ b/fs/ocfs2/namei.c
@@ -0,0 +1,2264 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * namei.c
5 *
6 * Create and rename file, directory, symlinks
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * Portions of this code from linux/fs/ext3/dir.c
11 *
12 * Copyright (C) 1992, 1993, 1994, 1995
13 * Remy Card (card@masi.ibp.fr)
14 * Laboratoire MASI - Institut Blaise pascal
15 * Universite Pierre et Marie Curie (Paris VI)
16 *
17 * from
18 *
19 * linux/fs/minix/dir.c
20 *
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * This program is free software; you can redistribute it and/or
24 * modify it under the terms of the GNU General Public
25 * License as published by the Free Software Foundation; either
26 * version 2 of the License, or (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
31 * General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public
34 * License along with this program; if not, write to the
35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
36 * Boston, MA 021110-1307, USA.
37 */
38
39#include <linux/fs.h>
40#include <linux/types.h>
41#include <linux/slab.h>
42#include <linux/highmem.h>
43
44#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h>
46
47#include "ocfs2.h"
48
49#include "alloc.h"
50#include "dcache.h"
51#include "dir.h"
52#include "dlmglue.h"
53#include "extent_map.h"
54#include "file.h"
55#include "inode.h"
56#include "journal.h"
57#include "namei.h"
58#include "suballoc.h"
59#include "symlink.h"
60#include "sysfile.h"
61#include "uptodate.h"
62#include "vote.h"
63
64#include "buffer_head_io.h"
65
66#define NAMEI_RA_CHUNKS 2
67#define NAMEI_RA_BLOCKS 4
68#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
69#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
70
71static inline int ocfs2_search_dirblock(struct buffer_head *bh,
72 struct inode *dir,
73 const char *name, int namelen,
74 unsigned long offset,
75 struct ocfs2_dir_entry **res_dir);
76
77static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
78 struct inode *dir,
79 struct ocfs2_dir_entry *de_del,
80 struct buffer_head *bh);
81
82static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
83 struct inode *dir,
84 const char *name, int namelen,
85 struct inode *inode, u64 blkno,
86 struct buffer_head *parent_fe_bh,
87 struct buffer_head *insert_bh);
88
89static int ocfs2_mknod_locked(struct ocfs2_super *osb,
90 struct inode *dir,
91 struct dentry *dentry, int mode,
92 dev_t dev,
93 struct buffer_head **new_fe_bh,
94 struct buffer_head *parent_fe_bh,
95 struct ocfs2_journal_handle *handle,
96 struct inode **ret_inode,
97 struct ocfs2_alloc_context *inode_ac);
98
99static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
100 struct ocfs2_journal_handle *handle,
101 struct inode *parent,
102 struct inode *inode,
103 struct buffer_head *fe_bh,
104 struct ocfs2_alloc_context *data_ac);
105
106static int ocfs2_double_lock(struct ocfs2_super *osb,
107 struct ocfs2_journal_handle *handle,
108 struct buffer_head **bh1,
109 struct inode *inode1,
110 struct buffer_head **bh2,
111 struct inode *inode2);
112
113static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
114 struct ocfs2_journal_handle *handle,
115 struct inode *inode,
116 char *name,
117 struct buffer_head **de_bh);
118
119static int ocfs2_orphan_add(struct ocfs2_super *osb,
120 struct ocfs2_journal_handle *handle,
121 struct inode *inode,
122 struct ocfs2_dinode *fe,
123 char *name,
124 struct buffer_head *de_bh);
125
126static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
127 struct ocfs2_journal_handle *handle,
128 struct inode *inode,
129 const char *symname);
130
131static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle,
132 struct dentry *dentry,
133 struct inode *inode, u64 blkno,
134 struct buffer_head *parent_fe_bh,
135 struct buffer_head *insert_bh)
136{
137 return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
138 dentry->d_name.name, dentry->d_name.len,
139 inode, blkno, parent_fe_bh, insert_bh);
140}
141
142/* An orphan dir name is an 8 byte value, printed as a hex string */
143#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
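The macro pins the name length to 16 characters because the name is the inode's 64-bit block number rendered as zero-padded hex. A sketch of that rendering; the snprintf format is an assumption for illustration, not lifted from this patch:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blkno = 0x1234;
	char name[16 + 1];		/* 2 * sizeof(u64) hex chars + NUL */

	snprintf(name, sizeof(name), "%016llx",
		 (unsigned long long)blkno);
	printf("%s\n", name);	/* prints "0000000000001234" */
	return 0;
}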
144
145static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
146 struct nameidata *nd)
147{
148 int status;
149 u64 blkno;
150 struct buffer_head *dirent_bh = NULL;
151 struct inode *inode = NULL;
152 struct dentry *ret;
153 struct ocfs2_dir_entry *dirent;
154 struct ocfs2_inode_info *oi;
155
156 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
157 dentry->d_name.len, dentry->d_name.name);
158
159 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
160 ret = ERR_PTR(-ENAMETOOLONG);
161 goto bail;
162 }
163
164 mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len,
165 dentry->d_name.name, OCFS2_I(dir)->ip_blkno);
166
167 status = ocfs2_meta_lock(dir, NULL, NULL, 0);
168 if (status < 0) {
169 if (status != -ENOENT)
170 mlog_errno(status);
171 ret = ERR_PTR(status);
172 goto bail;
173 }
174
175 status = ocfs2_find_files_on_disk(dentry->d_name.name,
176 dentry->d_name.len, &blkno,
177 dir, &dirent_bh, &dirent);
178 if (status < 0)
179 goto bail_add;
180
181 inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
182 if (IS_ERR(inode)) {
183 mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
184 ret = ERR_PTR(-EACCES);
185 goto bail_unlock;
186 }
187
188 oi = OCFS2_I(inode);
189 /* Clear any orphaned state... If we were able to look up the
190 * inode from a directory, it certainly can't be orphaned. We
191 * might have the bad state from a node which intended to
192 * orphan this inode but crashed before it could commit the
193 * unlink. */
194 spin_lock(&oi->ip_lock);
195 oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
196 oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
197 spin_unlock(&oi->ip_lock);
198
199bail_add:
200
201 dentry->d_op = &ocfs2_dentry_ops;
202 ret = d_splice_alias(inode, dentry);
203
204bail_unlock:
205 /* Don't drop the cluster lock until *after* the d_add --
206 * unlink on another node will message us to remove that
207 * dentry under this lock so otherwise we can race this with
208 * the vote thread and have a stale dentry. */
209 ocfs2_meta_unlock(dir, 0);
210
211bail:
212 if (dirent_bh)
213 brelse(dirent_bh);
214
215 mlog_exit_ptr(ret);
216
217 return ret;
218}
219
220static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
221 struct ocfs2_journal_handle *handle,
222 struct inode *parent,
223 struct inode *inode,
224 struct buffer_head *fe_bh,
225 struct ocfs2_alloc_context *data_ac)
226{
227 int status;
228 struct buffer_head *new_bh = NULL;
229 struct ocfs2_dir_entry *de = NULL;
230
231 mlog_entry_void();
232
233 status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
234 data_ac, NULL, &new_bh);
235 if (status < 0) {
236 mlog_errno(status);
237 goto bail;
238 }
239
240 ocfs2_set_new_buffer_uptodate(inode, new_bh);
241
242 status = ocfs2_journal_access(handle, inode, new_bh,
243 OCFS2_JOURNAL_ACCESS_CREATE);
244 if (status < 0) {
245 mlog_errno(status);
246 goto bail;
247 }
248 memset(new_bh->b_data, 0, osb->sb->s_blocksize);
249
250 de = (struct ocfs2_dir_entry *) new_bh->b_data;
251 de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
252 de->name_len = 1;
253 de->rec_len =
254 cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
255 strcpy(de->name, ".");
256 ocfs2_set_de_type(de, S_IFDIR);
257 de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
258 de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
259 de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
260 OCFS2_DIR_REC_LEN(1));
261 de->name_len = 2;
262 strcpy(de->name, "..");
263 ocfs2_set_de_type(de, S_IFDIR);
264
265 status = ocfs2_journal_dirty(handle, new_bh);
266 if (status < 0) {
267 mlog_errno(status);
268 goto bail;
269 }
270
271 i_size_write(inode, inode->i_sb->s_blocksize);
272 inode->i_nlink = 2;
273 inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
274 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
275 if (status < 0) {
276 mlog_errno(status);
277 goto bail;
278 }
279
280 status = 0;
281bail:
282 if (new_bh)
283 brelse(new_bh);
284
285 mlog_exit(status);
286 return status;
287}
288
289static int ocfs2_mknod(struct inode *dir,
290 struct dentry *dentry,
291 int mode,
292 dev_t dev)
293{
294 int status = 0;
295 struct buffer_head *parent_fe_bh = NULL;
296 struct ocfs2_journal_handle *handle = NULL;
297 struct ocfs2_super *osb;
298 struct ocfs2_dinode *dirfe;
299 struct buffer_head *new_fe_bh = NULL;
300 struct buffer_head *de_bh = NULL;
301 struct inode *inode = NULL;
302 struct ocfs2_alloc_context *inode_ac = NULL;
303 struct ocfs2_alloc_context *data_ac = NULL;
304
305 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
306 (unsigned long)dev, dentry->d_name.len,
307 dentry->d_name.name);
308
309 /* get our super block */
310 osb = OCFS2_SB(dir->i_sb);
311
312 if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
313 mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n",
314 OCFS2_I(dir)->ip_blkno, dir->i_nlink);
315 status = -EMLINK;
316 goto leave;
317 }
318
319 handle = ocfs2_alloc_handle(osb);
320 if (handle == NULL) {
321 status = -ENOMEM;
322 mlog_errno(status);
323 goto leave;
324 }
325
326 status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
327 if (status < 0) {
328 if (status != -ENOENT)
329 mlog_errno(status);
330 goto leave;
331 }
332
333 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
334 if (!dirfe->i_links_count) {
335 /* can't make a file in a deleted directory. */
336 status = -ENOENT;
337 goto leave;
338 }
339
340 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
341 dentry->d_name.len);
342 if (status)
343 goto leave;
344
345 /* get a spot inside the dir. */
346 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
347 dentry->d_name.name,
348 dentry->d_name.len, &de_bh);
349 if (status < 0) {
350 mlog_errno(status);
351 goto leave;
352 }
353
354 /* reserve an inode spot */
355 status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
356 if (status < 0) {
357 if (status != -ENOSPC)
358 mlog_errno(status);
359 goto leave;
360 }
361
362 /* are we making a directory? If so, reserve a cluster for its
363 * 1st extent. */
364 if (S_ISDIR(mode)) {
365 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
366 if (status < 0) {
367 if (status != -ENOSPC)
368 mlog_errno(status);
369 goto leave;
370 }
371 }
372
373 handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS);
374 if (IS_ERR(handle)) {
375 status = PTR_ERR(handle);
376 handle = NULL;
377 mlog_errno(status);
378 goto leave;
379 }
380
381 /* do the real work now. */
382 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
383 &new_fe_bh, parent_fe_bh, handle,
384 &inode, inode_ac);
385 if (status < 0) {
386 mlog_errno(status);
387 goto leave;
388 }
389
390 if (S_ISDIR(mode)) {
391 status = ocfs2_fill_new_dir(osb, handle, dir, inode,
392 new_fe_bh, data_ac);
393 if (status < 0) {
394 mlog_errno(status);
395 goto leave;
396 }
397
398 status = ocfs2_journal_access(handle, dir, parent_fe_bh,
399 OCFS2_JOURNAL_ACCESS_WRITE);
400 if (status < 0) {
401 mlog_errno(status);
402 goto leave;
403 }
404 le16_add_cpu(&dirfe->i_links_count, 1);
405 status = ocfs2_journal_dirty(handle, parent_fe_bh);
406 if (status < 0) {
407 mlog_errno(status);
408 goto leave;
409 }
410 dir->i_nlink++;
411 }
412
413 status = ocfs2_add_entry(handle, dentry, inode,
414 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
415 de_bh);
416 if (status < 0) {
417 mlog_errno(status);
418 goto leave;
419 }
420
421 insert_inode_hash(inode);
422 dentry->d_op = &ocfs2_dentry_ops;
423 d_instantiate(dentry, inode);
424 status = 0;
425leave:
426 if (handle)
427 ocfs2_commit_trans(handle);
428
429 if (status == -ENOSPC)
430 mlog(0, "Disk is full\n");
431
432 if (new_fe_bh)
433 brelse(new_fe_bh);
434
435 if (de_bh)
436 brelse(de_bh);
437
438 if (parent_fe_bh)
439 brelse(parent_fe_bh);
440
441 if ((status < 0) && inode)
442 iput(inode);
443
444 if (inode_ac)
445 ocfs2_free_alloc_context(inode_ac);
446
447 if (data_ac)
448 ocfs2_free_alloc_context(data_ac);
449
450 mlog_exit(status);
451
452 return status;
453}
454
455static int ocfs2_mknod_locked(struct ocfs2_super *osb,
456 struct inode *dir,
457 struct dentry *dentry, int mode,
458 dev_t dev,
459 struct buffer_head **new_fe_bh,
460 struct buffer_head *parent_fe_bh,
461 struct ocfs2_journal_handle *handle,
462 struct inode **ret_inode,
463 struct ocfs2_alloc_context *inode_ac)
464{
465 int status = 0;
466 struct ocfs2_dinode *fe = NULL;
467 struct ocfs2_extent_list *fel;
468 u64 fe_blkno = 0;
469 u16 suballoc_bit;
470 struct inode *inode = NULL;
471
472 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
473 (unsigned long)dev, dentry->d_name.len,
474 dentry->d_name.name);
475
476 *new_fe_bh = NULL;
477 *ret_inode = NULL;
478
479 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
480 &fe_blkno);
481 if (status < 0) {
482 mlog_errno(status);
483 goto leave;
484 }
485
486 inode = new_inode(dir->i_sb);
487 if (!inode) {
488 status = -ENOMEM;
489 mlog(ML_ERROR, "new_inode failed!\n");
490 goto leave;
491 }
492
493 /* populate as many fields early on as possible - many of
494 * these are used by the support functions here and in
495 * callers. */
496 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
497 OCFS2_I(inode)->ip_blkno = fe_blkno;
498 if (S_ISDIR(mode))
499 inode->i_nlink = 2;
500 else
501 inode->i_nlink = 1;
502 inode->i_mode = mode;
503 spin_lock(&osb->osb_lock);
504 inode->i_generation = osb->s_next_generation++;
505 spin_unlock(&osb->osb_lock);
506
507 *new_fe_bh = sb_getblk(osb->sb, fe_blkno);
508 if (!*new_fe_bh) {
509 status = -EIO;
510 mlog_errno(status);
511 goto leave;
512 }
513 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
514
515 status = ocfs2_journal_access(handle, inode, *new_fe_bh,
516 OCFS2_JOURNAL_ACCESS_CREATE);
517 if (status < 0) {
518 mlog_errno(status);
519 goto leave;
520 }
521
522 fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data;
523 memset(fe, 0, osb->sb->s_blocksize);
524
525 fe->i_generation = cpu_to_le32(inode->i_generation);
526 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
527 fe->i_blkno = cpu_to_le64(fe_blkno);
528 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
529 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num);
530 fe->i_uid = cpu_to_le32(current->fsuid);
531 if (dir->i_mode & S_ISGID) {
532 fe->i_gid = cpu_to_le32(dir->i_gid);
533 if (S_ISDIR(mode))
534 mode |= S_ISGID;
535 } else
536 fe->i_gid = cpu_to_le32(current->fsgid);
537 fe->i_mode = cpu_to_le16(mode);
538 if (S_ISCHR(mode) || S_ISBLK(mode))
539 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
540
541 fe->i_links_count = cpu_to_le16(inode->i_nlink);
542
543 fe->i_last_eb_blk = 0;
544 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
545 le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL);
546 fe->i_atime = fe->i_ctime = fe->i_mtime =
547 cpu_to_le64(CURRENT_TIME.tv_sec);
548 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
549 cpu_to_le32(CURRENT_TIME.tv_nsec);
550 fe->i_dtime = 0;
551
552 fel = &fe->id2.i_list;
553 fel->l_tree_depth = 0;
554 fel->l_next_free_rec = 0;
555 fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
556
557 status = ocfs2_journal_dirty(handle, *new_fe_bh);
558 if (status < 0) {
559 mlog_errno(status);
560 goto leave;
561 }
562
563 if (ocfs2_populate_inode(inode, fe, 1) < 0) {
564 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
565 "i_blkno=%"MLFu64", i_ino=%lu\n",
566 (unsigned long long) (*new_fe_bh)->b_blocknr,
567 le64_to_cpu(fe->i_blkno), inode->i_ino);
568 BUG();
569 }
570
571 ocfs2_inode_set_new(osb, inode);
572 status = ocfs2_create_new_inode_locks(inode);
573 if (status < 0)
574 mlog_errno(status);
575
576 status = 0; /* error in ocfs2_create_new_inode_locks is not
577 * critical */
578
579 *ret_inode = inode;
580leave:
581 if (status < 0) {
582 if (*new_fe_bh) {
583 brelse(*new_fe_bh);
584 *new_fe_bh = NULL;
585 }
586 if (inode)
587 iput(inode);
588 }
589
590 mlog_exit(status);
591 return status;
592}
593
594static int ocfs2_mkdir(struct inode *dir,
595 struct dentry *dentry,
596 int mode)
597{
598 int ret;
599
600 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
601 dentry->d_name.len, dentry->d_name.name);
602 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
603 mlog_exit(ret);
604
605 return ret;
606}
607
608static int ocfs2_create(struct inode *dir,
609 struct dentry *dentry,
610 int mode,
611 struct nameidata *nd)
612{
613 int ret;
614
615 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
616 dentry->d_name.len, dentry->d_name.name);
617 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
618 mlog_exit(ret);
619
620 return ret;
621}
622
623static int ocfs2_link(struct dentry *old_dentry,
624 struct inode *dir,
625 struct dentry *dentry)
626{
627 struct ocfs2_journal_handle *handle = NULL;
628 struct inode *inode = old_dentry->d_inode;
629 int err;
630 struct buffer_head *fe_bh = NULL;
631 struct buffer_head *parent_fe_bh = NULL;
632 struct buffer_head *de_bh = NULL;
633 struct ocfs2_dinode *fe = NULL;
634 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
635
636 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
637 old_dentry->d_name.len, old_dentry->d_name.name,
638 dentry->d_name.len, dentry->d_name.name);
639
640 if (S_ISDIR(inode->i_mode)) {
641 err = -EPERM;
642 goto bail;
643 }
644
645 if (inode->i_nlink >= OCFS2_LINK_MAX) {
646 err = -EMLINK;
647 goto bail;
648 }
649
650 handle = ocfs2_alloc_handle(osb);
651 if (handle == NULL) {
652 err = -ENOMEM;
653 goto bail;
654 }
655
656 err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
657 if (err < 0) {
658 if (err != -ENOENT)
659 mlog_errno(err);
660 goto bail;
661 }
662
663 err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
664 dentry->d_name.len);
665 if (err)
666 goto bail;
667
668 err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
669 dentry->d_name.name,
670 dentry->d_name.len, &de_bh);
671 if (err < 0) {
672 mlog_errno(err);
673 goto bail;
674 }
675
676 err = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
677 if (err < 0) {
678 if (err != -ENOENT)
679 mlog_errno(err);
680 goto bail;
681 }
682
683 fe = (struct ocfs2_dinode *) fe_bh->b_data;
684 if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) {
685 err = -EMLINK;
686 goto bail;
687 }
688
689 handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS);
690 if (IS_ERR(handle)) {
691 err = PTR_ERR(handle);
692 handle = NULL;
693 mlog_errno(err);
694 goto bail;
695 }
696
697 err = ocfs2_journal_access(handle, inode, fe_bh,
698 OCFS2_JOURNAL_ACCESS_WRITE);
699 if (err < 0) {
700 mlog_errno(err);
701 goto bail;
702 }
703
704 inode->i_nlink++;
705 inode->i_ctime = CURRENT_TIME;
706 fe->i_links_count = cpu_to_le16(inode->i_nlink);
707 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
708 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
709
710 err = ocfs2_journal_dirty(handle, fe_bh);
711 if (err < 0) {
712 le16_add_cpu(&fe->i_links_count, -1);
713 inode->i_nlink--;
714 mlog_errno(err);
715 goto bail;
716 }
717
718 err = ocfs2_add_entry(handle, dentry, inode,
719 OCFS2_I(inode)->ip_blkno,
720 parent_fe_bh, de_bh);
721 if (err) {
722 le16_add_cpu(&fe->i_links_count, -1);
723 inode->i_nlink--;
724 mlog_errno(err);
725 goto bail;
726 }
727
728 atomic_inc(&inode->i_count);
729 dentry->d_op = &ocfs2_dentry_ops;
730 d_instantiate(dentry, inode);
731bail:
732 if (handle)
733 ocfs2_commit_trans(handle);
734 if (de_bh)
735 brelse(de_bh);
736 if (fe_bh)
737 brelse(fe_bh);
738 if (parent_fe_bh)
739 brelse(parent_fe_bh);
740
741 mlog_exit(err);
742
743 return err;
744}
745
746static int ocfs2_unlink(struct inode *dir,
747 struct dentry *dentry)
748{
749 int status;
750 unsigned int saved_nlink = 0;
751 struct inode *inode = dentry->d_inode;
752 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
753 u64 blkno;
754 struct ocfs2_dinode *fe = NULL;
755 struct buffer_head *fe_bh = NULL;
756 struct buffer_head *parent_node_bh = NULL;
757 struct ocfs2_journal_handle *handle = NULL;
758 struct ocfs2_dir_entry *dirent = NULL;
759 struct buffer_head *dirent_bh = NULL;
760 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
761 struct buffer_head *orphan_entry_bh = NULL;
762
763 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
764 dentry->d_name.len, dentry->d_name.name);
765
766 BUG_ON(dentry->d_parent->d_inode != dir);
767
768 mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
769
770 if (inode == osb->root_inode) {
771 mlog(0, "Cannot delete the root directory\n");
772 status = -EPERM;
773 goto leave;
774 }
775
776 handle = ocfs2_alloc_handle(osb);
777 if (handle == NULL) {
778 status = -ENOMEM;
779 mlog_errno(status);
780 goto leave;
781 }
782
783 status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1);
784 if (status < 0) {
785 if (status != -ENOENT)
786 mlog_errno(status);
787 goto leave;
788 }
789
790 status = ocfs2_find_files_on_disk(dentry->d_name.name,
791 dentry->d_name.len, &blkno,
792 dir, &dirent_bh, &dirent);
793 if (status < 0) {
794 if (status != -ENOENT)
795 mlog_errno(status);
796 goto leave;
797 }
798
799 if (OCFS2_I(inode)->ip_blkno != blkno) {
800 status = -ENOENT;
801
802 mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") "
803 "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno,
804 OCFS2_I(inode)->ip_flags);
805 goto leave;
806 }
807
808 status = ocfs2_meta_lock(inode, handle, &fe_bh, 1);
809 if (status < 0) {
810 if (status != -ENOENT)
811 mlog_errno(status);
812 goto leave;
813 }
814
815 if (S_ISDIR(inode->i_mode)) {
816 if (!ocfs2_empty_dir(inode)) {
817 status = -ENOTEMPTY;
818 goto leave;
819 } else if (inode->i_nlink != 2) {
820 status = -ENOTEMPTY;
821 goto leave;
822 }
823 }
824
825 /* There are still a few steps left until we can consider the
826 * unlink to have succeeded. Save off nlink here before
827 * modification so we can set it back in case we hit an issue
828 * before commit. */
829 saved_nlink = inode->i_nlink;
830 if (S_ISDIR(inode->i_mode))
831 inode->i_nlink = 0;
832 else
833 inode->i_nlink--;
834
835 status = ocfs2_request_unlink_vote(inode, dentry,
836 (unsigned int) inode->i_nlink);
837 if (status < 0) {
838 /* This vote should succeed under all normal
839 * circumstances. */
840 mlog_errno(status);
841 goto leave;
842 }
843
844 if (!inode->i_nlink) {
845 status = ocfs2_prepare_orphan_dir(osb, handle, inode,
846 orphan_name,
847 &orphan_entry_bh);
848 if (status < 0) {
849 mlog_errno(status);
850 goto leave;
851 }
852 }
853
854 handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS);
855 if (IS_ERR(handle)) {
856 status = PTR_ERR(handle);
857 handle = NULL;
858 mlog_errno(status);
859 goto leave;
860 }
861
862 status = ocfs2_journal_access(handle, inode, fe_bh,
863 OCFS2_JOURNAL_ACCESS_WRITE);
864 if (status < 0) {
865 mlog_errno(status);
866 goto leave;
867 }
868
869 fe = (struct ocfs2_dinode *) fe_bh->b_data;
870
871 if (!inode->i_nlink) {
872 status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
873 orphan_entry_bh);
874 if (status < 0) {
875 mlog_errno(status);
876 goto leave;
877 }
878 }
879
880 /* delete the name from the parent dir */
881 status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh);
882 if (status < 0) {
883 mlog_errno(status);
884 goto leave;
885 }
886
887 /* We can set nlink on the dinode now. clear the saved version
888 * so that it doesn't get set later. */
889 fe->i_links_count = cpu_to_le16(inode->i_nlink);
890 saved_nlink = 0;
891
892 status = ocfs2_journal_dirty(handle, fe_bh);
893 if (status < 0) {
894 mlog_errno(status);
895 goto leave;
896 }
897
898 if (S_ISDIR(inode->i_mode)) {
899 dir->i_nlink--;
900 status = ocfs2_mark_inode_dirty(handle, dir,
901 parent_node_bh);
902 if (status < 0) {
903 mlog_errno(status);
904 dir->i_nlink++;
905 }
906 }
907
908leave:
909 if (status < 0 && saved_nlink)
910 inode->i_nlink = saved_nlink;
911
912 if (handle)
913 ocfs2_commit_trans(handle);
914
915 if (fe_bh)
916 brelse(fe_bh);
917
918 if (dirent_bh)
919 brelse(dirent_bh);
920
921 if (parent_node_bh)
922 brelse(parent_node_bh);
923
924 if (orphan_entry_bh)
925 brelse(orphan_entry_bh);
926
927 mlog_exit(status);
928
929 return status;
930}
931
932/*
933 * The only place this should be used is rename!
934 * If they have the same id, then the 1st one is the only one locked.
935 */
936static int ocfs2_double_lock(struct ocfs2_super *osb,
937 struct ocfs2_journal_handle *handle,
938 struct buffer_head **bh1,
939 struct inode *inode1,
940 struct buffer_head **bh2,
941 struct inode *inode2)
942{
943 int status;
944 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
945 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
946 struct buffer_head **tmpbh;
947 struct inode *tmpinode;
948
949 mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n",
950 oi1->ip_blkno, oi2->ip_blkno);
951
952 BUG_ON(!handle);
953
954 if (*bh1)
955 *bh1 = NULL;
956 if (*bh2)
957 *bh2 = NULL;
958
959 /* we always want to lock the one with the lower lockid first. */
960 if (oi1->ip_blkno != oi2->ip_blkno) {
961 if (oi1->ip_blkno < oi2->ip_blkno) {
962 /* switch id1 and id2 around */
963 mlog(0, "switching them around...\n");
964 tmpbh = bh2;
965 bh2 = bh1;
966 bh1 = tmpbh;
967
968 tmpinode = inode2;
969 inode2 = inode1;
970 inode1 = tmpinode;
971 }
972 /* lock id2 */
973 status = ocfs2_meta_lock(inode2, handle, bh2, 1);
974 if (status < 0) {
975 if (status != -ENOENT)
976 mlog_errno(status);
977 goto bail;
978 }
979 }
980 /* lock id1 */
981 status = ocfs2_meta_lock(inode1, handle, bh1, 1);
982 if (status < 0) {
983 if (status != -ENOENT)
984 mlog_errno(status);
985 goto bail;
986 }
987bail:
988 mlog_exit(status);
989 return status;
990}
991
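The ordering rule in ocfs2_double_lock() -- always take the lock with the lower id first, and take only one lock when both inodes are the same -- is the classic deadlock-avoidance pattern. A generic userspace sketch of the same rule (names illustrative, not from this patch):

#include <pthread.h>

struct obj {
	unsigned long id;
	pthread_mutex_t lock;
};

static void lock_pair(struct obj *a, struct obj *b)
{
	if (a->id > b->id) {		/* order by id, lowest first */
		struct obj *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)			/* same object: lock only once */
		pthread_mutex_lock(&b->lock);
}

int main(void)
{
	struct obj x = { 1, PTHREAD_MUTEX_INITIALIZER };
	struct obj y = { 2, PTHREAD_MUTEX_INITIALIZER };

	lock_pair(&y, &x);	/* still locks x (id 1) first */
	pthread_mutex_unlock(&y.lock);
	pthread_mutex_unlock(&x.lock);
	return 0;
}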
992#define PARENT_INO(buffer) \
993 ((struct ocfs2_dir_entry *) \
994 ((char *)buffer + \
995 le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
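PARENT_INO() works because the first entry in a directory block is always ".", whose rec_len gives the byte offset of the ".." entry; that entry's inode field is the parent. A simplified sketch with a hypothetical two-field dirent (the real ocfs2_dir_entry carries more fields and a different layout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct dirent_sketch {		/* simplified, illustrative layout */
	uint64_t inode;
	uint16_t rec_len;
};

int main(void)
{
	unsigned char block[64];
	struct dirent_sketch dot = { 11, 16 };		/* "." -> self */
	struct dirent_sketch dotdot = { 2, 48 };	/* ".." -> parent */
	struct dirent_sketch d, parent;

	memcpy(block, &dot, sizeof(dot));
	memcpy(block + dot.rec_len, &dotdot, sizeof(dotdot));

	/* step over "." via its rec_len, exactly what PARENT_INO does */
	memcpy(&d, block, sizeof(d));
	memcpy(&parent, block + d.rec_len, sizeof(parent));

	printf("parent inode = %llu\n", (unsigned long long)parent.inode);
	return 0;	/* prints "parent inode = 2" */
}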
996
997static int ocfs2_rename(struct inode *old_dir,
998 struct dentry *old_dentry,
999 struct inode *new_dir,
1000 struct dentry *new_dentry)
1001{
1002 int status = 0, rename_lock = 0;
1003 struct inode *old_inode = old_dentry->d_inode;
1004 struct inode *new_inode = new_dentry->d_inode;
1005 struct ocfs2_dinode *newfe = NULL;
1006 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
1007 struct buffer_head *orphan_entry_bh = NULL;
1008 struct buffer_head *newfe_bh = NULL;
1009 struct buffer_head *insert_entry_bh = NULL;
1010 struct ocfs2_super *osb = NULL;
1011 u64 newfe_blkno;
1012 struct ocfs2_journal_handle *handle = NULL;
1013 struct buffer_head *old_dir_bh = NULL;
1014 struct buffer_head *new_dir_bh = NULL;
1015 struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
1016 // and new_dentry
1017 struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
1018 struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
1019 // this is the 1st dirent bh
1020 nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink;
1021 unsigned int links_count;
1022
1023 /* At some point it might be nice to break this function up a
1024 * bit. */
1025
1026 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
1027 old_dir, old_dentry, new_dir, new_dentry,
1028 old_dentry->d_name.len, old_dentry->d_name.name,
1029 new_dentry->d_name.len, new_dentry->d_name.name);
1030
1031 osb = OCFS2_SB(old_dir->i_sb);
1032
1033 if (new_inode) {
1034 if (!igrab(new_inode))
1035 BUG();
1036 }
1037
1038 if (atomic_read(&old_dentry->d_count) > 2) {
1039 shrink_dcache_parent(old_dentry);
1040 if (atomic_read(&old_dentry->d_count) > 2) {
1041 status = -EBUSY;
1042 goto bail;
1043 }
1044 }
1045
1046 /* Assume a directory hierarchy thusly:
1047 * a/b/c
1048 * a/d
1049 * a,b,c, and d are all directories.
1050 *
1051 * from cwd of 'a' on both nodes:
1052 * node1: mv b/c d
1053 * node2: mv d b/c
1054 *
1055 * And that's why, just like the VFS, we need a file system
1056 * rename lock. */
1057 if (old_dentry != new_dentry) {
1058 status = ocfs2_rename_lock(osb);
1059 if (status < 0) {
1060 mlog_errno(status);
1061 goto bail;
1062 }
1063 rename_lock = 1;
1064 }
1065
1066 handle = ocfs2_alloc_handle(osb);
1067 if (handle == NULL) {
1068 status = -ENOMEM;
1069 mlog_errno(status);
1070 goto bail;
1071 }
1072
1073 /* if old and new are the same, this'll just do one lock. */
1074 status = ocfs2_double_lock(osb, handle,
1075 &old_dir_bh, old_dir,
1076 &new_dir_bh, new_dir);
1077 if (status < 0) {
1078 mlog_errno(status);
1079 goto bail;
1080 }
1081
1082 /* make sure both dirs have bhs
1083 * get an extra ref on old_dir_bh if old==new */
1084 if (!new_dir_bh) {
1085 if (old_dir_bh) {
1086 new_dir_bh = old_dir_bh;
1087 get_bh(new_dir_bh);
1088 } else {
1089 mlog(ML_ERROR, "no old_dir_bh!\n");
1090 status = -EIO;
1091 goto bail;
1092 }
1093 }
1094
1095 if (S_ISDIR(old_inode->i_mode)) {
1096 /* Directories actually require metadata updates to
1097 * the directory info so we can't get away with not
1098 * doing node locking on it. */
1099 status = ocfs2_meta_lock(old_inode, handle, NULL, 1);
1100 if (status < 0) {
1101 if (status != -ENOENT)
1102 mlog_errno(status);
1103 goto bail;
1104 }
1105
1106 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1107 if (status < 0) {
1108 mlog_errno(status);
1109 goto bail;
1110 }
1111
1112 status = -EIO;
1113 old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
1114 if (!old_inode_de_bh)
1115 goto bail;
1116
1117 status = -EIO;
1118 if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
1119 OCFS2_I(old_dir)->ip_blkno)
1120 goto bail;
1121 status = -EMLINK;
1122 if (!new_inode && new_dir != old_dir &&
1123 new_dir->i_nlink >= OCFS2_LINK_MAX)
1124 goto bail;
1125 } else {
1126 /* Ah, the simple case - we're a file so just send a
1127 * message. */
1128 status = ocfs2_request_rename_vote(old_inode, old_dentry);
1129 if (status < 0) {
1130 mlog_errno(status);
1131 goto bail;
1132 }
1133 }
1134
1135 status = -ENOENT;
1136 old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
1137 old_dentry->d_name.len,
1138 old_dir, &old_de);
1139 if (!old_de_bh)
1140 goto bail;
1141
1142 /*
1143 * The inode number check is _not_ there to guard against IO errors.
1144 * We might rmdir the source, keep it as pwd of some process
1145 * and merrily kill the link to whatever was created under the
1146 * same name. Goodbye sticky bit ;-<
1147 */
1148 if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
1149 goto bail;
1150
1151 /* check if the target already exists (in which case we need
1152 * to delete it) */
1153 status = ocfs2_find_files_on_disk(new_dentry->d_name.name,
1154 new_dentry->d_name.len,
1155 &newfe_blkno, new_dir, &new_de_bh,
1156 &new_de);
1157 /* The only error we allow here is -ENOENT because the new
1158 * file not existing is perfectly valid. */
1159 if ((status < 0) && (status != -ENOENT)) {
1160 /* If we cannot find the file specified we should just
1161 * return the error... */
1162 mlog_errno(status);
1163 goto bail;
1164 }
1165
1166 if (!new_de && new_inode)
1167 mlog(ML_ERROR, "inode %lu does not exist in it's parent "
1168 "directory!", new_inode->i_ino);
1169
1170 /* In case we need to overwrite an existing file, we blow it
1171 * away first */
1172 if (new_de) {
1173 /* VFS didn't think there existed an inode here, but
1174 * someone else in the cluster must have raced our
1175 * rename to create one. Today we error cleanly, in
1176 * the future we should consider calling iget to build
1177 * a new struct inode for this entry. */
1178 if (!new_inode) {
1179 status = -EACCES;
1180
1181 mlog(0, "We found an inode for name %.*s but VFS "
1182 "didn't give us one.\n", new_dentry->d_name.len,
1183 new_dentry->d_name.name);
1184 goto bail;
1185 }
1186
1187 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
1188 status = -EACCES;
1189
1190 mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") "
1191 "disagree. ip_flags = %x\n",
1192 OCFS2_I(new_inode)->ip_blkno, newfe_blkno,
1193 OCFS2_I(new_inode)->ip_flags);
1194 goto bail;
1195 }
1196
1197 status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1);
1198 if (status < 0) {
1199 if (status != -ENOENT)
1200 mlog_errno(status);
1201 goto bail;
1202 }
1203
1204 if (S_ISDIR(new_inode->i_mode))
1205 links_count = 0;
1206 else
1207 links_count = (unsigned int) (new_inode->i_nlink - 1);
1208
1209 status = ocfs2_request_unlink_vote(new_inode, new_dentry,
1210 links_count);
1211 if (status < 0) {
1212 mlog_errno(status);
1213 goto bail;
1214 }
1215
1216 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1217
1218 mlog(0, "aha rename over existing... new_de=%p "
1219 "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n",
1220 new_de, newfe_blkno, newfe_bh, newfe_bh ?
1221 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1222
1223 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1224 status = ocfs2_prepare_orphan_dir(osb, handle,
1225 new_inode,
1226 orphan_name,
1227 &orphan_entry_bh);
1228 if (status < 0) {
1229 mlog_errno(status);
1230 goto bail;
1231 }
1232 }
1233 } else {
1234 BUG_ON(new_dentry->d_parent->d_inode != new_dir);
1235
1236 status = ocfs2_check_dir_for_entry(new_dir,
1237 new_dentry->d_name.name,
1238 new_dentry->d_name.len);
1239 if (status)
1240 goto bail;
1241
1242 status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh,
1243 new_dentry->d_name.name,
1244 new_dentry->d_name.len,
1245 &insert_entry_bh);
1246 if (status < 0) {
1247 mlog_errno(status);
1248 goto bail;
1249 }
1250 }
1251
1252 handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS);
1253 if (IS_ERR(handle)) {
1254 status = PTR_ERR(handle);
1255 handle = NULL;
1256 mlog_errno(status);
1257 goto bail;
1258 }
1259
1260 if (new_de) {
1261 if (S_ISDIR(new_inode->i_mode)) {
1262 if (!ocfs2_empty_dir(new_inode) ||
1263 new_inode->i_nlink != 2) {
1264 status = -ENOTEMPTY;
1265 goto bail;
1266 }
1267 }
1268 status = ocfs2_journal_access(handle, new_inode, newfe_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (status < 0) {
1271 mlog_errno(status);
1272 goto bail;
1273 }
1274
1275 if (S_ISDIR(new_inode->i_mode) ||
1276 (newfe->i_links_count == cpu_to_le16(1))){
1277 status = ocfs2_orphan_add(osb, handle, new_inode,
1278 newfe, orphan_name,
1279 orphan_entry_bh);
1280 if (status < 0) {
1281 mlog_errno(status);
1282 goto bail;
1283 }
1284 }
1285
1286 /* change the dirent to point to the correct inode */
1287 status = ocfs2_journal_access(handle, new_dir, new_de_bh,
1288 OCFS2_JOURNAL_ACCESS_WRITE);
1289 if (status < 0) {
1290 mlog_errno(status);
1291 goto bail;
1292 }
1293 new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
1294 new_de->file_type = old_de->file_type;
1295 new_dir->i_version++;
1296 status = ocfs2_journal_dirty(handle, new_de_bh);
1297 if (status < 0) {
1298 mlog_errno(status);
1299 goto bail;
1300 }
1301
1302 if (S_ISDIR(new_inode->i_mode))
1303 newfe->i_links_count = 0;
1304 else
1305 le16_add_cpu(&newfe->i_links_count, -1);
1306
1307 status = ocfs2_journal_dirty(handle, newfe_bh);
1308 if (status < 0) {
1309 mlog_errno(status);
1310 goto bail;
1311 }
1312 } else {
1313 /* if the name was not found in new_dir, add it now */
1314 status = ocfs2_add_entry(handle, new_dentry, old_inode,
1315 OCFS2_I(old_inode)->ip_blkno,
1316 new_dir_bh, insert_entry_bh);
1317 }
1318
1319 old_inode->i_ctime = CURRENT_TIME;
1320 mark_inode_dirty(old_inode);
1321
1322 /* now that the name has been added to new_dir, remove the old name */
1323 status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
1324 if (status < 0) {
1325 mlog_errno(status);
1326 goto bail;
1327 }
1328
1329 if (new_inode) {
1330 new_inode->i_nlink--;
1331 new_inode->i_ctime = CURRENT_TIME;
1332 }
1333 old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
1334 if (old_inode_de_bh) {
1335 status = ocfs2_journal_access(handle, old_inode,
1336 old_inode_de_bh,
1337 OCFS2_JOURNAL_ACCESS_WRITE);
1338 PARENT_INO(old_inode_de_bh->b_data) =
1339 cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
1340 status = ocfs2_journal_dirty(handle, old_inode_de_bh);
1341 old_dir->i_nlink--;
1342 if (new_inode) {
1343 new_inode->i_nlink--;
1344 } else {
1345 new_dir->i_nlink++;
1346 mark_inode_dirty(new_dir);
1347 }
1348 }
1349 mark_inode_dirty(old_dir);
1350 if (new_inode)
1351 mark_inode_dirty(new_inode);
1352
1353 if (old_dir != new_dir)
1354 if (new_dir_nlink != new_dir->i_nlink) {
1355 if (!new_dir_bh) {
1356 mlog(ML_ERROR, "need to change nlink for new "
1357 "dir %"MLFu64" from %d to %d but bh is "
1358 "NULL\n", OCFS2_I(new_dir)->ip_blkno,
1359 (int)new_dir_nlink, new_dir->i_nlink);
1360 } else {
1361 struct ocfs2_dinode *fe;
1362 status = ocfs2_journal_access(handle,
1363 new_dir,
1364 new_dir_bh,
1365 OCFS2_JOURNAL_ACCESS_WRITE);
1366 fe = (struct ocfs2_dinode *) new_dir_bh->b_data;
1367 fe->i_links_count = cpu_to_le16(new_dir->i_nlink);
1368 status = ocfs2_journal_dirty(handle, new_dir_bh);
1369 }
1370 }
1371
1372 if (old_dir_nlink != old_dir->i_nlink) {
1373 if (!old_dir_bh) {
1374 mlog(ML_ERROR, "need to change nlink for old dir "
1375 "%"MLFu64" from %d to %d but bh is NULL!\n",
1376 OCFS2_I(old_dir)->ip_blkno,
1377 (int)old_dir_nlink,
1378 old_dir->i_nlink);
1379 } else {
1380 struct ocfs2_dinode *fe;
1381 status = ocfs2_journal_access(handle, old_dir,
1382 old_dir_bh,
1383 OCFS2_JOURNAL_ACCESS_WRITE);
1384 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1385 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1386 status = ocfs2_journal_dirty(handle, old_dir_bh);
1387 }
1388 }
1389
1390 status = 0;
1391bail:
1392 if (rename_lock)
1393 ocfs2_rename_unlock(osb);
1394
1395 if (handle)
1396 ocfs2_commit_trans(handle);
1397
1398 if (new_inode)
1399 sync_mapping_buffers(old_inode->i_mapping);
1400
1401 if (new_inode)
1402 iput(new_inode);
1403 if (newfe_bh)
1404 brelse(newfe_bh);
1405 if (old_dir_bh)
1406 brelse(old_dir_bh);
1407 if (new_dir_bh)
1408 brelse(new_dir_bh);
1409 if (new_de_bh)
1410 brelse(new_de_bh);
1411 if (old_de_bh)
1412 brelse(old_de_bh);
1413 if (old_inode_de_bh)
1414 brelse(old_inode_de_bh);
1415 if (orphan_entry_bh)
1416 brelse(orphan_entry_bh);
1417 if (insert_entry_bh)
1418 brelse(insert_entry_bh);
1419
1420 mlog_exit(status);
1421
1422 return status;
1423}
1424
1425/*
1426 * we expect i_size = strlen(symname). Copy symname into the file
1427 * data, including the null terminator.
1428 */
1429static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1430 struct ocfs2_journal_handle *handle,
1431 struct inode *inode,
1432 const char *symname)
1433{
1434 struct buffer_head **bhs = NULL;
1435 const char *c;
1436 struct super_block *sb = osb->sb;
1437 u64 p_blkno;
1438 int p_blocks;
1439 int virtual, blocks, status, i, bytes_left;
1440
1441 bytes_left = i_size_read(inode) + 1;
1442 /* we can't trust i_blocks because we're actually going to
1443 * write i_size + 1 bytes. */
1444 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
1445
1446 mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n",
1447 inode->i_blocks, i_size_read(inode), blocks);
1448
1449 /* Sanity check -- make sure we're going to fit. */
1450 if (bytes_left >
1451 ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) {
1452 status = -EIO;
1453 mlog_errno(status);
1454 goto bail;
1455 }
1456
1457 bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL);
1458 if (!bhs) {
1459 status = -ENOMEM;
1460 mlog_errno(status);
1461 goto bail;
1462 }
1463
1464 status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
1465 &p_blocks);
1466 if (status < 0) {
1467 mlog_errno(status);
1468 goto bail;
1469 }
1470
1471 /* links can never be larger than one cluster so we know this
1472 * is all going to be contiguous, but do a sanity check
1473 * anyway. */
1474 if ((p_blocks << sb->s_blocksize_bits) < bytes_left) {
1475 status = -EIO;
1476 mlog_errno(status);
1477 goto bail;
1478 }
1479
1480 virtual = 0;
1481 while (bytes_left > 0) {
1482 c = &symname[virtual * sb->s_blocksize];
1483
1484 bhs[virtual] = sb_getblk(sb, p_blkno);
1485 if (!bhs[virtual]) {
1486 status = -ENOMEM;
1487 mlog_errno(status);
1488 goto bail;
1489 }
1490 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]);
1491
1492 status = ocfs2_journal_access(handle, inode, bhs[virtual],
1493 OCFS2_JOURNAL_ACCESS_CREATE);
1494 if (status < 0) {
1495 mlog_errno(status);
1496 goto bail;
1497 }
1498
1499 memset(bhs[virtual]->b_data, 0, sb->s_blocksize);
1500
1501 memcpy(bhs[virtual]->b_data, c,
1502 (bytes_left > sb->s_blocksize) ? sb->s_blocksize :
1503 bytes_left);
1504
1505 status = ocfs2_journal_dirty(handle, bhs[virtual]);
1506 if (status < 0) {
1507 mlog_errno(status);
1508 goto bail;
1509 }
1510
1511 virtual++;
1512 p_blkno++;
1513 bytes_left -= sb->s_blocksize;
1514 }
1515
1516 status = 0;
1517bail:
1518
1519 if (bhs) {
1520		for (i = 0; i < blocks; i++)
1521 if (bhs[i])
1522 brelse(bhs[i]);
1523 kfree(bhs);
1524 }
1525
1526 mlog_exit(status);
1527 return status;
1528}
1529
1530static int ocfs2_symlink(struct inode *dir,
1531 struct dentry *dentry,
1532 const char *symname)
1533{
1534 int status, l, credits;
1535 u64 newsize;
1536 struct ocfs2_super *osb = NULL;
1537 struct inode *inode = NULL;
1538 struct super_block *sb;
1539 struct buffer_head *new_fe_bh = NULL;
1540 struct buffer_head *de_bh = NULL;
1541 struct buffer_head *parent_fe_bh = NULL;
1542 struct ocfs2_dinode *fe = NULL;
1543 struct ocfs2_dinode *dirfe;
1544 struct ocfs2_journal_handle *handle = NULL;
1545 struct ocfs2_alloc_context *inode_ac = NULL;
1546 struct ocfs2_alloc_context *data_ac = NULL;
1547
1548 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1549 dentry, symname, dentry->d_name.len, dentry->d_name.name);
1550
1551 sb = dir->i_sb;
1552 osb = OCFS2_SB(sb);
1553
1554 l = strlen(symname) + 1;
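	/* Note: l counts the trailing NUL as well, so a symlink stores
	 * strlen(symname) + 1 bytes of data. */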
1555
1556 credits = ocfs2_calc_symlink_credits(sb);
1557
1558 handle = ocfs2_alloc_handle(osb);
1559 if (handle == NULL) {
1560 status = -ENOMEM;
1561 mlog_errno(status);
1562 goto bail;
1563 }
1564
1565 /* lock the parent directory */
1566 status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1);
1567 if (status < 0) {
1568 if (status != -ENOENT)
1569 mlog_errno(status);
1570 goto bail;
1571 }
1572
1573 dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
1574 if (!dirfe->i_links_count) {
1575 /* can't make a file in a deleted directory. */
1576 status = -ENOENT;
1577 goto bail;
1578 }
1579
1580 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
1581 dentry->d_name.len);
1582 if (status)
1583 goto bail;
1584
1585 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh,
1586 dentry->d_name.name,
1587 dentry->d_name.len, &de_bh);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto bail;
1591 }
1592
1593 status = ocfs2_reserve_new_inode(osb, handle, &inode_ac);
1594 if (status < 0) {
1595 if (status != -ENOSPC)
1596 mlog_errno(status);
1597 goto bail;
1598 }
1599
1600 /* don't reserve bitmap space for fast symlinks. */
1601 if (l > ocfs2_fast_symlink_chars(sb)) {
1602 status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
1603 if (status < 0) {
1604 if (status != -ENOSPC)
1605 mlog_errno(status);
1606 goto bail;
1607 }
1608 }
1609
1610 handle = ocfs2_start_trans(osb, handle, credits);
1611 if (IS_ERR(handle)) {
1612 status = PTR_ERR(handle);
1613 handle = NULL;
1614 mlog_errno(status);
1615 goto bail;
1616 }
1617
1618 status = ocfs2_mknod_locked(osb, dir, dentry,
1619 S_IFLNK | S_IRWXUGO, 0,
1620 &new_fe_bh, parent_fe_bh, handle,
1621 &inode, inode_ac);
1622 if (status < 0) {
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626
1627 fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
1628 inode->i_rdev = 0;
1629 newsize = l - 1;
1630 if (l > ocfs2_fast_symlink_chars(sb)) {
1631 inode->i_op = &ocfs2_symlink_inode_operations;
1632 status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
1633 handle, data_ac, NULL,
1634 NULL);
1635 if (status < 0) {
1636 if (status != -ENOSPC && status != -EINTR) {
1637 mlog(ML_ERROR, "Failed to extend file to "
1638 "%"MLFu64"\n",
1639 newsize);
1640 mlog_errno(status);
1641 status = -ENOSPC;
1642 }
1643 goto bail;
1644 }
1645 i_size_write(inode, newsize);
1646 inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
1647 } else {
1648 inode->i_op = &ocfs2_fast_symlink_inode_operations;
1649 memcpy((char *) fe->id2.i_symlink, symname, l);
1650 i_size_write(inode, newsize);
1651 inode->i_blocks = 0;
1652 }
1653
1654 status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh);
1655 if (status < 0) {
1656 mlog_errno(status);
1657 goto bail;
1658 }
1659
1660 if (!ocfs2_inode_is_fast_symlink(inode)) {
1661 status = ocfs2_create_symlink_data(osb, handle, inode,
1662 symname);
1663 if (status < 0) {
1664 mlog_errno(status);
1665 goto bail;
1666 }
1667 }
1668
1669 status = ocfs2_add_entry(handle, dentry, inode,
1670 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1671 de_bh);
1672 if (status < 0) {
1673 mlog_errno(status);
1674 goto bail;
1675 }
1676
1677 insert_inode_hash(inode);
1678 dentry->d_op = &ocfs2_dentry_ops;
1679 d_instantiate(dentry, inode);
1680bail:
1681 if (handle)
1682 ocfs2_commit_trans(handle);
1683 if (new_fe_bh)
1684 brelse(new_fe_bh);
1685 if (parent_fe_bh)
1686 brelse(parent_fe_bh);
1687 if (de_bh)
1688 brelse(de_bh);
1689 if (inode_ac)
1690 ocfs2_free_alloc_context(inode_ac);
1691 if (data_ac)
1692 ocfs2_free_alloc_context(data_ac);
1693 if ((status < 0) && inode)
1694 iput(inode);
1695
1696 mlog_exit(status);
1697
1698 return status;
1699}
1700
1701int ocfs2_check_dir_entry(struct inode *dir,
1702			  struct ocfs2_dir_entry *de,
1703			  struct buffer_head *bh,
1704 unsigned long offset)
1705{
1706 const char *error_msg = NULL;
1707 const int rlen = le16_to_cpu(de->rec_len);
1708
1709 if (rlen < OCFS2_DIR_REC_LEN(1))
1710 error_msg = "rec_len is smaller than minimal";
1711 else if (rlen % 4 != 0)
1712 error_msg = "rec_len % 4 != 0";
1713 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
1714 error_msg = "rec_len is too small for name_len";
1715 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
1716 error_msg = "directory entry across blocks";
1717
1718 if (error_msg != NULL)
1719 mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - "
1720 "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n",
1721 OCFS2_I(dir)->ip_blkno, error_msg, offset,
1722 le64_to_cpu(de->inode), rlen, de->name_len);
1723 return error_msg == NULL ? 1 : 0;
1724}
1725
1726/* we don't always have a dentry for what we want to add, so callers
1727 * like the orphan dir code can use this instead.
1728 *
1729 * If you pass me insert_bh, I'll skip the search of the other dir
1730 * blocks and put the record in there.
1731 */
1732static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle,
1733 struct inode *dir,
1734 const char *name, int namelen,
1735 struct inode *inode, u64 blkno,
1736 struct buffer_head *parent_fe_bh,
1737 struct buffer_head *insert_bh)
1738{
1739 unsigned long offset;
1740 unsigned short rec_len;
1741 struct ocfs2_dir_entry *de, *de1;
1742 struct super_block *sb;
1743 int retval, status;
1744
1745 mlog_entry_void();
1746
1747 sb = dir->i_sb;
1748
1749 if (!namelen)
1750 return -EINVAL;
1751
1752 rec_len = OCFS2_DIR_REC_LEN(namelen);
1753 offset = 0;
1754 de = (struct ocfs2_dir_entry *) insert_bh->b_data;
1755 while (1) {
1756 BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
1757 /* These checks should've already been passed by the
1758 * prepare function, but I guess we can leave them
1759 * here anyway. */
1760 if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1761 retval = -ENOENT;
1762 goto bail;
1763 }
1764 if (ocfs2_match(namelen, name, de)) {
1765 retval = -EEXIST;
1766 goto bail;
1767 }
1768 if (((le64_to_cpu(de->inode) == 0) &&
1769 (le16_to_cpu(de->rec_len) >= rec_len)) ||
1770 (le16_to_cpu(de->rec_len) >=
1771 (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
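			/* Two ways the new entry fits here: this slot is
			 * free (inode == 0) and its rec_len alone is big
			 * enough, or it is live but has enough slack after
			 * its own record to carve out rec_len more bytes. */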
1772 status = ocfs2_journal_access(handle, dir, insert_bh,
1773 OCFS2_JOURNAL_ACCESS_WRITE);
1774 /* By now the buffer is marked for journaling */
1775 offset += le16_to_cpu(de->rec_len);
1776 if (le64_to_cpu(de->inode)) {
1777 de1 = (struct ocfs2_dir_entry *)((char *) de +
1778 OCFS2_DIR_REC_LEN(de->name_len));
1779 de1->rec_len =
1780 cpu_to_le16(le16_to_cpu(de->rec_len) -
1781 OCFS2_DIR_REC_LEN(de->name_len));
1782 de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1783 de = de1;
1784 }
1785 de->file_type = OCFS2_FT_UNKNOWN;
1786 if (blkno) {
1787 de->inode = cpu_to_le64(blkno);
1788 ocfs2_set_de_type(de, inode->i_mode);
1789 } else
1790 de->inode = 0;
1791 de->name_len = namelen;
1792 memcpy(de->name, name, namelen);
1793
1794 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1795 dir->i_version++;
1796 status = ocfs2_journal_dirty(handle, insert_bh);
1797 retval = 0;
1798 goto bail;
1799 }
1800 offset += le16_to_cpu(de->rec_len);
1801 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1802 }
1803
1804 /* when you think about it, the assert above should prevent us
1805 * from ever getting here. */
1806 retval = -ENOSPC;
1807bail:
1808
1809 mlog_exit(retval);
1810 return retval;
1811}
1812
1813
1814/*
1815 * ocfs2_delete_entry deletes a directory entry by merging it with the
1816 * previous entry
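 * (illustrative: if pde->rec_len is 16 and the victim's rec_len is 24,
 * pde->rec_len becomes 40 and the victim's bytes are absorbed as
 * padding; a victim at the very start of a block just gets inode = 0)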
1817 */
1818static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle,
1819 struct inode *dir,
1820 struct ocfs2_dir_entry *de_del,
1821 struct buffer_head *bh)
1822{
1823 struct ocfs2_dir_entry *de, *pde;
1824 int i, status = -ENOENT;
1825
1826 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1827
1828 i = 0;
1829 pde = NULL;
1830 de = (struct ocfs2_dir_entry *) bh->b_data;
1831 while (i < bh->b_size) {
1832 if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1833 status = -EIO;
1834 mlog_errno(status);
1835 goto bail;
1836 }
1837 if (de == de_del) {
1838 status = ocfs2_journal_access(handle, dir, bh,
1839 OCFS2_JOURNAL_ACCESS_WRITE);
1840 if (status < 0) {
1841 status = -EIO;
1842 mlog_errno(status);
1843 goto bail;
1844 }
1845 if (pde)
1846 pde->rec_len =
1847 cpu_to_le16(le16_to_cpu(pde->rec_len) +
1848 le16_to_cpu(de->rec_len));
1849 else
1850 de->inode = 0;
1851 dir->i_version++;
1852 status = ocfs2_journal_dirty(handle, bh);
1853 goto bail;
1854 }
1855 i += le16_to_cpu(de->rec_len);
1856 pde = de;
1857 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1858 }
1859bail:
1860 mlog_exit(status);
1861 return status;
1862}
1863
1864/*
1865 * Returns 0 if not found, -1 on failure, and 1 on success
1866 */
1867static inline int ocfs2_search_dirblock(struct buffer_head *bh,
1868 struct inode *dir,
1869 const char *name, int namelen,
1870 unsigned long offset,
1871 struct ocfs2_dir_entry **res_dir)
1872{
1873 struct ocfs2_dir_entry *de;
1874 char *dlimit, *de_buf;
1875 int de_len;
1876 int ret = 0;
1877
1878 mlog_entry_void();
1879
1880 de_buf = bh->b_data;
1881 dlimit = de_buf + dir->i_sb->s_blocksize;
1882
1883 while (de_buf < dlimit) {
1884 /* this code is executed quadratically often */
1885 /* do minimal checking `by hand' */
1886
1887 de = (struct ocfs2_dir_entry *) de_buf;
1888
1889 if (de_buf + namelen <= dlimit &&
1890 ocfs2_match(namelen, name, de)) {
1891 /* found a match - just to be sure, do a full check */
1892 if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
1893 ret = -1;
1894 goto bail;
1895 }
1896 *res_dir = de;
1897 ret = 1;
1898 goto bail;
1899 }
1900
1901 /* prevent looping on a bad block */
1902 de_len = le16_to_cpu(de->rec_len);
1903 if (de_len <= 0) {
1904 ret = -1;
1905 goto bail;
1906 }
1907
1908 de_buf += de_len;
1909 offset += de_len;
1910 }
1911
1912bail:
1913 mlog_exit(ret);
1914 return ret;
1915}
1916
1917struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
1918 struct inode *dir,
1919 struct ocfs2_dir_entry **res_dir)
1920{
1921 struct super_block *sb;
1922 struct buffer_head *bh_use[NAMEI_RA_SIZE];
1923 struct buffer_head *bh, *ret = NULL;
1924 unsigned long start, block, b;
1925 int ra_max = 0; /* Number of bh's in the readahead
1926 buffer, bh_use[] */
1927 int ra_ptr = 0; /* Current index into readahead
1928 buffer */
1929 int num = 0;
1930 int nblocks, i, err;
1931
1932 mlog_entry_void();
1933
1934 *res_dir = NULL;
1935 sb = dir->i_sb;
1936
1937 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
1938 start = OCFS2_I(dir)->ip_dir_start_lookup;
1939 if (start >= nblocks)
1940 start = 0;
1941 block = start;
1942
1943restart:
1944 do {
1945 /*
1946 * We deal with the read-ahead logic here.
1947 */
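		/* bh_use[] is a window of up to NAMEI_RA_SIZE buffers:
		 * reads for a batch of blocks are issued at once, then
		 * consumed one at a time through ra_ptr, refilling when
		 * the window runs dry. */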
1948 if (ra_ptr >= ra_max) {
1949 /* Refill the readahead buffer */
1950 ra_ptr = 0;
1951 b = block;
1952 for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
1953 /*
1954 * Terminate if we reach the end of the
1955 * directory and must wrap, or if our
1956 * search has finished at this block.
1957 */
1958 if (b >= nblocks || (num && block == start)) {
1959 bh_use[ra_max] = NULL;
1960 break;
1961 }
1962 num++;
1963
1964 /* XXX: questionable readahead stuff here */
1965 bh = ocfs2_bread(dir, b++, &err, 1);
1966 bh_use[ra_max] = bh;
1967#if 0 // ???
1968 if (bh)
1969 ll_rw_block(READ, 1, &bh);
1970#endif
1971 }
1972 }
1973 if ((bh = bh_use[ra_ptr++]) == NULL)
1974 goto next;
1975 wait_on_buffer(bh);
1976 if (!buffer_uptodate(bh)) {
1977 /* read error, skip block & hope for the best */
1978 brelse(bh);
1979 goto next;
1980 }
1981 i = ocfs2_search_dirblock(bh, dir, name, namelen,
1982 block << sb->s_blocksize_bits,
1983 res_dir);
1984 if (i == 1) {
1985 OCFS2_I(dir)->ip_dir_start_lookup = block;
1986 ret = bh;
1987 goto cleanup_and_exit;
1988 } else {
1989 brelse(bh);
1990 if (i < 0)
1991 goto cleanup_and_exit;
1992 }
1993 next:
1994 if (++block >= nblocks)
1995 block = 0;
1996 } while (block != start);
1997
1998 /*
1999 * If the directory has grown while we were searching, then
2000 * search the last part of the directory before giving up.
2001 */
2002 block = nblocks;
2003 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
2004 if (block < nblocks) {
2005 start = 0;
2006 goto restart;
2007 }
2008
2009cleanup_and_exit:
2010 /* Clean up the read-ahead blocks */
2011 for (; ra_ptr < ra_max; ra_ptr++)
2012 brelse(bh_use[ra_ptr]);
2013
2014 mlog_exit_ptr(ret);
2015 return ret;
2016}
2017
2018static int ocfs2_blkno_stringify(u64 blkno, char *name)
2019{
2020 int status, namelen;
2021
2022 mlog_entry_void();
2023
2024 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64,
2025 blkno);
2026 if (namelen <= 0) {
2027 if (namelen)
2028 status = namelen;
2029 else
2030 status = -EINVAL;
2031 mlog_errno(status);
2032 goto bail;
2033 }
2034 if (namelen != OCFS2_ORPHAN_NAMELEN) {
2035 status = -EINVAL;
2036 mlog_errno(status);
2037 goto bail;
2038 }
2039
2040 mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
2041 namelen);
2042
2043 status = 0;
2044bail:
2045 mlog_exit(status);
2046 return status;
2047}
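/* For illustration (not part of the original source): with the
 * "%016"MLFx64 format above, a blkno of 0x1f2e3 stringifies to
 * "000000000001f2e3" -- always exactly 16 hex characters, which is why
 * namelen is checked against OCFS2_ORPHAN_NAMELEN. */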
2048
2049static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2050 struct ocfs2_journal_handle *handle,
2051 struct inode *inode,
2052 char *name,
2053 struct buffer_head **de_bh)
2054{
2055 struct inode *orphan_dir_inode = NULL;
2056 struct buffer_head *orphan_dir_bh = NULL;
2057 int status = 0;
2058
2059 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2060 if (status < 0) {
2061 mlog_errno(status);
2062 goto leave;
2063 }
2064
2065 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2066 ORPHAN_DIR_SYSTEM_INODE,
2067 osb->slot_num);
2068 if (!orphan_dir_inode) {
2069 status = -ENOENT;
2070 mlog_errno(status);
2071 goto leave;
2072 }
2073
2074 ocfs2_handle_add_inode(handle, orphan_dir_inode);
2075 status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1);
2076 if (status < 0) {
2077 mlog_errno(status);
2078 goto leave;
2079 }
2080
2081 status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2082 orphan_dir_bh, name,
2083 OCFS2_ORPHAN_NAMELEN, de_bh);
2084 if (status < 0) {
2085 mlog_errno(status);
2086 goto leave;
2087 }
2088
2089leave:
2090 if (orphan_dir_inode)
2091 iput(orphan_dir_inode);
2092
2093 if (orphan_dir_bh)
2094 brelse(orphan_dir_bh);
2095
2096 mlog_exit(status);
2097 return status;
2098}
2099
2100static int ocfs2_orphan_add(struct ocfs2_super *osb,
2101 struct ocfs2_journal_handle *handle,
2102 struct inode *inode,
2103 struct ocfs2_dinode *fe,
2104 char *name,
2105 struct buffer_head *de_bh)
2106{
2107 struct inode *orphan_dir_inode = NULL;
2108 struct buffer_head *orphan_dir_bh = NULL;
2109 int status = 0;
2110 struct ocfs2_dinode *orphan_fe;
2111
2112 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
2113
2114 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2115 ORPHAN_DIR_SYSTEM_INODE,
2116 osb->slot_num);
2117 if (!orphan_dir_inode) {
2118 status = -ENOENT;
2119 mlog_errno(status);
2120 goto leave;
2121 }
2122
2123 status = ocfs2_read_block(osb,
2124 OCFS2_I(orphan_dir_inode)->ip_blkno,
2125 &orphan_dir_bh, OCFS2_BH_CACHED,
2126 orphan_dir_inode);
2127 if (status < 0) {
2128 mlog_errno(status);
2129 goto leave;
2130 }
2131
2132 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
2133 OCFS2_JOURNAL_ACCESS_WRITE);
2134 if (status < 0) {
2135 mlog_errno(status);
2136 goto leave;
2137 }
2138
2139	/* we're a cluster filesystem, and i_nlink can change on disk
2140	 * underneath us... */
2141 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2142 if (S_ISDIR(inode->i_mode))
2143 le16_add_cpu(&orphan_fe->i_links_count, 1);
2144 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2145
2146 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2147 if (status < 0) {
2148 mlog_errno(status);
2149 goto leave;
2150 }
2151
2152 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2153 OCFS2_ORPHAN_NAMELEN, inode,
2154 OCFS2_I(inode)->ip_blkno,
2155 orphan_dir_bh, de_bh);
2156 if (status < 0) {
2157 mlog_errno(status);
2158 goto leave;
2159 }
2160
2161 le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
2162
2163 /* Record which orphan dir our inode now resides
2164 * in. delete_inode will use this to determine which orphan
2165 * dir to lock. */
2166 spin_lock(&OCFS2_I(inode)->ip_lock);
2167 OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
2168 spin_unlock(&OCFS2_I(inode)->ip_lock);
2169
2170 mlog(0, "Inode %"MLFu64" orphaned in slot %d\n",
2171 OCFS2_I(inode)->ip_blkno, osb->slot_num);
2172
2173leave:
2174 if (orphan_dir_inode)
2175 iput(orphan_dir_inode);
2176
2177 if (orphan_dir_bh)
2178 brelse(orphan_dir_bh);
2179
2180 mlog_exit(status);
2181 return status;
2182}
2183
2184/* unlike orphan_add, we expect the orphan dir to already be locked here. */
2185int ocfs2_orphan_del(struct ocfs2_super *osb,
2186 struct ocfs2_journal_handle *handle,
2187 struct inode *orphan_dir_inode,
2188 struct inode *inode,
2189 struct buffer_head *orphan_dir_bh)
2190{
2191 char name[OCFS2_ORPHAN_NAMELEN + 1];
2192 struct ocfs2_dinode *orphan_fe;
2193 int status = 0;
2194 struct buffer_head *target_de_bh = NULL;
2195 struct ocfs2_dir_entry *target_de = NULL;
2196
2197 mlog_entry_void();
2198
2199 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2200 if (status < 0) {
2201 mlog_errno(status);
2202 goto leave;
2203 }
2204
2205 mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n",
2206 name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN);
2207
2208	/* find its spot in the orphan directory */
2209 target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN,
2210 orphan_dir_inode, &target_de);
2211 if (!target_de_bh) {
2212 status = -ENOENT;
2213 mlog_errno(status);
2214 goto leave;
2215 }
2216
2217 /* remove it from the orphan directory */
2218 status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de,
2219 target_de_bh);
2220 if (status < 0) {
2221 mlog_errno(status);
2222 goto leave;
2223 }
2224
2225	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
2226 OCFS2_JOURNAL_ACCESS_WRITE);
2227 if (status < 0) {
2228 mlog_errno(status);
2229 goto leave;
2230 }
2231
2232 /* do the i_nlink dance! :) */
2233 orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
2234 if (S_ISDIR(inode->i_mode))
2235 le16_add_cpu(&orphan_fe->i_links_count, -1);
2236 orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count);
2237
2238 status = ocfs2_journal_dirty(handle, orphan_dir_bh);
2239 if (status < 0) {
2240 mlog_errno(status);
2241 goto leave;
2242 }
2243
2244leave:
2245 if (target_de_bh)
2246 brelse(target_de_bh);
2247
2248 mlog_exit(status);
2249 return status;
2250}
2251
2252struct inode_operations ocfs2_dir_iops = {
2253 .create = ocfs2_create,
2254 .lookup = ocfs2_lookup,
2255 .link = ocfs2_link,
2256 .unlink = ocfs2_unlink,
2257 .rmdir = ocfs2_unlink,
2258 .symlink = ocfs2_symlink,
2259 .mkdir = ocfs2_mkdir,
2260 .mknod = ocfs2_mknod,
2261 .rename = ocfs2_rename,
2262 .setattr = ocfs2_setattr,
2263 .getattr = ocfs2_getattr,
2264};
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
new file mode 100644
index 000000000000..deaaa97dbf0b
--- /dev/null
+++ b/fs/ocfs2/namei.h
@@ -0,0 +1,58 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * namei.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_NAMEI_H
27#define OCFS2_NAMEI_H
28
29extern struct inode_operations ocfs2_dir_iops;
30
31struct dentry *ocfs2_get_parent(struct dentry *child);
32
33int ocfs2_check_dir_entry(struct inode *dir,
34 struct ocfs2_dir_entry *de,
35 struct buffer_head *bh,
36 unsigned long offset);
37struct buffer_head *ocfs2_find_entry(const char *name,
38 int namelen,
39 struct inode *dir,
40 struct ocfs2_dir_entry **res_dir);
41int ocfs2_orphan_del(struct ocfs2_super *osb,
42 struct ocfs2_journal_handle *handle,
43 struct inode *orphan_dir_inode,
44 struct inode *inode,
45 struct buffer_head *orphan_dir_bh);
46
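/* Name comparison helper: an entry with inode == 0 is a free slot and
 * never matches; both the length and every byte of the name must be
 * identical. */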
47static inline int ocfs2_match(int len,
48 const char * const name,
49 struct ocfs2_dir_entry *de)
50{
51 if (len != de->name_len)
52 return 0;
53 if (!de->inode)
54 return 0;
55 return !memcmp(name, de->name, len);
56}
57
58#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h
new file mode 100644
index 000000000000..0b499bccec5a
--- /dev/null
+++ b/fs/ocfs2/ocfs1_fs_compat.h
@@ -0,0 +1,109 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs1_fs_compat.h
5 *
6 * OCFS1 volume header definitions. OCFS2 creates valid but unmountable
7 * OCFS1 volume headers on the first two sectors of an OCFS2 volume.
8 * This allows an OCFS1 driver to see the partition and cleanly fail to
9 * mount it.
10 *
11 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public
15 * License, version 2, as published by the Free Software Foundation.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public
23 * License along with this program; if not, write to the
24 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 * Boston, MA 021110-1307, USA.
26 */
27
28#ifndef _OCFS1_FS_COMPAT_H
29#define _OCFS1_FS_COMPAT_H
30
31#define OCFS1_MAX_VOL_SIGNATURE_LEN 128
32#define OCFS1_MAX_MOUNT_POINT_LEN 128
33#define OCFS1_MAX_VOL_ID_LENGTH 16
34#define OCFS1_MAX_VOL_LABEL_LEN 64
35#define OCFS1_MAX_CLUSTER_NAME_LEN 64
36
37#define OCFS1_MAJOR_VERSION (2)
38#define OCFS1_MINOR_VERSION (0)
39#define OCFS1_VOLUME_SIGNATURE "OracleCFS"
40
41/*
42 * OCFS1 superblock. Lives at sector 0.
43 */
44struct ocfs1_vol_disk_hdr
45{
46/*00*/ __u32 minor_version;
47 __u32 major_version;
48/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
49/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
50/*108*/ __u64 serial_num;
51/*110*/ __u64 device_size;
52 __u64 start_off;
53/*120*/ __u64 bitmap_off;
54 __u64 publ_off;
55/*130*/ __u64 vote_off;
56 __u64 root_bitmap_off;
57/*140*/ __u64 data_start_off;
58 __u64 root_bitmap_size;
59/*150*/ __u64 root_off;
60 __u64 root_size;
61/*160*/ __u64 cluster_size;
62 __u64 num_nodes;
63/*170*/ __u64 num_clusters;
64 __u64 dir_node_size;
65/*180*/ __u64 file_node_size;
66 __u64 internal_off;
67/*190*/ __u64 node_cfg_off;
68 __u64 node_cfg_size;
69/*1A0*/ __u64 new_cfg_off;
70 __u32 prot_bits;
71 __s32 excl_mount;
72/*1B0*/
73};
74
75
76struct ocfs1_disk_lock
77{
78/*00*/ __u32 curr_master;
79 __u8 file_lock;
80	__u8 compat_pad[3];	/* Not in original definition. Used to
81 make the already existing alignment
82 explicit */
83 __u64 last_write_time;
84/*10*/ __u64 last_read_time;
85 __u32 writer_node_num;
86 __u32 reader_node_num;
87/*20*/ __u64 oin_node_map;
88 __u64 dlock_seq_num;
89/*30*/
90};
91
92/*
93 * OCFS1 volume label. Lives at sector 1.
94 */
95struct ocfs1_vol_label
96{
97/*00*/ struct ocfs1_disk_lock disk_lock;
98/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN];
99/*70*/ __u16 label_len;
100/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
101/*82*/ __u16 vol_id_len;
102/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
103/*A4*/ __u16 cluster_name_len;
104/*A6*/
105};
106
107
108#endif /* _OCFS1_FS_COMPAT_H */
109
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
new file mode 100644
index 000000000000..f468c600cf92
--- /dev/null
+++ b/fs/ocfs2/ocfs2.h
@@ -0,0 +1,464 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2.h
5 *
6 * Defines macros and structures used in OCFS2
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef OCFS2_H
27#define OCFS2_H
28
29#include <linux/spinlock.h>
30#include <linux/sched.h>
31#include <linux/wait.h>
32#include <linux/list.h>
33#include <linux/rbtree.h>
34#include <linux/workqueue.h>
35#include <linux/kref.h>
36
37#include "cluster/nodemanager.h"
38#include "cluster/heartbeat.h"
39#include "cluster/tcp.h"
40
41#include "dlm/dlmapi.h"
42
43#include "ocfs2_fs.h"
44#include "endian.h"
45#include "ocfs2_lockid.h"
46
47struct ocfs2_extent_map {
48 u32 em_clusters;
49 struct rb_root em_extents;
50};
51
52/* Most user-visible OCFS2 inodes will have very few pieces of
53 * metadata, but larger files (including bitmaps, etc.) must be taken
54 * into account when designing an access scheme. We allow a small
55 * number of cached blocks to be stored in an array and grow the
56 * structure into an rb tree when necessary. */
57#define OCFS2_INODE_MAX_CACHE_ARRAY 2
58
59struct ocfs2_caching_info {
60 unsigned int ci_num_cached;
61 union {
62 sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
63 struct rb_root ci_tree;
64 } ci_cache;
65};
66
67/* this limits us to 256 nodes
68 * if we need more, we can do a kmalloc for the map */
69#define OCFS2_NODE_MAP_MAX_NODES 256
70struct ocfs2_node_map {
71 u16 num_nodes;
72 unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
73};
74
75enum ocfs2_ast_action {
76 OCFS2_AST_INVALID = 0,
77 OCFS2_AST_ATTACH,
78 OCFS2_AST_CONVERT,
79 OCFS2_AST_DOWNCONVERT,
80};
81
82/* actions for an unlockast function to take. */
83enum ocfs2_unlock_action {
84 OCFS2_UNLOCK_INVALID = 0,
85 OCFS2_UNLOCK_CANCEL_CONVERT,
86 OCFS2_UNLOCK_DROP_LOCK,
87};
88
89/* ocfs2_lock_res->l_flags flags. */
90#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
91 * the lvb */
92#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
93 * dlm_lock */
94#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
95 * downconvert*/
96#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
97#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
98#define OCFS2_LOCK_REFRESHING (0x00000020)
99#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization
100 * for shutdown paths */
101#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track
102 * when to skip queueing
103 * a lock because it's
104 * about to be
105 * dropped. */
106#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
107
108struct ocfs2_lock_res_ops;
109
110typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
111
112struct ocfs2_lock_res {
113 void *l_priv;
114 struct ocfs2_lock_res_ops *l_ops;
115 spinlock_t l_lock;
116
117 struct list_head l_blocked_list;
118 struct list_head l_mask_waiters;
119
120 enum ocfs2_lock_type l_type;
121 unsigned long l_flags;
122 char l_name[OCFS2_LOCK_ID_MAX_LEN];
123 int l_level;
124 unsigned int l_ro_holders;
125 unsigned int l_ex_holders;
126 struct dlm_lockstatus l_lksb;
127
128 /* used from AST/BAST funcs. */
129 enum ocfs2_ast_action l_action;
130 enum ocfs2_unlock_action l_unlock_action;
131 int l_requested;
132 int l_blocking;
133
134 wait_queue_head_t l_event;
135
136 struct list_head l_debug_list;
137};
138
139struct ocfs2_dlm_debug {
140 struct kref d_refcnt;
141 struct dentry *d_locking_state;
142 struct list_head d_lockres_tracking;
143};
144
145enum ocfs2_vol_state
146{
147 VOLUME_INIT = 0,
148 VOLUME_MOUNTED,
149 VOLUME_DISMOUNTED,
150 VOLUME_DISABLED
151};
152
153struct ocfs2_alloc_stats
154{
155 atomic_t moves;
156 atomic_t local_data;
157 atomic_t bitmap_data;
158 atomic_t bg_allocs;
159 atomic_t bg_extends;
160};
161
162enum ocfs2_local_alloc_state
163{
164 OCFS2_LA_UNUSED = 0,
165 OCFS2_LA_ENABLED,
166 OCFS2_LA_DISABLED
167};
168
169enum ocfs2_mount_options
170{
171 OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */
172 OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */
173 OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */
174 OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
175 OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
176#ifdef OCFS2_ORACORE_WORKAROUNDS
177 OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
178#endif
179};
180
181#define OCFS2_OSB_SOFT_RO 0x0001
182#define OCFS2_OSB_HARD_RO 0x0002
183#define OCFS2_OSB_ERROR_FS 0x0004
184
185struct ocfs2_journal;
186struct ocfs2_journal_handle;
187struct ocfs2_super
188{
189 u32 osb_id; /* id used by the proc interface */
190 struct task_struct *commit_task;
191 struct super_block *sb;
192 struct inode *root_inode;
193 struct inode *sys_root_inode;
194 struct inode *system_inodes[NUM_SYSTEM_INODES];
195
196 struct ocfs2_slot_info *slot_info;
197
198 spinlock_t node_map_lock;
199 struct ocfs2_node_map mounted_map;
200 struct ocfs2_node_map recovery_map;
201 struct ocfs2_node_map umount_map;
202
203 u32 num_clusters;
204 u64 root_blkno;
205 u64 system_dir_blkno;
206 u64 bitmap_blkno;
207 u32 bitmap_cpg;
208 u8 *uuid;
209 char *uuid_str;
210 u8 *vol_label;
211 u64 first_cluster_group_blkno;
212 u32 fs_generation;
213
214 u32 s_feature_compat;
215 u32 s_feature_incompat;
216 u32 s_feature_ro_compat;
217
218	/* Protects s_next_generation and osb_flags. Could protect more of
219	 * osb as it's very short lived. */
220 spinlock_t osb_lock;
221 u32 s_next_generation;
222 unsigned long osb_flags;
223
224 unsigned long s_mount_opt;
225
226 u16 max_slots;
227 u16 num_nodes;
228 s16 node_num;
229 s16 slot_num;
230 int s_sectsize_bits;
231 int s_clustersize;
232 int s_clustersize_bits;
233 struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
234
235 atomic_t vol_state;
236 struct semaphore recovery_lock;
237 struct task_struct *recovery_thread_task;
238 int disable_recovery;
239 wait_queue_head_t checkpoint_event;
240 atomic_t needs_checkpoint;
241 struct ocfs2_journal *journal;
242
243 enum ocfs2_local_alloc_state local_alloc_state;
244 struct buffer_head *local_alloc_bh;
245
246 /* Next two fields are for local node slot recovery during
247 * mount. */
248 int dirty;
249 struct ocfs2_dinode *local_alloc_copy;
250
251 struct ocfs2_alloc_stats alloc_stats;
252 char dev_str[20]; /* "major,minor" of the device */
253
254 struct dlm_ctxt *dlm;
255 struct ocfs2_lock_res osb_super_lockres;
256 struct ocfs2_lock_res osb_rename_lockres;
257 struct dlm_eviction_cb osb_eviction_cb;
258 struct ocfs2_dlm_debug *osb_dlm_debug;
259
260 struct dentry *osb_debug_root;
261
262 wait_queue_head_t recovery_event;
263
264 spinlock_t vote_task_lock;
265 struct task_struct *vote_task;
266 wait_queue_head_t vote_event;
267 unsigned long vote_wake_sequence;
268 unsigned long vote_work_sequence;
269
270 struct list_head blocked_lock_list;
271 unsigned long blocked_lock_count;
272
273 struct list_head vote_list;
274 int vote_count;
275
276 u32 net_key;
277 spinlock_t net_response_lock;
278 unsigned int net_response_ids;
279 struct list_head net_response_list;
280
281 struct o2hb_callback_func osb_hb_up;
282 struct o2hb_callback_func osb_hb_down;
283
284 struct list_head osb_net_handlers;
285
286 wait_queue_head_t osb_mount_event;
287
288 /* Truncate log info */
289 struct inode *osb_tl_inode;
290 struct buffer_head *osb_tl_bh;
291 struct work_struct osb_truncate_log_wq;
292};
293
294#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
295#define OCFS2_MAX_OSB_ID 65536
296
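/* Only regular file data is a candidate for ordered journaling; the
 * data=writeback mount option (OCFS2_MOUNT_DATA_WRITEBACK) switches
 * that ordering off. */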
297static inline int ocfs2_should_order_data(struct inode *inode)
298{
299 if (!S_ISREG(inode->i_mode))
300 return 0;
301 if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
302 return 0;
303 return 1;
304}
305
306/* Set / clear functions because cluster events can make these happen
307 * in parallel, so we want the transitions to be atomic. This also
308 * means that any future osb_flags flags must be protected by the
309 * spinlock too! */
310static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
311 unsigned long flag)
312{
313 spin_lock(&osb->osb_lock);
314 osb->osb_flags |= flag;
315 spin_unlock(&osb->osb_lock);
316}
317
318static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
319 int hard)
320{
321 spin_lock(&osb->osb_lock);
322 osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
323 if (hard)
324 osb->osb_flags |= OCFS2_OSB_HARD_RO;
325 else
326 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
327 spin_unlock(&osb->osb_lock);
328}
329
330static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
331{
332 int ret;
333
334 spin_lock(&osb->osb_lock);
335 ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
336 spin_unlock(&osb->osb_lock);
337
338 return ret;
339}
340
341static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
342{
343 int ret;
344
345 spin_lock(&osb->osb_lock);
346 ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
347 spin_unlock(&osb->osb_lock);
348
349 return ret;
350}
351
352#define OCFS2_IS_VALID_DINODE(ptr) \
353 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
354
355#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
356 typeof(__di) ____di = (__di); \
357 ocfs2_error((__sb), \
358 "Dinode # %"MLFu64" has bad signature %.*s", \
359 (____di)->i_blkno, 7, \
360 (____di)->i_signature); \
361} while (0)
362
363#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
364 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
365
366#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
367 typeof(__eb) ____eb = (__eb); \
368 ocfs2_error((__sb), \
369 "Extent Block # %"MLFu64" has bad signature %.*s", \
370 (____eb)->h_blkno, 7, \
371 (____eb)->h_signature); \
372} while (0)
373
374#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
375 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
376
377#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
378 typeof(__gd) ____gd = (__gd); \
379 ocfs2_error((__sb), \
380 "Group Descriptor # %"MLFu64" has bad signature %.*s", \
381 (____gd)->bg_blkno, 7, \
382 (____gd)->bg_signature); \
383} while (0)
384
385static inline unsigned long ino_from_blkno(struct super_block *sb,
386 u64 blkno)
387{
388 return (unsigned long)(blkno & (u64)ULONG_MAX);
389}
390
391static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
392 u32 clusters)
393{
394 int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
395 sb->s_blocksize_bits;
396
397 return (u64)clusters << c_to_b_bits;
398}
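/* A sketch with assumed geometry: for 4K blocks (s_blocksize_bits ==
 * 12) and 64K clusters (s_clustersize_bits == 16), c_to_b_bits is 4,
 * so 3 clusters map to 3 << 4 == 48 blocks. */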
399
400static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
401 u64 blocks)
402{
403 int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
404 sb->s_blocksize_bits;
405
406 return (u32)(blocks >> b_to_c_bits);
407}
408
409static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
410 u64 bytes)
411{
412 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
413 unsigned int clusters;
414
415 bytes += OCFS2_SB(sb)->s_clustersize - 1;
416 /* OCFS2 just cannot have enough clusters to overflow this */
417 clusters = (unsigned int)(bytes >> cl_bits);
418
419 return clusters;
420}
421
422static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
423 u64 bytes)
424{
425 bytes += sb->s_blocksize - 1;
426 return bytes >> sb->s_blocksize_bits;
427}
428
429static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
430 u32 clusters)
431{
432 return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
433}
434
435static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
436 u64 bytes)
437{
438 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
439 unsigned int clusters;
440
441 clusters = ocfs2_clusters_for_bytes(sb, bytes);
442 return (u64)clusters << cl_bits;
443}
444
445static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
446 u64 bytes)
447{
448 u64 blocks;
449
450 blocks = ocfs2_blocks_for_bytes(sb, bytes);
451 return blocks << sb->s_blocksize_bits;
452}
453
454static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
455{
456 return (unsigned long)((bytes + 511) >> 9);
457}
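/* e.g. 1000 bytes round up to (1000 + 511) >> 9 == 2 sectors of 512
 * bytes each. */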
458
459#define ocfs2_set_bit ext2_set_bit
460#define ocfs2_clear_bit ext2_clear_bit
461#define ocfs2_test_bit ext2_test_bit
462#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
463#endif /* OCFS2_H */
464
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
new file mode 100644
index 000000000000..dfb8a5bedfc8
--- /dev/null
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -0,0 +1,638 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_fs.h
5 *
6 * On-disk structures for OCFS2.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public
20 * License along with this program; if not, write to the
21 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 * Boston, MA 021110-1307, USA.
23 */
24
25#ifndef _OCFS2_FS_H
26#define _OCFS2_FS_H
27
28/* Version */
29#define OCFS2_MAJOR_REV_LEVEL 0
30#define OCFS2_MINOR_REV_LEVEL 90
31
32/*
33 * An OCFS2 volume starts this way:
34 * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
35 * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
36 * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
37 *
38 * All other structures are found from the superblock information.
39 *
40 * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. E.g., for a
41 * blocksize of 2K, it is 4096 bytes into the disk.
42 */
43#define OCFS2_SUPER_BLOCK_BLKNO 2
44
45/*
46 * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
47 * grow if needed.
48 */
49#define OCFS2_MIN_CLUSTERSIZE 4096
50#define OCFS2_MAX_CLUSTERSIZE 1048576
51
52/*
53 * Blocks cannot be bigger than clusters, so the maximum blocksize is the
54 * minimum cluster size.
55 */
56#define OCFS2_MIN_BLOCKSIZE 512
57#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
58
59/* Filesystem magic number */
60#define OCFS2_SUPER_MAGIC 0x7461636f
61
62/* Object signatures */
63#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
64#define OCFS2_INODE_SIGNATURE "INODE01"
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67
68/* Compatibility flags */
69#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
70 ( OCFS2_SB(sb)->s_feature_compat & (mask) )
71#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \
72 ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
73#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \
74 ( OCFS2_SB(sb)->s_feature_incompat & (mask) )
75#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \
76 OCFS2_SB(sb)->s_feature_compat |= (mask)
77#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \
78 OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
79#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \
80 OCFS2_SB(sb)->s_feature_incompat |= (mask)
81#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \
82 OCFS2_SB(sb)->s_feature_compat &= ~(mask)
83#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
84 OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
85#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
86 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
87
88#define OCFS2_FEATURE_COMPAT_SUPP 0
89#define OCFS2_FEATURE_INCOMPAT_SUPP 0
90#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
91
92/*
93 * Heartbeat-only devices are missing journals and other files. The
94 * filesystem driver can't load them, but the library can. Never put
95 * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
96 */
97#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002
98
99
100/*
101 * Flags on ocfs2_dinode.i_flags
102 */
103#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */
104#define OCFS2_UNUSED2_FL (0x00000002)
105#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */
106#define OCFS2_UNUSED3_FL (0x00000008)
107/* System inode flags */
108#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */
109#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */
110#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */
111#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */
112#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */
113#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
114#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
115#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
116
117/*
118 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
119 */
120#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
121
122/*
123 * superblock s_state flags
124 */
125#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */
126
127/* Limit of space in ocfs2_dir_entry */
128#define OCFS2_MAX_FILENAME_LEN 255
129
130/* Maximum slots on an ocfs2 file system */
131#define OCFS2_MAX_SLOTS 255
132
133/* Slot map indicator for an empty slot */
134#define OCFS2_INVALID_SLOT -1
135
136#define OCFS2_VOL_UUID_LEN 16
137#define OCFS2_MAX_VOL_LABEL_LEN 64
138
139/* Journal limits (in bytes) */
140#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
141#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024)
142
143struct ocfs2_system_inode_info {
144 char *si_name;
145 int si_iflags;
146 int si_mode;
147};
148
149/* System file index */
150enum {
151 BAD_BLOCK_SYSTEM_INODE = 0,
152 GLOBAL_INODE_ALLOC_SYSTEM_INODE,
153 SLOT_MAP_SYSTEM_INODE,
154#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
155 HEARTBEAT_SYSTEM_INODE,
156 GLOBAL_BITMAP_SYSTEM_INODE,
157#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
158 ORPHAN_DIR_SYSTEM_INODE,
159 EXTENT_ALLOC_SYSTEM_INODE,
160 INODE_ALLOC_SYSTEM_INODE,
161 JOURNAL_SYSTEM_INODE,
162 LOCAL_ALLOC_SYSTEM_INODE,
163 TRUNCATE_LOG_SYSTEM_INODE,
164 NUM_SYSTEM_INODES
165};
166
167static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
168 /* Global system inodes (single copy) */
169	/* The first two are only used from userspace mkfs/tunefs */
170 [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 },
171 [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
172
173 /* These are used by the running filesystem */
174 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
175 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
176 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
177
178 /* Slot-specific system inodes (one copy per slot) */
179 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
180 [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
181 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
182 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
183 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
184 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
185};
186
187/* Parameter passed from mount.ocfs2 to module */
188#define OCFS2_HB_NONE "heartbeat=none"
189#define OCFS2_HB_LOCAL "heartbeat=local"
190
191/*
192 * OCFS2 directory file types. Only the low 3 bits are used. The
193 * other bits are reserved for now.
194 */
195#define OCFS2_FT_UNKNOWN 0
196#define OCFS2_FT_REG_FILE 1
197#define OCFS2_FT_DIR 2
198#define OCFS2_FT_CHRDEV 3
199#define OCFS2_FT_BLKDEV 4
200#define OCFS2_FT_FIFO 5
201#define OCFS2_FT_SOCK 6
202#define OCFS2_FT_SYMLINK 7
203
204#define OCFS2_FT_MAX 8
205
206/*
207 * OCFS2_DIR_PAD defines the directory entries boundaries
208 *
209 * NOTE: It must be a multiple of 4
210 */
211#define OCFS2_DIR_PAD 4
212#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1)
213#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name)
214#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
215 OCFS2_DIR_ROUND) & \
216 ~OCFS2_DIR_ROUND)
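/* Worked example (illustrative): OCFS2_DIR_MEMBER_LEN is 12 bytes
 * (8 inode + 2 rec_len + 1 name_len + 1 file_type), so a 5-character
 * name needs (5 + 12 + 3) & ~3 == 20 bytes on disk. */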
217
218#define OCFS2_LINK_MAX 32000
219
220#define S_SHIFT 12
221static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
222 [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
223 [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
224 [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
225 [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
226 [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
227 [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
228 [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
229};
230
231
232/*
233 * Convenience casts
234 */
235#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
236
237/*
238 * On disk extent record for OCFS2
239 * It describes a range of clusters on disk.
240 */
241struct ocfs2_extent_rec {
242/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
243 __le32 e_clusters; /* Clusters covered by this extent */
244 __le64 e_blkno; /* Physical disk offset, in blocks */
245/*10*/
246};
247
248struct ocfs2_chain_rec {
249 __le32 c_free; /* Number of free bits in this chain. */
250 __le32 c_total; /* Number of total bits in this chain */
251 __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */
252};
253
254struct ocfs2_truncate_rec {
255 __le32 t_start; /* 1st cluster in this log */
256 __le32 t_clusters; /* Number of total clusters covered */
257};
258
259/*
260 * On disk extent list for OCFS2 (node in the tree). Note that this
261 * is contained inside ocfs2_dinode or ocfs2_extent_block, so the
262 * offsets are relative to ocfs2_dinode.id2.i_list or
263 * ocfs2_extent_block.h_list, respectively.
264 */
265struct ocfs2_extent_list {
266/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
267 point. 0 means data extents
268 hang directly off this
269 header (a leaf) */
270 __le16 l_count; /* Number of extent records */
271 __le16 l_next_free_rec; /* Next unused extent slot */
272 __le16 l_reserved1;
273 __le64 l_reserved2; /* Pad to
274 sizeof(ocfs2_extent_rec) */
275/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */
276};
277
278/*
279 * On disk allocation chain list for OCFS2. Note that this is
280 * contained inside ocfs2_dinode, so the offsets are relative to
281 * ocfs2_dinode.id2.i_chain.
282 */
283struct ocfs2_chain_list {
284/*00*/ __le16 cl_cpg; /* Clusters per Block Group */
285 __le16 cl_bpc; /* Bits per cluster */
286 __le16 cl_count; /* Total chains in this list */
287 __le16 cl_next_free_rec; /* Next unused chain slot */
288 __le64 cl_reserved1;
289/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
290};
291
292/*
293 * On disk deallocation log for OCFS2. Note that this is
294 * contained inside ocfs2_dinode, so the offsets are relative to
295 * ocfs2_dinode.id2.i_dealloc.
296 */
297struct ocfs2_truncate_log {
298/*00*/ __le16 tl_count; /* Total records in this log */
299 __le16 tl_used; /* Number of records in use */
300 __le32 tl_reserved1;
301/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */
302};
303
304/*
305 * On disk extent block (indirect block) for OCFS2
306 */
307struct ocfs2_extent_block
308{
309/*00*/ __u8 h_signature[8]; /* Signature for verification */
310 __le64 h_reserved1;
311/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
312 extent_header belongs to */
313 __le16 h_suballoc_bit; /* Bit offset in suballocator
314 block group */
315 __le32 h_fs_generation; /* Must match super block */
316 __le64 h_blkno; /* Offset on disk, in blocks */
317/*20*/ __le64 h_reserved3;
318 __le64 h_next_leaf_blk; /* Offset on disk, in blocks,
319 of next leaf header pointing
320 to data */
321/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */
322/* Actual on-disk size is one block */
323};
324
325/*
326 * On disk superblock for OCFS2
327 * Note that it is contained inside an ocfs2_dinode, so all offsets
328 * are relative to the start of ocfs2_dinode.id2.
329 */
330struct ocfs2_super_block {
331/*00*/ __le16 s_major_rev_level;
332 __le16 s_minor_rev_level;
333 __le16 s_mnt_count;
334 __le16 s_max_mnt_count;
335 __le16 s_state; /* File system state */
336 __le16 s_errors; /* Behaviour when detecting errors */
337 __le32 s_checkinterval; /* Max time between checks */
338/*10*/ __le64 s_lastcheck; /* Time of last check */
339 __le32 s_creator_os; /* OS */
340 __le32 s_feature_compat; /* Compatible feature set */
341/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */
342 __le32 s_feature_ro_compat; /* Readonly-compatible feature set */
343 __le64 s_root_blkno; /* Offset, in blocks, of root directory
344 dinode */
345/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system
346 directory dinode */
347 __le32 s_blocksize_bits; /* Blocksize for this fs */
348 __le32 s_clustersize_bits; /* Clustersize for this fs */
349/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
350 before tunefs required */
351 __le16 s_reserved1;
352 __le32 s_reserved2;
353 __le64 s_first_cluster_group; /* Block offset of 1st cluster
354 * group header */
355/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
356/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
357/*A0*/
358};
359
360/*
361 * Local allocation bitmap for OCFS2 slots
362 * Note that it exists inside an ocfs2_dinode, so all offsets are
363 * relative to the start of ocfs2_dinode.id2.
364 */
365struct ocfs2_local_alloc
366{
367/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */
368 __le16 la_size; /* Size of included bitmap, in bytes */
369 __le16 la_reserved1;
370 __le64 la_reserved2;
371/*10*/ __u8 la_bitmap[0];
372};
373
374/*
375 * On disk inode for OCFS2
376 */
377struct ocfs2_dinode {
378/*00*/ __u8 i_signature[8]; /* Signature for validation */
379 __le32 i_generation; /* Generation number */
380 __le16 i_suballoc_slot; /* Slot suballocator this inode
381 belongs to */
382 __le16 i_suballoc_bit; /* Bit offset in suballocator
383 block group */
384/*10*/ __le32 i_reserved0;
385 __le32 i_clusters; /* Cluster count */
386 __le32 i_uid; /* Owner UID */
387 __le32 i_gid; /* Owning GID */
388/*20*/ __le64 i_size; /* Size in bytes */
389 __le16 i_mode; /* File mode */
390 __le16 i_links_count; /* Links count */
391 __le32 i_flags; /* File flags */
392/*30*/ __le64 i_atime; /* Access time */
393 __le64 i_ctime; /* Creation time */
394/*40*/ __le64 i_mtime; /* Modification time */
395 __le64 i_dtime; /* Deletion time */
396/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */
397 __le64 i_last_eb_blk; /* Pointer to last extent
398 block */
399/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */
400 __le32 i_atime_nsec;
401 __le32 i_ctime_nsec;
402 __le32 i_mtime_nsec;
403/*70*/ __le64 i_reserved1[9];
404/*B8*/ union {
405 __le64 i_pad1; /* Generic way to refer to this
406 64bit union */
407 struct {
408 __le64 i_rdev; /* Device number */
409 } dev1;
410 struct { /* Info for bitmap system
411 inodes */
412 __le32 i_used; /* Bits (ie, clusters) used */
413 __le32 i_total; /* Total bits (clusters)
414 available */
415 } bitmap1;
416 struct { /* Info for journal system
417 inodes */
418 __le32 ij_flags; /* Mounted, version, etc. */
419 __le32 ij_pad;
420 } journal1;
421	} id1;				/* Inode type dependent 1 */
422/*C0*/ union {
423 struct ocfs2_super_block i_super;
424 struct ocfs2_local_alloc i_lab;
425 struct ocfs2_chain_list i_chain;
426 struct ocfs2_extent_list i_list;
427 struct ocfs2_truncate_log i_dealloc;
428 __u8 i_symlink[0];
429 } id2;
430/* Actual on-disk size is one block */
431};
432
433/*
434 * On-disk directory entry structure for OCFS2
435 *
436 * Packed as this structure could be accessed unaligned on 64-bit platforms
437 */
438struct ocfs2_dir_entry {
439/*00*/ __le64 inode; /* Inode number */
440 __le16 rec_len; /* Directory entry length */
441 __u8 name_len; /* Name length */
442 __u8 file_type;
443/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */
444/* Actual on-disk length specified by rec_len */
445} __attribute__ ((packed));
446
447/*
448 * On disk allocator group structure for OCFS2
449 */
450struct ocfs2_group_desc
451{
452/*00*/ __u8 bg_signature[8]; /* Signature for validation */
453 __le16 bg_size; /* Size of included bitmap in
454 bytes. */
455 __le16 bg_bits; /* Bits represented by this
456 group. */
457 __le16 bg_free_bits_count; /* Free bits count */
458 __le16 bg_chain; /* What chain I am in. */
459/*10*/ __le32 bg_generation;
460 __le32 bg_reserved1;
461 __le64 bg_next_group; /* Next group in my list, in
462 blocks */
463/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
464 blocks */
465 __le64 bg_blkno; /* Offset on disk, in blocks */
466/*30*/ __le64 bg_reserved2[2];
467/*40*/ __u8 bg_bitmap[0];
468};
469
470#ifdef __KERNEL__
471static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
472{
473 return sb->s_blocksize -
474 offsetof(struct ocfs2_dinode, id2.i_symlink);
475}
476
477static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
478{
479 int size;
480
481 size = sb->s_blocksize -
482 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
483
484 return size / sizeof(struct ocfs2_extent_rec);
485}
486
487static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
488{
489 int size;
490
491 size = sb->s_blocksize -
492 offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
493
494 return size / sizeof(struct ocfs2_chain_rec);
495}
496
497static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
498{
499 int size;
500
501 size = sb->s_blocksize -
502 offsetof(struct ocfs2_extent_block, h_list.l_recs);
503
504 return size / sizeof(struct ocfs2_extent_rec);
505}
506
507static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
508{
509 u16 size;
510
511 size = sb->s_blocksize -
512 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
513
514 return size;
515}
516
517static inline int ocfs2_group_bitmap_size(struct super_block *sb)
518{
519 int size;
520
521 size = sb->s_blocksize -
522 offsetof(struct ocfs2_group_desc, bg_bitmap);
523
524 return size;
525}
526
527static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
528{
529 int size;
530
531 size = sb->s_blocksize -
532 offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
533
534 return size / sizeof(struct ocfs2_truncate_rec);
535}
536#else
537static inline int ocfs2_fast_symlink_chars(int blocksize)
538{
539 return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
540}
541
542static inline int ocfs2_extent_recs_per_inode(int blocksize)
543{
544 int size;
545
546 size = blocksize -
547 offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
548
549 return size / sizeof(struct ocfs2_extent_rec);
550}
551
552static inline int ocfs2_chain_recs_per_inode(int blocksize)
553{
554 int size;
555
556 size = blocksize -
557 offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
558
559 return size / sizeof(struct ocfs2_chain_rec);
560}
561
562static inline int ocfs2_extent_recs_per_eb(int blocksize)
563{
564 int size;
565
566 size = blocksize -
567 offsetof(struct ocfs2_extent_block, h_list.l_recs);
568
569 return size / sizeof(struct ocfs2_extent_rec);
570}
571
572static inline int ocfs2_local_alloc_size(int blocksize)
573{
574 int size;
575
576 size = blocksize -
577 offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
578
579 return size;
580}
581
582static inline int ocfs2_group_bitmap_size(int blocksize)
583{
584 int size;
585
586 size = blocksize -
587 offsetof(struct ocfs2_group_desc, bg_bitmap);
588
589 return size;
590}
591
592static inline int ocfs2_truncate_recs_per_inode(int blocksize)
593{
594 int size;
595
596 size = blocksize -
597 offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
598
599 return size / sizeof(struct ocfs2_truncate_rec);
600}
601#endif /* __KERNEL__ */
602
603
604static inline int ocfs2_system_inode_is_global(int type)
605{
606 return ((type >= 0) &&
607 (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
608}
609
610static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
611 int type, int slot)
612{
613 int chars;
614
615 /*
616 * Global system inodes can only have one copy. Everything
617 * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
618 * list has a copy per slot.
619 */
620 if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
621 chars = snprintf(buf, len,
622 ocfs2_system_inodes[type].si_name);
623 else
624 chars = snprintf(buf, len,
625 ocfs2_system_inodes[type].si_name,
626 slot);
627
628 return chars;
629}
630
631static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
632 umode_t mode)
633{
634 de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
635}
636
637#endif /* _OCFS2_FS_H */
638
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
new file mode 100644
index 000000000000..7dd9e1e705b0
--- /dev/null
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -0,0 +1,73 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_lockid.h
5 *
6 * Defines OCFS2 lockid bits.
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_LOCKID_H
27#define OCFS2_LOCKID_H
28
29/* lock ids are made up in the following manner:
30 * name[0] --> type
31 * name[1-6] --> 6 pad characters, reserved for now
32 * name[7-22] --> block number, expressed in hex as 16 chars
33 * name[23-30] --> i_generation, expressed in hex 8 chars
34 * name[31] --> '\0' */
35#define OCFS2_LOCK_ID_MAX_LEN 32
36#define OCFS2_LOCK_ID_PAD "000000"
37
38enum ocfs2_lock_type {
39 OCFS2_LOCK_TYPE_META = 0,
40 OCFS2_LOCK_TYPE_DATA,
41 OCFS2_LOCK_TYPE_SUPER,
42 OCFS2_LOCK_TYPE_RENAME,
43 OCFS2_LOCK_TYPE_RW,
44 OCFS2_NUM_LOCK_TYPES
45};
46
47static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
48{
49 char c;
50 switch (type) {
51 case OCFS2_LOCK_TYPE_META:
52 c = 'M';
53 break;
54 case OCFS2_LOCK_TYPE_DATA:
55 c = 'D';
56 break;
57 case OCFS2_LOCK_TYPE_SUPER:
58 c = 'S';
59 break;
60 case OCFS2_LOCK_TYPE_RENAME:
61 c = 'R';
62 break;
63 case OCFS2_LOCK_TYPE_RW:
64 c = 'W';
65 break;
66 default:
67 c = '\0';
68 }
69
70 return c;
71}
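/*
 * Editor's sketch (not part of the original patch): given the layout
 * described at the top of this file, a lock name could be composed as
 * in the hypothetical helper below -- one type character, the six pad
 * characters, the block number as 16 hex digits and the generation as
 * 8, exactly filling the 32-byte buffer (1 + 6 + 16 + 8 + '\0').
 */
static inline void ocfs2_sketch_build_lock_name(char *name,
						enum ocfs2_lock_type type,
						unsigned long long blkno,
						unsigned int generation)
{
	/* name must point at OCFS2_LOCK_ID_MAX_LEN bytes of storage */
	snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
		 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
		 blkno, generation);
}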
72
73#endif /* OCFS2_LOCKID_H */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
new file mode 100644
index 000000000000..871627961d6d
--- /dev/null
+++ b/fs/ocfs2/slot_map.c
@@ -0,0 +1,303 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * slot_map.c
 5 *
 6 * Slot map handling: tracks which cluster node occupies which
 7 * on-disk slot.
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/smp_lock.h>
30
31#define MLOG_MASK_PREFIX ML_SUPER
32#include <cluster/masklog.h>
33
34#include "ocfs2.h"
35
36#include "dlmglue.h"
37#include "extent_map.h"
38#include "heartbeat.h"
39#include "inode.h"
40#include "slot_map.h"
41#include "super.h"
42#include "sysfile.h"
43
44#include "buffer_head_io.h"
45
46static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
47 s16 global);
48static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
49 s16 slot_num,
50 s16 node_num);
51
 52/* Use the slot information we've collected to create a map of mounted
 53 * nodes. Should be holding an EX lock on the super block; assumes the
 54 * slot info is up to date. Note that we call this *after* we find a
 55 * slot, so our own node should be set in the map too... */
56void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
57{
58 int i;
59 struct ocfs2_slot_info *si = osb->slot_info;
60
61 spin_lock(&si->si_lock);
62
63 for (i = 0; i < si->si_size; i++)
64 if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
65 ocfs2_node_map_set_bit(osb, &osb->mounted_map,
66 si->si_global_node_nums[i]);
67
68 spin_unlock(&si->si_lock);
69}
70
 71/* Pull the slot information on disk into our in-memory slot_info struct. */
72void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
73{
74 int i;
75 __le16 *disk_info;
76
77 /* we don't read the slot block here as ocfs2_super_lock
78 * should've made sure we have the most recent copy. */
79 spin_lock(&si->si_lock);
80 disk_info = (__le16 *) si->si_bh->b_data;
81
82 for (i = 0; i < si->si_size; i++)
83 si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
84
85 spin_unlock(&si->si_lock);
86}
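/*
 * Editor's note (not part of the original patch): as the loop above
 * shows, the slot map block is just an array of little-endian 16-bit
 * values, one per slot; entry i holds the global node number mounted
 * in slot i, or OCFS2_INVALID_SLOT when the slot is free.  For
 * example, with node 3 in slot 0 and node 7 in slot 2:
 *
 *	slot:	0	1	2	3 ...
 *	value:	3	INVALID	7	INVALID ...
 */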
87
 88/* Post our slot info into its destination bh and write it
 89 * out. */
90int ocfs2_update_disk_slots(struct ocfs2_super *osb,
91 struct ocfs2_slot_info *si)
92{
93 int status, i;
94 __le16 *disk_info = (__le16 *) si->si_bh->b_data;
95
96 spin_lock(&si->si_lock);
97 for (i = 0; i < si->si_size; i++)
98 disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
99 spin_unlock(&si->si_lock);
100
101 status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
102 if (status < 0)
103 mlog_errno(status);
104
105 return status;
106}
107
108/* Try to find the given global node number in the slot info. Returns
109 * OCFS2_INVALID_SLOT if nothing is found. */
110static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
111 s16 global)
112{
113 int i;
114 s16 ret = OCFS2_INVALID_SLOT;
115
116 for(i = 0; i < si->si_num_slots; i++) {
117 if (global == si->si_global_node_nums[i]) {
118 ret = (s16) i;
119 break;
120 }
121 }
122 return ret;
123}
124
125static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
126{
127 int i;
128 s16 ret = OCFS2_INVALID_SLOT;
129
130 for(i = 0; i < si->si_num_slots; i++) {
131 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
132 ret = (s16) i;
133 break;
134 }
135 }
136 return ret;
137}
138
139s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
140 s16 global)
141{
142 s16 ret;
143
144 spin_lock(&si->si_lock);
145 ret = __ocfs2_node_num_to_slot(si, global);
146 spin_unlock(&si->si_lock);
147 return ret;
148}
149
150static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
151 s16 slot_num,
152 s16 node_num)
153{
154 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
155 BUG_ON(slot_num >= si->si_num_slots);
156 BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
157 (node_num >= O2NM_MAX_NODES));
158
159 si->si_global_node_nums[slot_num] = node_num;
160}
161
162void ocfs2_clear_slot(struct ocfs2_slot_info *si,
163 s16 slot_num)
164{
165 spin_lock(&si->si_lock);
166 __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
167 spin_unlock(&si->si_lock);
168}
169
170int ocfs2_init_slot_info(struct ocfs2_super *osb)
171{
172 int status, i;
173 u64 blkno;
174 struct inode *inode = NULL;
175 struct buffer_head *bh = NULL;
176 struct ocfs2_slot_info *si;
177
178 si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
179 if (!si) {
180 status = -ENOMEM;
181 mlog_errno(status);
182 goto bail;
183 }
184
185 spin_lock_init(&si->si_lock);
186 si->si_num_slots = osb->max_slots;
187 si->si_size = OCFS2_MAX_SLOTS;
188
189 for(i = 0; i < si->si_num_slots; i++)
190 si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
191
192 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
193 OCFS2_INVALID_SLOT);
194 if (!inode) {
195 status = -EINVAL;
196 mlog_errno(status);
197 goto bail;
198 }
199
200 status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
201 if (status < 0) {
202 mlog_errno(status);
203 goto bail;
204 }
205
206 status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
207 if (status < 0) {
208 mlog_errno(status);
209 goto bail;
210 }
211
212 si->si_inode = inode;
213 si->si_bh = bh;
214 osb->slot_info = si;
215bail:
216 if (status < 0 && si)
217 ocfs2_free_slot_info(si);
218
219 return status;
220}
221
222void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
223{
224 if (si->si_inode)
225 iput(si->si_inode);
226 if (si->si_bh)
227 brelse(si->si_bh);
228 kfree(si);
229}
230
231int ocfs2_find_slot(struct ocfs2_super *osb)
232{
233 int status;
234 s16 slot;
235 struct ocfs2_slot_info *si;
236
237 mlog_entry_void();
238
239 si = osb->slot_info;
240
241 ocfs2_update_slot_info(si);
242
243 spin_lock(&si->si_lock);
244 /* search for ourselves first and take the slot if it already
245 * exists. Perhaps we need to mark this in a variable for our
246 * own journal recovery? Possibly not, though we certainly
247	 * need to warn the user */
248 slot = __ocfs2_node_num_to_slot(si, osb->node_num);
249 if (slot == OCFS2_INVALID_SLOT) {
250 /* if no slot yet, then just take 1st available
251 * one. */
252 slot = __ocfs2_find_empty_slot(si);
253 if (slot == OCFS2_INVALID_SLOT) {
254 spin_unlock(&si->si_lock);
255 mlog(ML_ERROR, "no free slots available!\n");
256 status = -EINVAL;
257 goto bail;
258 }
259 } else
260 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
261 slot);
262
263 __ocfs2_fill_slot(si, slot, osb->node_num);
264 osb->slot_num = slot;
265 spin_unlock(&si->si_lock);
266
267 mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
268
269 status = ocfs2_update_disk_slots(osb, si);
270 if (status < 0)
271 mlog_errno(status);
272
273bail:
274 mlog_exit(status);
275 return status;
276}
277
278void ocfs2_put_slot(struct ocfs2_super *osb)
279{
280 int status;
281 struct ocfs2_slot_info *si = osb->slot_info;
282
283 if (!si)
284 return;
285
286 ocfs2_update_slot_info(si);
287
288 spin_lock(&si->si_lock);
289 __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
290 osb->slot_num = OCFS2_INVALID_SLOT;
291 spin_unlock(&si->si_lock);
292
293 status = ocfs2_update_disk_slots(osb, si);
294 if (status < 0) {
295 mlog_errno(status);
296 goto bail;
297 }
298
299bail:
300 osb->slot_info = NULL;
301 ocfs2_free_slot_info(si);
302}
303
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
new file mode 100644
index 000000000000..d8c8ceed031b
--- /dev/null
+++ b/fs/ocfs2/slot_map.h
@@ -0,0 +1,66 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
 4 * slot_map.h
5 *
 6 * Slot map interface: tracks which node occupies which on-disk slot.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 23 * Boston, MA 02111-1307, USA.
24 */
25
26
27#ifndef SLOTMAP_H
28#define SLOTMAP_H
29
30struct ocfs2_slot_info {
31 spinlock_t si_lock;
32
33 struct inode *si_inode;
34 struct buffer_head *si_bh;
35 unsigned int si_num_slots;
36 unsigned int si_size;
37 s16 si_global_node_nums[OCFS2_MAX_SLOTS];
38};
39
40int ocfs2_init_slot_info(struct ocfs2_super *osb);
41void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
42
43int ocfs2_find_slot(struct ocfs2_super *osb);
44void ocfs2_put_slot(struct ocfs2_super *osb);
45
46void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
47int ocfs2_update_disk_slots(struct ocfs2_super *osb,
48 struct ocfs2_slot_info *si);
49
50s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
51 s16 global);
52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num);
54
55void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
56
57static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
58 int slot_num)
59{
60 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
61 assert_spin_locked(&si->si_lock);
62
63 return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
64}
65
66#endif
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
new file mode 100644
index 000000000000..c46c164aefbb
--- /dev/null
+++ b/fs/ocfs2/suballoc.c
@@ -0,0 +1,1651 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.c
5 *
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 24 * Boston, MA 02111-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "dlmglue.h"
39#include "inode.h"
40#include "journal.h"
41#include "localalloc.h"
42#include "suballoc.h"
43#include "super.h"
44#include "sysfile.h"
45#include "uptodate.h"
46
47#include "buffer_head_io.h"
48
49static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
53 struct inode *alloc_inode,
54 struct buffer_head *bg_bh,
55 u64 group_blkno,
56 u16 my_chain,
57 struct ocfs2_chain_list *cl);
58static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 struct inode *alloc_inode,
60 struct buffer_head *bh);
61
62static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
63 struct ocfs2_alloc_context *ac);
64
65static int ocfs2_cluster_group_search(struct inode *inode,
66 struct buffer_head *group_bh,
67 u32 bits_wanted, u32 min_bits,
68 u16 *bit_off, u16 *bits_found);
69static int ocfs2_block_group_search(struct inode *inode,
70 struct buffer_head *group_bh,
71 u32 bits_wanted, u32 min_bits,
72 u16 *bit_off, u16 *bits_found);
73static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
74 u32 bits_wanted,
75 u32 min_bits,
76 u16 *bit_off,
77 unsigned int *num_bits,
78 u64 *bg_blkno);
79static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80 struct ocfs2_alloc_context *ac,
81 u32 bits_wanted,
82 u32 min_bits,
83 u16 *bit_off,
84 unsigned int *num_bits,
85 u64 *bg_blkno);
86static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
87 int nr);
88static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
89 struct buffer_head *bg_bh,
90 unsigned int bits_wanted,
91 u16 *bit_off,
92 u16 *bits_found);
93static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
94 struct inode *alloc_inode,
95 struct ocfs2_group_desc *bg,
96 struct buffer_head *group_bh,
97 unsigned int bit_off,
98 unsigned int num_bits);
99static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
100 struct inode *alloc_inode,
101 struct ocfs2_group_desc *bg,
102 struct buffer_head *group_bh,
103 unsigned int bit_off,
104 unsigned int num_bits);
105
106static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
107 struct inode *alloc_inode,
108 struct buffer_head *fe_bh,
109 struct buffer_head *bg_bh,
110 struct buffer_head *prev_bg_bh,
111 u16 chain);
112static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
113 u32 wanted);
114static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
115 struct inode *alloc_inode,
116 struct buffer_head *alloc_bh,
117 unsigned int start_bit,
118 u64 bg_blkno,
119 unsigned int count);
120static inline u64 ocfs2_which_suballoc_group(u64 block,
121 unsigned int bit);
122static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
123 u64 bg_blkno,
124 u16 bg_bit_off);
125static inline u64 ocfs2_which_cluster_group(struct inode *inode,
126 u32 cluster);
127static inline void ocfs2_block_to_cluster_group(struct inode *inode,
128 u64 data_blkno,
129 u64 *bg_blkno,
130 u16 *bg_bit_off);
131
132void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
133{
134 if (ac->ac_inode)
135 iput(ac->ac_inode);
136 if (ac->ac_bh)
137 brelse(ac->ac_bh);
138 kfree(ac);
139}
140
141static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142{
143 return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144}
145
146static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
147 struct inode *alloc_inode,
148 struct buffer_head *bg_bh,
149 u64 group_blkno,
150 u16 my_chain,
151 struct ocfs2_chain_list *cl)
152{
153 int status = 0;
154 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
155 struct super_block * sb = alloc_inode->i_sb;
156
157 mlog_entry_void();
158
159 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
160 ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
161 "!= b_blocknr (%llu)", group_blkno,
162 (unsigned long long) bg_bh->b_blocknr);
163 status = -EIO;
164 goto bail;
165 }
166
167 status = ocfs2_journal_access(handle,
168 alloc_inode,
169 bg_bh,
170 OCFS2_JOURNAL_ACCESS_CREATE);
171 if (status < 0) {
172 mlog_errno(status);
173 goto bail;
174 }
175
176 memset(bg, 0, sb->s_blocksize);
177 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
178 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
179 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
180 bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
181 bg->bg_chain = cpu_to_le16(my_chain);
182 bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
183 bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
184 bg->bg_blkno = cpu_to_le64(group_blkno);
185 /* set the 1st bit in the bitmap to account for the descriptor block */
186 ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
187 bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
188
189 status = ocfs2_journal_dirty(handle, bg_bh);
190 if (status < 0)
191 mlog_errno(status);
192
193 /* There is no need to zero out or otherwise initialize the
194 * other blocks in a group - All valid FS metadata in a block
195 * group stores the superblock fs_generation value at
196 * allocation time. */
197
198bail:
199 mlog_exit(status);
200 return status;
201}
202
203static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
204{
205 u16 curr, best;
206
207 best = curr = 0;
208 while (curr < le16_to_cpu(cl->cl_count)) {
209 if (le32_to_cpu(cl->cl_recs[best].c_total) >
210 le32_to_cpu(cl->cl_recs[curr].c_total))
211 best = curr;
212 curr++;
213 }
214 return best;
215}
216
217/*
218 * We expect the block group allocator to already be locked.
219 */
220static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
221 struct inode *alloc_inode,
222 struct buffer_head *bh)
223{
224 int status, credits;
225 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
226 struct ocfs2_chain_list *cl;
227 struct ocfs2_alloc_context *ac = NULL;
228 struct ocfs2_journal_handle *handle = NULL;
229 u32 bit_off, num_bits;
230 u16 alloc_rec;
231 u64 bg_blkno;
232 struct buffer_head *bg_bh = NULL;
233 struct ocfs2_group_desc *bg;
234
235 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
236
237 mlog_entry_void();
238
239 handle = ocfs2_alloc_handle(osb);
240 if (!handle) {
241 status = -ENOMEM;
242 mlog_errno(status);
243 goto bail;
244 }
245
246 cl = &fe->id2.i_chain;
247 status = ocfs2_reserve_clusters(osb,
248 handle,
249 le16_to_cpu(cl->cl_cpg),
250 &ac);
251 if (status < 0) {
252 if (status != -ENOSPC)
253 mlog_errno(status);
254 goto bail;
255 }
256
257 credits = ocfs2_calc_group_alloc_credits(osb->sb,
258 le16_to_cpu(cl->cl_cpg));
259 handle = ocfs2_start_trans(osb, handle, credits);
260 if (IS_ERR(handle)) {
261 status = PTR_ERR(handle);
262 handle = NULL;
263 mlog_errno(status);
264 goto bail;
265 }
266
267 status = ocfs2_claim_clusters(osb,
268 handle,
269 ac,
270 le16_to_cpu(cl->cl_cpg),
271 &bit_off,
272 &num_bits);
273 if (status < 0) {
274 if (status != -ENOSPC)
275 mlog_errno(status);
276 goto bail;
277 }
278
279 alloc_rec = ocfs2_find_smallest_chain(cl);
280
281 /* setup the group */
282 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
283 mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
284 alloc_rec, bg_blkno);
285
286 bg_bh = sb_getblk(osb->sb, bg_blkno);
287 if (!bg_bh) {
288 status = -EIO;
289 mlog_errno(status);
290 goto bail;
291 }
292 ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
293
294 status = ocfs2_block_group_fill(handle,
295 alloc_inode,
296 bg_bh,
297 bg_blkno,
298 alloc_rec,
299 cl);
300 if (status < 0) {
301 mlog_errno(status);
302 goto bail;
303 }
304
305 bg = (struct ocfs2_group_desc *) bg_bh->b_data;
306
307 status = ocfs2_journal_access(handle, alloc_inode,
308 bh, OCFS2_JOURNAL_ACCESS_WRITE);
309 if (status < 0) {
310 mlog_errno(status);
311 goto bail;
312 }
313
314 le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
315 le16_to_cpu(bg->bg_free_bits_count));
316 le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
317 cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
318 if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
319 le16_add_cpu(&cl->cl_next_free_rec, 1);
320
321 le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
322 le16_to_cpu(bg->bg_free_bits_count));
323 le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
324 le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
325
326 status = ocfs2_journal_dirty(handle, bh);
327 if (status < 0) {
328 mlog_errno(status);
329 goto bail;
330 }
331
332 spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
333 OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
334 fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
335 le32_to_cpu(fe->i_clusters)));
336 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
337 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
338 alloc_inode->i_blocks =
339 ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
340
341 status = 0;
342bail:
343 if (handle)
344 ocfs2_commit_trans(handle);
345
346 if (ac)
347 ocfs2_free_alloc_context(ac);
348
349 if (bg_bh)
350 brelse(bg_bh);
351
352 mlog_exit(status);
353 return status;
354}
355
356static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
357 struct ocfs2_alloc_context *ac)
358{
359 int status;
360 u32 bits_wanted = ac->ac_bits_wanted;
361 struct inode *alloc_inode = ac->ac_inode;
362 struct buffer_head *bh = NULL;
363 struct ocfs2_journal_handle *handle = ac->ac_handle;
364 struct ocfs2_dinode *fe;
365 u32 free_bits;
366
367 mlog_entry_void();
368
369 BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
370
371 ocfs2_handle_add_inode(handle, alloc_inode);
372 status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
373 if (status < 0) {
374 mlog_errno(status);
375 goto bail;
376 }
377
378 fe = (struct ocfs2_dinode *) bh->b_data;
379 if (!OCFS2_IS_VALID_DINODE(fe)) {
380 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
381 status = -EIO;
382 goto bail;
383 }
384 if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
385 ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
386 "# %"MLFu64, le64_to_cpu(fe->i_blkno));
387 status = -EIO;
388 goto bail;
389 }
390
391 free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
392 le32_to_cpu(fe->id1.bitmap1.i_used);
393
394 if (bits_wanted > free_bits) {
395 /* cluster bitmap never grows */
396 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
397 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
398 bits_wanted, free_bits);
399 status = -ENOSPC;
400 goto bail;
401 }
402
403 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
404 if (status < 0) {
405 if (status != -ENOSPC)
406 mlog_errno(status);
407 goto bail;
408 }
409 atomic_inc(&osb->alloc_stats.bg_extends);
410
411 /* You should never ask for this much metadata */
412 BUG_ON(bits_wanted >
413 (le32_to_cpu(fe->id1.bitmap1.i_total)
414 - le32_to_cpu(fe->id1.bitmap1.i_used)));
415 }
416
417 get_bh(bh);
418 ac->ac_bh = bh;
419bail:
420 if (bh)
421 brelse(bh);
422
423 mlog_exit(status);
424 return status;
425}
426
427int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
428 struct ocfs2_journal_handle *handle,
429 struct ocfs2_dinode *fe,
430 struct ocfs2_alloc_context **ac)
431{
432 int status;
433 struct inode *alloc_inode = NULL;
434
435 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
436 if (!(*ac)) {
437 status = -ENOMEM;
438 mlog_errno(status);
439 goto bail;
440 }
441
442 (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
443 (*ac)->ac_handle = handle;
444 (*ac)->ac_which = OCFS2_AC_USE_META;
445
446#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
447 alloc_inode = ocfs2_get_system_file_inode(osb,
448 EXTENT_ALLOC_SYSTEM_INODE,
449 0);
450#else
451 alloc_inode = ocfs2_get_system_file_inode(osb,
452 EXTENT_ALLOC_SYSTEM_INODE,
453 osb->slot_num);
454#endif
455 if (!alloc_inode) {
456 status = -ENOMEM;
457 mlog_errno(status);
458 goto bail;
459 }
460
461 (*ac)->ac_inode = igrab(alloc_inode);
462 (*ac)->ac_group_search = ocfs2_block_group_search;
463
464 status = ocfs2_reserve_suballoc_bits(osb, (*ac));
465 if (status < 0) {
466 if (status != -ENOSPC)
467 mlog_errno(status);
468 goto bail;
469 }
470
471 status = 0;
472bail:
473 if ((status < 0) && *ac) {
474 ocfs2_free_alloc_context(*ac);
475 *ac = NULL;
476 }
477
478 if (alloc_inode)
479 iput(alloc_inode);
480
481 mlog_exit(status);
482 return status;
483}
484
485int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
486 struct ocfs2_journal_handle *handle,
487 struct ocfs2_alloc_context **ac)
488{
489 int status;
490 struct inode *alloc_inode = NULL;
491
492 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
493 if (!(*ac)) {
494 status = -ENOMEM;
495 mlog_errno(status);
496 goto bail;
497 }
498
499 (*ac)->ac_bits_wanted = 1;
500 (*ac)->ac_handle = handle;
501 (*ac)->ac_which = OCFS2_AC_USE_INODE;
502
503 alloc_inode = ocfs2_get_system_file_inode(osb,
504 INODE_ALLOC_SYSTEM_INODE,
505 osb->slot_num);
506 if (!alloc_inode) {
507 status = -ENOMEM;
508 mlog_errno(status);
509 goto bail;
510 }
511
512 (*ac)->ac_inode = igrab(alloc_inode);
513 (*ac)->ac_group_search = ocfs2_block_group_search;
514
515 status = ocfs2_reserve_suballoc_bits(osb, *ac);
516 if (status < 0) {
517 if (status != -ENOSPC)
518 mlog_errno(status);
519 goto bail;
520 }
521
522 status = 0;
523bail:
524 if ((status < 0) && *ac) {
525 ocfs2_free_alloc_context(*ac);
526 *ac = NULL;
527 }
528
529 if (alloc_inode)
530 iput(alloc_inode);
531
532 mlog_exit(status);
533 return status;
534}
535
536/* The local alloc code has to do the same thing, so both paths share
537 * this helper rather than duplicating it. */
538int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
539 struct ocfs2_alloc_context *ac)
540{
541 int status;
542
543 ac->ac_inode = ocfs2_get_system_file_inode(osb,
544 GLOBAL_BITMAP_SYSTEM_INODE,
545 OCFS2_INVALID_SLOT);
546 if (!ac->ac_inode) {
547 status = -EINVAL;
548 mlog(ML_ERROR, "Could not get bitmap inode!\n");
549 goto bail;
550 }
551 ac->ac_which = OCFS2_AC_USE_MAIN;
552 ac->ac_group_search = ocfs2_cluster_group_search;
553
554 status = ocfs2_reserve_suballoc_bits(osb, ac);
555 if (status < 0 && status != -ENOSPC)
556 mlog_errno(status);
557bail:
558 return status;
559}
560
561/* Callers don't need to care which bitmap (local alloc or main) to
562 * use so we figure it out for them, but unfortunately this clutters
563 * things a bit. */
564int ocfs2_reserve_clusters(struct ocfs2_super *osb,
565 struct ocfs2_journal_handle *handle,
566 u32 bits_wanted,
567 struct ocfs2_alloc_context **ac)
568{
569 int status;
570
571 mlog_entry_void();
572
573 BUG_ON(!handle);
574
575 *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
576 if (!(*ac)) {
577 status = -ENOMEM;
578 mlog_errno(status);
579 goto bail;
580 }
581
582 (*ac)->ac_bits_wanted = bits_wanted;
583 (*ac)->ac_handle = handle;
584
585 status = -ENOSPC;
586 if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
587 status = ocfs2_reserve_local_alloc_bits(osb,
588 handle,
589 bits_wanted,
590 *ac);
591 if ((status < 0) && (status != -ENOSPC)) {
592 mlog_errno(status);
593 goto bail;
594 } else if (status == -ENOSPC) {
595 /* reserve_local_bits will return enospc with
596 * the local alloc inode still locked, so we
597 * can change this safely here. */
598 mlog(0, "Disabling local alloc\n");
599 /* We set to OCFS2_LA_DISABLED so that umount
600 * can clean up what's left of the local
601 * allocation */
602 osb->local_alloc_state = OCFS2_LA_DISABLED;
603 }
604 }
605
606 if (status == -ENOSPC) {
607 status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
608 if (status < 0) {
609 if (status != -ENOSPC)
610 mlog_errno(status);
611 goto bail;
612 }
613 }
614
615 status = 0;
616bail:
617 if ((status < 0) && *ac) {
618 ocfs2_free_alloc_context(*ac);
619 *ac = NULL;
620 }
621
622 mlog_exit(status);
623 return status;
624}
625
626/*
627 * More or less lifted from ext3. I'll leave their description below:
628 *
629 * "For ext3 allocations, we must not reuse any blocks which are
630 * allocated in the bitmap buffer's "last committed data" copy. This
631 * prevents deletes from freeing up the page for reuse until we have
632 * committed the delete transaction.
633 *
634 * If we didn't do this, then deleting something and reallocating it as
635 * data would allow the old block to be overwritten before the
636 * transaction committed (because we force data to disk before commit).
637 * This would lead to corruption if we crashed between overwriting the
638 * data and committing the delete.
639 *
640 * @@@ We may want to make this allocation behaviour conditional on
641 * data-writes at some point, and disable it for metadata allocations or
642 * sync-data inodes."
643 *
644 * Note: OCFS2 already does this differently for metadata vs data
 645 * allocations, as those bitmaps are separate and undo access is never
646 * called on a metadata group descriptor.
647 */
648static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
649 int nr)
650{
651 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
652
653 if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
654 return 0;
655 if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
656 return 1;
657
658 bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
659 return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
660}
661
662static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
663 struct buffer_head *bg_bh,
664 unsigned int bits_wanted,
665 u16 *bit_off,
666 u16 *bits_found)
667{
668 void *bitmap;
669 u16 best_offset, best_size;
670 int offset, start, found, status = 0;
671 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
672
673 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
674 OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
675 return -EIO;
676 }
677
678 found = start = best_offset = best_size = 0;
679 bitmap = bg->bg_bitmap;
680
681 while((offset = ocfs2_find_next_zero_bit(bitmap,
682 le16_to_cpu(bg->bg_bits),
683 start)) != -1) {
684 if (offset == le16_to_cpu(bg->bg_bits))
685 break;
686
687 if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
688 /* We found a zero, but we can't use it as it
689 * hasn't been put to disk yet! */
690 found = 0;
691 start = offset + 1;
692 } else if (offset == start) {
693 /* we found a zero */
694 found++;
695 /* move start to the next bit to test */
696 start++;
697 } else {
698 /* got a zero after some ones */
699 found = 1;
700 start = offset + 1;
701 }
702 if (found > best_size) {
703 best_size = found;
704 best_offset = start - found;
705 }
706 /* we got everything we needed */
707 if (found == bits_wanted) {
708 /* mlog(0, "Found it all!\n"); */
709 break;
710 }
711 }
712
713 /* XXX: I think the first clause is equivalent to the second
714 * - jlbec */
715 if (found == bits_wanted) {
716 *bit_off = start - found;
717 *bits_found = found;
718 } else if (best_size) {
719 *bit_off = best_offset;
720 *bits_found = best_size;
721 } else {
722 status = -ENOSPC;
723 /* No error log here -- see the comment above
724 * ocfs2_test_bg_bit_allocatable */
725 }
726
727 return status;
728}
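/*
 * Editor's sketch (not part of the original patch): a concrete trace of
 * the window search above.  With bits 0, 4 and 5 already set in an
 * 8-bit group and bits_wanted = 3, the scan grows a run of zeros over
 * bits 1-3 (found climbs to 3 while start advances to 4), matches
 * bits_wanted and breaks, returning *bit_off = 1, *bits_found = 3.
 */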
729
730static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
731 struct inode *alloc_inode,
732 struct ocfs2_group_desc *bg,
733 struct buffer_head *group_bh,
734 unsigned int bit_off,
735 unsigned int num_bits)
736{
737 int status;
738 void *bitmap = bg->bg_bitmap;
739 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
740
741 mlog_entry_void();
742
743 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
744 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
745 status = -EIO;
746 goto bail;
747 }
748 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
749
750 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
751 num_bits);
752
753 if (ocfs2_is_cluster_bitmap(alloc_inode))
754 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
755
756 status = ocfs2_journal_access(handle,
757 alloc_inode,
758 group_bh,
759 journal_type);
760 if (status < 0) {
761 mlog_errno(status);
762 goto bail;
763 }
764
765 le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
766
767 while(num_bits--)
768 ocfs2_set_bit(bit_off++, bitmap);
769
770 status = ocfs2_journal_dirty(handle,
771 group_bh);
772 if (status < 0) {
773 mlog_errno(status);
774 goto bail;
775 }
776
777bail:
778 mlog_exit(status);
779 return status;
780}
781
782/* find the one with the most empty bits */
783static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
784{
785 u16 curr, best;
786
787 BUG_ON(!cl->cl_next_free_rec);
788
789 best = curr = 0;
790 while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
791 if (le32_to_cpu(cl->cl_recs[curr].c_free) >
792 le32_to_cpu(cl->cl_recs[best].c_free))
793 best = curr;
794 curr++;
795 }
796
797 BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
798 return best;
799}
800
801static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
802 struct inode *alloc_inode,
803 struct buffer_head *fe_bh,
804 struct buffer_head *bg_bh,
805 struct buffer_head *prev_bg_bh,
806 u16 chain)
807{
808 int status;
809 /* there is a really tiny chance the journal calls could fail,
810 * but we wouldn't want inconsistent blocks in *any* case. */
811 u64 fe_ptr, bg_ptr, prev_bg_ptr;
812 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
813 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
814 struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
815
816 if (!OCFS2_IS_VALID_DINODE(fe)) {
817 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
818 status = -EIO;
819 goto out;
820 }
821 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
822 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
823 status = -EIO;
824 goto out;
825 }
826 if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
827 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
828 status = -EIO;
829 goto out;
830 }
831
832 mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
833 "top, prev = %"MLFu64"\n",
834 fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
835
836 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
837 bg_ptr = le64_to_cpu(bg->bg_next_group);
838 prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
839
840 status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
841 OCFS2_JOURNAL_ACCESS_WRITE);
842 if (status < 0) {
843 mlog_errno(status);
844 goto out_rollback;
845 }
846
847 prev_bg->bg_next_group = bg->bg_next_group;
848
849 status = ocfs2_journal_dirty(handle, prev_bg_bh);
850 if (status < 0) {
851 mlog_errno(status);
852 goto out_rollback;
853 }
854
855 status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
856 OCFS2_JOURNAL_ACCESS_WRITE);
857 if (status < 0) {
858 mlog_errno(status);
859 goto out_rollback;
860 }
861
862 bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
863
864 status = ocfs2_journal_dirty(handle, bg_bh);
865 if (status < 0) {
866 mlog_errno(status);
867 goto out_rollback;
868 }
869
870 status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
871 OCFS2_JOURNAL_ACCESS_WRITE);
872 if (status < 0) {
873 mlog_errno(status);
874 goto out_rollback;
875 }
876
877 fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
878
879 status = ocfs2_journal_dirty(handle, fe_bh);
880 if (status < 0) {
881 mlog_errno(status);
882 goto out_rollback;
883 }
884
885 status = 0;
886out_rollback:
887 if (status < 0) {
888 fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
889 bg->bg_next_group = cpu_to_le64(bg_ptr);
890 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
891 }
892out:
893 mlog_exit(status);
894 return status;
895}
896
897static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
898 u32 wanted)
899{
900 return le16_to_cpu(bg->bg_free_bits_count) > wanted;
901}
902
 903/* Return 0 on success, -ENOSPC to keep searching, and any other
 904 * negative value on error. */
905static int ocfs2_cluster_group_search(struct inode *inode,
906 struct buffer_head *group_bh,
907 u32 bits_wanted, u32 min_bits,
908 u16 *bit_off, u16 *bits_found)
909{
910 int search = -ENOSPC;
911 int ret;
912 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
913 u16 tmp_off, tmp_found;
914
915 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
916
917 if (bg->bg_free_bits_count) {
918 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
919 group_bh, bits_wanted,
920 &tmp_off, &tmp_found);
921 if (ret)
922 return ret;
923
924 /* ocfs2_block_group_find_clear_bits() might
925 * return success, but we still want to return
926 * -ENOSPC unless it found the minimum number
927 * of bits. */
928 if (min_bits <= tmp_found) {
929 *bit_off = tmp_off;
930 *bits_found = tmp_found;
931 search = 0; /* success */
932 }
933 }
934
935 return search;
936}
937
938static int ocfs2_block_group_search(struct inode *inode,
939 struct buffer_head *group_bh,
940 u32 bits_wanted, u32 min_bits,
941 u16 *bit_off, u16 *bits_found)
942{
943 int ret = -ENOSPC;
944 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
945
946 BUG_ON(min_bits != 1);
947 BUG_ON(ocfs2_is_cluster_bitmap(inode));
948
949 if (bg->bg_free_bits_count)
950 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
951 group_bh, bits_wanted,
952 bit_off, bits_found);
953
954 return ret;
955}
956
957static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
958 u32 bits_wanted,
959 u32 min_bits,
960 u16 *bit_off,
961 unsigned int *num_bits,
962 u64 *bg_blkno)
963{
964 int status;
965 u16 chain, tmp_bits;
966 u32 tmp_used;
967 u64 next_group;
968 struct ocfs2_journal_handle *handle = ac->ac_handle;
969 struct inode *alloc_inode = ac->ac_inode;
970 struct buffer_head *group_bh = NULL;
971 struct buffer_head *prev_group_bh = NULL;
972 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
973 struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
974 struct ocfs2_group_desc *bg;
975
976 chain = ac->ac_chain;
977 mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
978 bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
979
980 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
981 le64_to_cpu(cl->cl_recs[chain].c_blkno),
982 &group_bh, OCFS2_BH_CACHED, alloc_inode);
983 if (status < 0) {
984 mlog_errno(status);
985 goto bail;
986 }
987 bg = (struct ocfs2_group_desc *) group_bh->b_data;
988 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
989 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
990 status = -EIO;
991 goto bail;
992 }
993
994 status = -ENOSPC;
995 /* for now, the chain search is a bit simplistic. We just use
996 * the 1st group with any empty bits. */
997 while ((status = ac->ac_group_search(alloc_inode, group_bh,
998 bits_wanted, min_bits, bit_off,
999 &tmp_bits)) == -ENOSPC) {
1000 if (!bg->bg_next_group)
1001 break;
1002
1003 if (prev_group_bh) {
1004 brelse(prev_group_bh);
1005 prev_group_bh = NULL;
1006 }
1007 next_group = le64_to_cpu(bg->bg_next_group);
1008 prev_group_bh = group_bh;
1009 group_bh = NULL;
1010 status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1011 next_group, &group_bh,
1012 OCFS2_BH_CACHED, alloc_inode);
1013 if (status < 0) {
1014 mlog_errno(status);
1015 goto bail;
1016 }
1017 bg = (struct ocfs2_group_desc *) group_bh->b_data;
1018 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1019 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1020 status = -EIO;
1021 goto bail;
1022 }
1023 }
1024 if (status < 0) {
1025 if (status != -ENOSPC)
1026 mlog_errno(status);
1027 goto bail;
1028 }
1029
1030 mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
1031 tmp_bits, bg->bg_blkno);
1032
1033 *num_bits = tmp_bits;
1034
1035 BUG_ON(*num_bits == 0);
1036
1037 /*
1038 * Keep track of previous block descriptor read. When
1039 * we find a target, if we have read more than X
1040 * number of descriptors, and the target is reasonably
1041	 * empty, relink it to the top of its chain.
1042 *
1043 * We've read 0 extra blocks and only send one more to
1044	 * the transaction, yet the next search has a
1045	 * much easier time.
1046 *
1047 * Do this *after* figuring out how many bits we're taking out
1048 * of our target group.
1049 */
1050 if (ac->ac_allow_chain_relink &&
1051 (prev_group_bh) &&
1052 (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1053 status = ocfs2_relink_block_group(handle, alloc_inode,
1054 ac->ac_bh, group_bh,
1055 prev_group_bh, chain);
1056 if (status < 0) {
1057 mlog_errno(status);
1058 goto bail;
1059 }
1060 }
1061
1062 /* Ok, claim our bits now: set the info on dinode, chainlist
1063 * and then the group */
1064 status = ocfs2_journal_access(handle,
1065 alloc_inode,
1066 ac->ac_bh,
1067 OCFS2_JOURNAL_ACCESS_WRITE);
1068 if (status < 0) {
1069 mlog_errno(status);
1070 goto bail;
1071 }
1072
1073 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1074 fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1075 le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1076
1077 status = ocfs2_journal_dirty(handle,
1078 ac->ac_bh);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto bail;
1082 }
1083
1084 status = ocfs2_block_group_set_bits(handle,
1085 alloc_inode,
1086 bg,
1087 group_bh,
1088 *bit_off,
1089 *num_bits);
1090 if (status < 0) {
1091 mlog_errno(status);
1092 goto bail;
1093 }
1094
1095 mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
1096 *num_bits, fe->i_blkno);
1097
1098 *bg_blkno = le64_to_cpu(bg->bg_blkno);
1099bail:
1100 if (group_bh)
1101 brelse(group_bh);
1102 if (prev_group_bh)
1103 brelse(prev_group_bh);
1104
1105 mlog_exit(status);
1106 return status;
1107}
1108
1109/* will give out up to bits_wanted contiguous bits. */
1110static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1111 struct ocfs2_alloc_context *ac,
1112 u32 bits_wanted,
1113 u32 min_bits,
1114 u16 *bit_off,
1115 unsigned int *num_bits,
1116 u64 *bg_blkno)
1117{
1118 int status;
1119 u16 victim, i;
1120 struct ocfs2_chain_list *cl;
1121 struct ocfs2_dinode *fe;
1122
1123 mlog_entry_void();
1124
1125 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1126 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1127 BUG_ON(!ac->ac_bh);
1128
1129 fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1130 if (!OCFS2_IS_VALID_DINODE(fe)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1132 status = -EIO;
1133 goto bail;
1134 }
1135 if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1136 le32_to_cpu(fe->id1.bitmap1.i_total)) {
1137 ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u"
1138			    " used bits but only %u total.",
1139 le64_to_cpu(fe->i_blkno),
1140 le32_to_cpu(fe->id1.bitmap1.i_used),
1141 le32_to_cpu(fe->id1.bitmap1.i_total));
1142 status = -EIO;
1143 goto bail;
1144 }
1145
1146 cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1147
1148 victim = ocfs2_find_victim_chain(cl);
1149 ac->ac_chain = victim;
1150 ac->ac_allow_chain_relink = 1;
1151
1152 status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1153 num_bits, bg_blkno);
1154 if (!status)
1155 goto bail;
1156 if (status < 0 && status != -ENOSPC) {
1157 mlog_errno(status);
1158 goto bail;
1159 }
1160
1161 mlog(0, "Search of victim chain %u came up with nothing, "
1162 "trying all chains now.\n", victim);
1163
1164 /* If we didn't pick a good victim, then just default to
1165 * searching each chain in order. Don't allow chain relinking
1166 * because we only calculate enough journal credits for one
1167 * relink per alloc. */
1168 ac->ac_allow_chain_relink = 0;
1169 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1170 if (i == victim)
1171 continue;
1172 if (!cl->cl_recs[i].c_free)
1173 continue;
1174
1175 ac->ac_chain = i;
1176 status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1177 bit_off, num_bits,
1178 bg_blkno);
1179 if (!status)
1180 break;
1181 if (status < 0 && status != -ENOSPC) {
1182 mlog_errno(status);
1183 goto bail;
1184 }
1185 }
1186bail:
1187
1188 mlog_exit(status);
1189 return status;
1190}
1191
1192int ocfs2_claim_metadata(struct ocfs2_super *osb,
1193 struct ocfs2_journal_handle *handle,
1194 struct ocfs2_alloc_context *ac,
1195 u32 bits_wanted,
1196 u16 *suballoc_bit_start,
1197 unsigned int *num_bits,
1198 u64 *blkno_start)
1199{
1200 int status;
1201 u64 bg_blkno;
1202
1203 BUG_ON(!ac);
1204 BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1205 BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1206 BUG_ON(ac->ac_handle != handle);
1207
1208 status = ocfs2_claim_suballoc_bits(osb,
1209 ac,
1210 bits_wanted,
1211 1,
1212 suballoc_bit_start,
1213 num_bits,
1214 &bg_blkno);
1215 if (status < 0) {
1216 mlog_errno(status);
1217 goto bail;
1218 }
1219 atomic_inc(&osb->alloc_stats.bg_allocs);
1220
1221 *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1222 ac->ac_bits_given += (*num_bits);
1223 status = 0;
1224bail:
1225 mlog_exit(status);
1226 return status;
1227}
1228
1229int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1230 struct ocfs2_journal_handle *handle,
1231 struct ocfs2_alloc_context *ac,
1232 u16 *suballoc_bit,
1233 u64 *fe_blkno)
1234{
1235 int status;
1236 unsigned int num_bits;
1237 u64 bg_blkno;
1238
1239 mlog_entry_void();
1240
1241 BUG_ON(!ac);
1242 BUG_ON(ac->ac_bits_given != 0);
1243 BUG_ON(ac->ac_bits_wanted != 1);
1244 BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1245 BUG_ON(ac->ac_handle != handle);
1246
1247 status = ocfs2_claim_suballoc_bits(osb,
1248 ac,
1249 1,
1250 1,
1251 suballoc_bit,
1252 &num_bits,
1253 &bg_blkno);
1254 if (status < 0) {
1255 mlog_errno(status);
1256 goto bail;
1257 }
1258 atomic_inc(&osb->alloc_stats.bg_allocs);
1259
1260 BUG_ON(num_bits != 1);
1261
1262 *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1263 ac->ac_bits_given++;
1264 status = 0;
1265bail:
1266 mlog_exit(status);
1267 return status;
1268}
1269
1270/* translate a group desc. blkno and its bitmap offset into
1271 * disk cluster offset. */
1272static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1273 u64 bg_blkno,
1274 u16 bg_bit_off)
1275{
1276 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 u32 cluster = 0;
1278
1279 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1280
1281 if (bg_blkno != osb->first_cluster_group_blkno)
1282 cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1283 cluster += (u32) bg_bit_off;
1284 return cluster;
1285}
1286
1287/* given a cluster offset, calculate which block group it belongs to
1288 * and return that block offset. */
1289static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1290 u32 cluster)
1291{
1292 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1293 u32 group_no;
1294
1295 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1296
1297 group_no = cluster / osb->bitmap_cpg;
1298 if (!group_no)
1299 return osb->first_cluster_group_blkno;
1300 return ocfs2_clusters_to_blocks(inode->i_sb,
1301 group_no * osb->bitmap_cpg);
1302}
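/*
 * Editor's sketch (not part of the original patch): assuming 32256
 * clusters per group (the 4032-byte-bitmap * 8 figure for 4096-byte
 * blocks), cluster 100000 lands in group_no = 100000 / 32256 = 3, so
 * its descriptor sits at block ocfs2_clusters_to_blocks(sb, 3 * 32256);
 * any cluster in group 0 maps to first_cluster_group_blkno instead.
 */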
1303
1304/* given the block number of a cluster start, calculate which cluster
1305 * group and descriptor bitmap offset that corresponds to. */
1306static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1307 u64 data_blkno,
1308 u64 *bg_blkno,
1309 u16 *bg_bit_off)
1310{
1311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1312 u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1313
1314 BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1315
1316 *bg_blkno = ocfs2_which_cluster_group(inode,
1317 data_cluster);
1318
1319 if (*bg_blkno == osb->first_cluster_group_blkno)
1320 *bg_bit_off = (u16) data_cluster;
1321 else
1322 *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1323 data_blkno - *bg_blkno);
1324}
1325
1326/*
1327 * min_bits - minimum contiguous chunk from this total allocation we
1328 * can handle. Set to what we asked for originally for a fully
1329 * contiguous allocation; set to '1' to indicate we can deal with
1330 * extents of any size.
1331 */
1332int ocfs2_claim_clusters(struct ocfs2_super *osb,
1333 struct ocfs2_journal_handle *handle,
1334 struct ocfs2_alloc_context *ac,
1335 u32 min_clusters,
1336 u32 *cluster_start,
1337 u32 *num_clusters)
1338{
1339 int status;
1340 unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1341 u64 bg_blkno;
1342 u16 bg_bit_off;
1343
1344 mlog_entry_void();
1345
1346 BUG_ON(!ac);
1347 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1348
1349 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1350 && ac->ac_which != OCFS2_AC_USE_MAIN);
1351 BUG_ON(ac->ac_handle != handle);
1352
1353 if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1354 status = ocfs2_claim_local_alloc_bits(osb,
1355 handle,
1356 ac,
1357 bits_wanted,
1358 cluster_start,
1359 num_clusters);
1360 if (!status)
1361 atomic_inc(&osb->alloc_stats.local_data);
1362 } else {
1363 if (min_clusters > (osb->bitmap_cpg - 1)) {
1364 /* The only paths asking for contiguousness
1365 * should know about this already. */
1366 mlog(ML_ERROR, "minimum allocation requested exceeds "
1367 "group bitmap size!");
1368 status = -ENOSPC;
1369 goto bail;
1370 }
1371 /* clamp the current request down to a realistic size. */
1372 if (bits_wanted > (osb->bitmap_cpg - 1))
1373 bits_wanted = osb->bitmap_cpg - 1;
1374
1375 status = ocfs2_claim_suballoc_bits(osb,
1376 ac,
1377 bits_wanted,
1378 min_clusters,
1379 &bg_bit_off,
1380 num_clusters,
1381 &bg_blkno);
1382 if (!status) {
1383 *cluster_start =
1384 ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1385 bg_blkno,
1386 bg_bit_off);
1387 atomic_inc(&osb->alloc_stats.bitmap_data);
1388 }
1389 }
1390 if (status < 0) {
1391 if (status != -ENOSPC)
1392 mlog_errno(status);
1393 goto bail;
1394 }
1395
1396 ac->ac_bits_given += *num_clusters;
1397
1398bail:
1399 mlog_exit(status);
1400 return status;
1401}
1402
1403static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1404 struct inode *alloc_inode,
1405 struct ocfs2_group_desc *bg,
1406 struct buffer_head *group_bh,
1407 unsigned int bit_off,
1408 unsigned int num_bits)
1409{
1410 int status;
1411 unsigned int tmp;
1412 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1413 struct ocfs2_group_desc *undo_bg = NULL;
1414
1415 mlog_entry_void();
1416
1417 if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1418 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1419 status = -EIO;
1420 goto bail;
1421 }
1422
1423 mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1424
1425 if (ocfs2_is_cluster_bitmap(alloc_inode))
1426 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1427
1428 status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1429 journal_type);
1430 if (status < 0) {
1431 mlog_errno(status);
1432 goto bail;
1433 }
1434
1435 if (ocfs2_is_cluster_bitmap(alloc_inode))
1436 undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1437
1438 tmp = num_bits;
1439 while(tmp--) {
1440 ocfs2_clear_bit((bit_off + tmp),
1441 (unsigned long *) bg->bg_bitmap);
1442 if (ocfs2_is_cluster_bitmap(alloc_inode))
1443 ocfs2_set_bit(bit_off + tmp,
1444 (unsigned long *) undo_bg->bg_bitmap);
1445 }
1446 le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1447
1448 status = ocfs2_journal_dirty(handle, group_bh);
1449 if (status < 0)
1450 mlog_errno(status);
1451bail:
1452 return status;
1453}
1454
1455/*
1456 * expects the suballoc inode to already be locked.
1457 */
1458static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1459 struct inode *alloc_inode,
1460 struct buffer_head *alloc_bh,
1461 unsigned int start_bit,
1462 u64 bg_blkno,
1463 unsigned int count)
1464{
1465 int status = 0;
1466 u32 tmp_used;
1467 struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1468 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1469 struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1470 struct buffer_head *group_bh = NULL;
1471 struct ocfs2_group_desc *group;
1472
1473 mlog_entry_void();
1474
1475 if (!OCFS2_IS_VALID_DINODE(fe)) {
1476 OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1477 status = -EIO;
1478 goto bail;
1479 }
1480 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1481
1482 mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
1483 ", starting at %u\n",
1484 OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
1485 start_bit);
1486
1487 status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1488 alloc_inode);
1489 if (status < 0) {
1490 mlog_errno(status);
1491 goto bail;
1492 }
1493
1494 group = (struct ocfs2_group_desc *) group_bh->b_data;
1495 if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
1496 OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
1497 status = -EIO;
1498 goto bail;
1499 }
1500 BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1501
1502 status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1503 group, group_bh,
1504 start_bit, count);
1505 if (status < 0) {
1506 mlog_errno(status);
1507 goto bail;
1508 }
1509
1510 status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1511 OCFS2_JOURNAL_ACCESS_WRITE);
1512 if (status < 0) {
1513 mlog_errno(status);
1514 goto bail;
1515 }
1516
1517 le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1518 count);
1519 tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1520 fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1521
1522 status = ocfs2_journal_dirty(handle, alloc_bh);
1523 if (status < 0) {
1524 mlog_errno(status);
1525 goto bail;
1526 }
1527
1528bail:
1529 if (group_bh)
1530 brelse(group_bh);
1531
1532 mlog_exit(status);
1533 return status;
1534}
1535
1536static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1537{
1538 u64 group = block - (u64) bit;
1539
1540 return group;
1541}
1542
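In other words, a suballocated block always sits exactly `bit' blocks past its group descriptor, so the descriptor's block number falls out by subtraction. For a hypothetical inode at block 5012 with i_suballoc_bit == 12, the owning group descriptor is at block 5012 - 12 = 5000; ocfs2_free_dinode() below relies on exactly this to find the group from nothing but the dinode itself.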
1543int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1544 struct inode *inode_alloc_inode,
1545 struct buffer_head *inode_alloc_bh,
1546 struct ocfs2_dinode *di)
1547{
1548 u64 blk = le64_to_cpu(di->i_blkno);
1549 u16 bit = le16_to_cpu(di->i_suballoc_bit);
1550 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1551
1552 return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1553 inode_alloc_bh, bit, bg_blkno, 1);
1554}
1555
1556int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1557 struct inode *eb_alloc_inode,
1558 struct buffer_head *eb_alloc_bh,
1559 struct ocfs2_extent_block *eb)
1560{
1561 u64 blk = le64_to_cpu(eb->h_blkno);
1562 u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1563 u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1564
1565 return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1566 bit, bg_blkno, 1);
1567}
1568
1569int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1570 struct inode *bitmap_inode,
1571 struct buffer_head *bitmap_bh,
1572 u64 start_blk,
1573 unsigned int num_clusters)
1574{
1575 int status;
1576 u16 bg_start_bit;
1577 u64 bg_blkno;
1578 struct ocfs2_dinode *fe;
1579
1580 /* You can't ever have a contiguous set of clusters
1581 * bigger than a block group bitmap so we never have to worry
1582 * about looping on them. */
1583
1584 mlog_entry_void();
1585
1586 /* This is expensive. We can safely remove once this stuff has
1587 * gotten tested really well. */
1588 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1589
1590 fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1591
1592 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1593 &bg_start_bit);
1594
1595 mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
1596 num_clusters, start_blk);
1597 mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
1598 bg_blkno, bg_start_bit);
1599
1600 status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1601 bg_start_bit, bg_blkno,
1602 num_clusters);
1603 if (status < 0)
1604 mlog_errno(status);
1605
1606 mlog_exit(status);
1607 return status;
1608}
1609
1610static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1611{
1612 printk("Block Group:\n");
1613 printk("bg_signature: %s\n", bg->bg_signature);
1614 printk("bg_size: %u\n", bg->bg_size);
1615 printk("bg_bits: %u\n", bg->bg_bits);
1616 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1617 printk("bg_chain: %u\n", bg->bg_chain);
1618 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
1619 printk("bg_next_group: %"MLFu64"\n", bg->bg_next_group);
1620 printk("bg_parent_dinode: %"MLFu64"\n", bg->bg_parent_dinode);
1621 printk("bg_blkno: %"MLFu64"\n", bg->bg_blkno);
1622}
1623
1624static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1625{
1626 int i;
1627
1628 printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno);
1629 printk("i_signature: %s\n", fe->i_signature);
1630 printk("i_size: %"MLFu64"\n", fe->i_size);
1631 printk("i_clusters: %u\n", fe->i_clusters);
1632 printk("i_generation: %u\n",
1633 le32_to_cpu(fe->i_generation));
1634 printk("id1.bitmap1.i_used: %u\n",
1635 le32_to_cpu(fe->id1.bitmap1.i_used));
1636 printk("id1.bitmap1.i_total: %u\n",
1637 le32_to_cpu(fe->id1.bitmap1.i_total));
1638 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
1639 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
1640 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
1641 printk("id2.i_chain.cl_next_free_rec: %u\n",
1642 fe->id2.i_chain.cl_next_free_rec);
1643 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1644 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
1645 fe->id2.i_chain.cl_recs[i].c_free);
1646 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1647 fe->id2.i_chain.cl_recs[i].c_total);
1648 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
1649 fe->id2.i_chain.cl_recs[i].c_blkno);
1650 }
1651}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
new file mode 100644
index 000000000000..a76c82a7ceac
--- /dev/null
+++ b/fs/ocfs2/suballoc.h
@@ -0,0 +1,132 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.h
5 *
6 * Defines the suballocator API
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#ifndef _CHAINALLOC_H_
27#define _CHAINALLOC_H_
28
29typedef int (group_search_t)(struct inode *,
30 struct buffer_head *,
31 u32,
32 u32,
33 u16 *,
34 u16 *);
35
36struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */
38 struct buffer_head *ac_bh; /* file entry bh */
39 u32 ac_bits_wanted;
40 u32 ac_bits_given;
41#define OCFS2_AC_USE_LOCAL 1
42#define OCFS2_AC_USE_MAIN 2
43#define OCFS2_AC_USE_INODE 3
44#define OCFS2_AC_USE_META 4
45 u32 ac_which;
46 struct ocfs2_journal_handle *ac_handle;
47
48 /* these are used by the chain search */
49 u16 ac_chain;
50 int ac_allow_chain_relink;
51 group_search_t *ac_group_search;
52};
53
54void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
55static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
56{
57 return ac->ac_bits_wanted - ac->ac_bits_given;
58}
59
60int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
61 struct ocfs2_journal_handle *handle,
62 struct ocfs2_dinode *fe,
63 struct ocfs2_alloc_context **ac);
64int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
65 struct ocfs2_journal_handle *handle,
66 struct ocfs2_alloc_context **ac);
67int ocfs2_reserve_clusters(struct ocfs2_super *osb,
68 struct ocfs2_journal_handle *handle,
69 u32 bits_wanted,
70 struct ocfs2_alloc_context **ac);
71
72int ocfs2_claim_metadata(struct ocfs2_super *osb,
73 struct ocfs2_journal_handle *handle,
74 struct ocfs2_alloc_context *ac,
75 u32 bits_wanted,
76 u16 *suballoc_bit_start,
77 u32 *num_bits,
78 u64 *blkno_start);
79int ocfs2_claim_new_inode(struct ocfs2_super *osb,
80 struct ocfs2_journal_handle *handle,
81 struct ocfs2_alloc_context *ac,
82 u16 *suballoc_bit,
83 u64 *fe_blkno);
84int ocfs2_claim_clusters(struct ocfs2_super *osb,
85 struct ocfs2_journal_handle *handle,
86 struct ocfs2_alloc_context *ac,
87 u32 min_clusters,
88 u32 *cluster_start,
89 u32 *num_clusters);
90
91int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
92 struct inode *inode_alloc_inode,
93 struct buffer_head *inode_alloc_bh,
94 struct ocfs2_dinode *di);
95int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
96 struct inode *eb_alloc_inode,
97 struct buffer_head *eb_alloc_bh,
98 struct ocfs2_extent_block *eb);
99int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
100 struct inode *bitmap_inode,
101 struct buffer_head *bitmap_bh,
102 u64 start_blk,
103 unsigned int num_clusters);
104
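Read together, the declarations above describe a two-phase allocation API: a reserve step that selects a bitmap inode and pins space in an ocfs2_alloc_context, then a claim step that performs the actual chain search and marks bits in use. A rough sketch of a caller, with error paths trimmed and `osb' and `handle' assumed to come from the caller's transaction setup:

static int example_alloc_clusters(struct ocfs2_super *osb,
				  struct ocfs2_journal_handle *handle)
{
	struct ocfs2_alloc_context *ac = NULL;
	u32 cluster_start, num_clusters;
	int status;

	/* phase 1: reserve space (may pick the local alloc or main bitmap) */
	status = ocfs2_reserve_clusters(osb, handle, 4, &ac);
	if (status < 0)
		return status;

	/* phase 2: claim it -- at least 1 cluster, up to what was reserved */
	status = ocfs2_claim_clusters(osb, handle, ac, 1,
				      &cluster_start, &num_clusters);

	ocfs2_free_alloc_context(ac);
	return status;
}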
105static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
106 u64 bg_blkno)
107{
108 /* This should work for all block group descriptors as only
109 * the 1st group descriptor of the cluster bitmap is
110 * different. */
111
112 if (bg_blkno == osb->first_cluster_group_blkno)
113 return 0;
114
115 /* the rest of the block groups are located at the beginning
116 * of their 1st cluster, so a direct translation just
117 * works. */
118 return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
119}
120
121static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
122{
123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
124 return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
125}
126
127/* This is for local alloc ONLY. Others should use the task-specific
128 * APIs above. */
129int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
130 struct ocfs2_alloc_context *ac);
131
132#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
new file mode 100644
index 000000000000..48bf7f0ce544
--- /dev/null
+++ b/fs/ocfs2/super.c
@@ -0,0 +1,1733 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * super.c
5 *
6 * load/unload driver, mount/dismount volumes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h>
33#include <linux/random.h>
34#include <linux/statfs.h>
35#include <linux/moduleparam.h>
36#include <linux/blkdev.h>
37#include <linux/socket.h>
38#include <linux/inet.h>
39#include <linux/parser.h>
40#include <linux/crc32.h>
41#include <linux/debugfs.h>
42
43#include <cluster/nodemanager.h>
44
45#define MLOG_MASK_PREFIX ML_SUPER
46#include <cluster/masklog.h>
47
48#include "ocfs2.h"
49
50/* this should be the only file to include a version 1 header */
51#include "ocfs1_fs_compat.h"
52
53#include "alloc.h"
54#include "dlmglue.h"
55#include "export.h"
56#include "extent_map.h"
57#include "heartbeat.h"
58#include "inode.h"
59#include "journal.h"
60#include "localalloc.h"
61#include "namei.h"
62#include "slot_map.h"
63#include "super.h"
64#include "sysfile.h"
65#include "uptodate.h"
66#include "ver.h"
67#include "vote.h"
68
69#include "buffer_head_io.h"
70
71/*
72 * Globals
73 */
74static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
75
76static u32 osb_id; /* Keeps track of next available OSB Id */
77
78static kmem_cache_t *ocfs2_inode_cachep = NULL;
79
80kmem_cache_t *ocfs2_lock_cache = NULL;
81
82/* OCFS2 needs to schedule several different types of work which
83 * require cluster locking, disk I/O, recovery waits, etc. Since these
84 * types of work tend to be heavy we avoid using the kernel events
85 * workqueue and schedule on our own. */
86struct workqueue_struct *ocfs2_wq = NULL;
87
88static struct dentry *ocfs2_debugfs_root = NULL;
89
90MODULE_AUTHOR("Oracle");
91MODULE_LICENSE("GPL");
92
93static int ocfs2_parse_options(struct super_block *sb, char *options,
94 unsigned long *mount_opt, int is_remount);
95static void ocfs2_put_super(struct super_block *sb);
96static int ocfs2_mount_volume(struct super_block *sb);
97static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
98static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err);
99static int ocfs2_initialize_mem_caches(void);
100static void ocfs2_free_mem_caches(void);
101static void ocfs2_delete_osb(struct ocfs2_super *osb);
102
103static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
104
105static int ocfs2_sync_fs(struct super_block *sb, int wait);
106
107static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
108static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
109static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
110static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
111static int ocfs2_check_volume(struct ocfs2_super *osb);
112static int ocfs2_verify_volume(struct ocfs2_dinode *di,
113 struct buffer_head *bh,
114 u32 sectsize);
115static int ocfs2_initialize_super(struct super_block *sb,
116 struct buffer_head *bh,
117 int sector_size);
118static int ocfs2_get_sector(struct super_block *sb,
119 struct buffer_head **bh,
120 int block,
121 int sect_size);
122static void ocfs2_write_super(struct super_block *sb);
123static struct inode *ocfs2_alloc_inode(struct super_block *sb);
124static void ocfs2_destroy_inode(struct inode *inode);
125
126static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
127
128static struct super_operations ocfs2_sops = {
129 .statfs = ocfs2_statfs,
130 .alloc_inode = ocfs2_alloc_inode,
131 .destroy_inode = ocfs2_destroy_inode,
132 .drop_inode = ocfs2_drop_inode,
133 .clear_inode = ocfs2_clear_inode,
134 .delete_inode = ocfs2_delete_inode,
135 .sync_fs = ocfs2_sync_fs,
136 .write_super = ocfs2_write_super,
137 .put_super = ocfs2_put_super,
138 .remount_fs = ocfs2_remount,
139};
140
141enum {
142 Opt_barrier,
143 Opt_err_panic,
144 Opt_err_ro,
145 Opt_intr,
146 Opt_nointr,
147 Opt_hb_none,
148 Opt_hb_local,
149 Opt_data_ordered,
150 Opt_data_writeback,
151 Opt_err,
152};
153
154static match_table_t tokens = {
155 {Opt_barrier, "barrier=%u"},
156 {Opt_err_panic, "errors=panic"},
157 {Opt_err_ro, "errors=remount-ro"},
158 {Opt_intr, "intr"},
159 {Opt_nointr, "nointr"},
160 {Opt_hb_none, OCFS2_HB_NONE},
161 {Opt_hb_local, OCFS2_HB_LOCAL},
162 {Opt_data_ordered, "data=ordered"},
163 {Opt_data_writeback, "data=writeback"},
164 {Opt_err, NULL}
165};
166
167/*
168 * write_super and sync_fs ripped right out of ext3.
169 */
170static void ocfs2_write_super(struct super_block *sb)
171{
172 if (down_trylock(&sb->s_lock) == 0)
173 BUG();
174 sb->s_dirt = 0;
175}
176
177static int ocfs2_sync_fs(struct super_block *sb, int wait)
178{
179 int status = 0;
180 tid_t target;
181 struct ocfs2_super *osb = OCFS2_SB(sb);
182
183 sb->s_dirt = 0;
184
185 if (ocfs2_is_hard_readonly(osb))
186 return -EROFS;
187
188 if (wait) {
189 status = ocfs2_flush_truncate_log(osb);
190 if (status < 0)
191 mlog_errno(status);
192 } else {
193 ocfs2_schedule_truncate_log_flush(osb, 0);
194 }
195
196 if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
197 if (wait)
198 log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
199 target);
200 }
201 return 0;
202}
203
204static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
205{
206 struct inode *new = NULL;
207 int status = 0;
208 int i;
209
210 mlog_entry_void();
211
212 new = ocfs2_iget(osb, osb->root_blkno);
213 if (IS_ERR(new)) {
214 status = PTR_ERR(new);
215 mlog_errno(status);
216 goto bail;
217 }
218 osb->root_inode = new;
219
220 new = ocfs2_iget(osb, osb->system_dir_blkno);
221 if (IS_ERR(new)) {
222 status = PTR_ERR(new);
223 mlog_errno(status);
224 goto bail;
225 }
226 osb->sys_root_inode = new;
227
228 for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
229 i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
230 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
231 if (!new) {
232 ocfs2_release_system_inodes(osb);
233 status = -EINVAL;
234 mlog_errno(status);
235 /* FIXME: Should ERROR_RO_FS */
236 mlog(ML_ERROR, "Unable to load system inode %d, "
237 "possibly corrupt fs?", i);
238 goto bail;
239 }
240 // the array now has one ref, so drop this one
241 iput(new);
242 }
243
244bail:
245 mlog_exit(status);
246 return status;
247}
248
249static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
250{
251 struct inode *new = NULL;
252 int status = 0;
253 int i;
254
255 mlog_entry_void();
256
257 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
258 i < NUM_SYSTEM_INODES;
259 i++) {
260 new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
261 if (!new) {
262 ocfs2_release_system_inodes(osb);
263 status = -EINVAL;
264 mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
265 status, i, osb->slot_num);
266 goto bail;
267 }
268 /* the array now has one ref, so drop this one */
269 iput(new);
270 }
271
272bail:
273 mlog_exit(status);
274 return status;
275}
276
277static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
278{
279 int status = 0, i;
280 struct inode *inode;
281
282 mlog_entry_void();
283
284 for (i = 0; i < NUM_SYSTEM_INODES; i++) {
285 inode = osb->system_inodes[i];
286 if (inode) {
287 iput(inode);
288 osb->system_inodes[i] = NULL;
289 }
290 }
291
292 inode = osb->sys_root_inode;
293 if (inode) {
294 iput(inode);
295 osb->sys_root_inode = NULL;
296 }
297
298 inode = osb->root_inode;
299 if (inode) {
300 iput(inode);
301 osb->root_inode = NULL;
302 }
303
304 mlog_exit(status);
305 return status;
306}
307
308/* We're allocating fs objects, use GFP_NOFS */
309static struct inode *ocfs2_alloc_inode(struct super_block *sb)
310{
311 struct ocfs2_inode_info *oi;
312
313 oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS);
314 if (!oi)
315 return NULL;
316
317 return &oi->vfs_inode;
318}
319
320static void ocfs2_destroy_inode(struct inode *inode)
321{
322 kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode));
323}
324
325/* From xfs_super.c:xfs_max_file_offset
326 * Copyright (c) 2000-2004 Silicon Graphics, Inc.
327 */
328static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
329{
330 unsigned int pagefactor = 1;
331 unsigned int bitshift = BITS_PER_LONG - 1;
332
333 /* Figure out maximum filesize, on Linux this can depend on
334 * the filesystem blocksize (on 32 bit platforms).
335 * __block_prepare_write does this in an [unsigned] long...
336 * page->index << (PAGE_CACHE_SHIFT - bbits)
337 * So, for page sized blocks (4K on 32 bit platforms),
338 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
339 * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
340 * but for smaller blocksizes it is less (bbits = log2 bsize).
341 * Note1: get_block_t takes a long (implicit cast from above)
342 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
343 * can optionally convert the [unsigned] long from above into
344 * an [unsigned] long long.
345 */
346
347#if BITS_PER_LONG == 32
348# if defined(CONFIG_LBD)
349 BUG_ON(sizeof(sector_t) != 8);
350 pagefactor = PAGE_CACHE_SIZE;
351 bitshift = BITS_PER_LONG;
352# else
353 pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
354# endif
355#endif
356
357 return (((unsigned long long)pagefactor) << bitshift) - 1;
358}
359
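A worked example makes the limits concrete: on 32-bit without CONFIG_LBD and 4K pages, 4K blocks give (4096 << 31) - 1 = 2^43 - 1 (just under 8TB), while 1K blocks give 2^41 - 1 (just under 2TB); on 64-bit the function degenerates to 2^63 - 1. A runnable userspace check of the 32-bit case (PAGE_CACHE_SHIFT assumed to be 12):

#include <stdio.h>

/* Userspace model of ocfs2_max_file_offset(); assumes 4K pages and a
 * 32-bit kernel without CONFIG_LBD, the most restrictive combination. */
static unsigned long long max_off_32bit(unsigned int blockshift)
{
	unsigned int pagefactor = 4096u >> (12 - blockshift);
	unsigned int bitshift = 32 - 1;

	return (((unsigned long long)pagefactor) << bitshift) - 1;
}

int main(void)
{
	/* 4K blocks: (4096 << 31) - 1 = 2^43 - 1, just under 8TB */
	printf("%llu\n", max_off_32bit(12));
	/* 1K blocks: (1024 << 31) - 1 = 2^41 - 1, just under 2TB */
	printf("%llu\n", max_off_32bit(10));
	return 0;
}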
360static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
361{
362 int incompat_features;
363 int ret = 0;
364 unsigned long parsed_options;
365 struct ocfs2_super *osb = OCFS2_SB(sb);
366
367 if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
368 ret = -EINVAL;
369 goto out;
370 }
371
372 if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) !=
373 (parsed_options & OCFS2_MOUNT_HB_LOCAL)) {
374 ret = -EINVAL;
375 mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n");
376 goto out;
377 }
378
379 if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) !=
380 (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) {
381 ret = -EINVAL;
382 mlog(ML_ERROR, "Cannot change data mode on remount\n");
383 goto out;
384 }
385
386 /* We're going to/from readonly mode. */
387 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
388 /* Lock here so the check of HARD_RO and the potential
389 * setting of SOFT_RO is atomic. */
390 spin_lock(&osb->osb_lock);
391 if (osb->osb_flags & OCFS2_OSB_HARD_RO) {
392 mlog(ML_ERROR, "Remount on readonly device is forbidden.\n");
393 ret = -EROFS;
394 goto unlock_osb;
395 }
396
397 if (*flags & MS_RDONLY) {
398 mlog(0, "Going to ro mode.\n");
399 sb->s_flags |= MS_RDONLY;
400 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
401 } else {
402 mlog(0, "Making ro filesystem writeable.\n");
403
404 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
405 mlog(ML_ERROR, "Cannot remount RDWR "
406 "filesystem due to previous errors.\n");
407 ret = -EROFS;
408 goto unlock_osb;
409 }
410 incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP);
411 if (incompat_features) {
412 mlog(ML_ERROR, "Cannot remount RDWR because "
413 "of unsupported optional features "
414 "(%x).\n", incompat_features);
415 ret = -EINVAL;
416 goto unlock_osb;
417 }
418 sb->s_flags &= ~MS_RDONLY;
419 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
420 }
421unlock_osb:
422 spin_unlock(&osb->osb_lock);
423 }
424
425 if (!ret) {
426 if (!ocfs2_is_hard_readonly(osb))
427 ocfs2_set_journal_params(osb);
428
429 /* Only save off the new mount options in case of a successful
430 * remount. */
431 osb->s_mount_opt = parsed_options;
432 }
433out:
434 return ret;
435}
436
437static int ocfs2_sb_probe(struct super_block *sb,
438 struct buffer_head **bh,
439 int *sector_size)
440{
441 int status = 0, tmpstat;
442 struct ocfs1_vol_disk_hdr *hdr;
443 struct ocfs2_dinode *di;
444 int blksize;
445
446 *bh = NULL;
447
448 /* may be > 512 */
449 *sector_size = bdev_hardsect_size(sb->s_bdev);
450 if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
451 mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
452 *sector_size, OCFS2_MAX_BLOCKSIZE);
453 status = -EINVAL;
454 goto bail;
455 }
456
457 /* Can this really happen? */
458 if (*sector_size < OCFS2_MIN_BLOCKSIZE)
459 *sector_size = OCFS2_MIN_BLOCKSIZE;
460
461 /* check block zero for old format */
462 status = ocfs2_get_sector(sb, bh, 0, *sector_size);
463 if (status < 0) {
464 mlog_errno(status);
465 goto bail;
466 }
467 hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data;
468 if (hdr->major_version == OCFS1_MAJOR_VERSION) {
469 mlog(ML_ERROR, "incompatible version: %u.%u\n",
470 hdr->major_version, hdr->minor_version);
471 status = -EINVAL;
472 }
473 if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE,
474 strlen(OCFS1_VOLUME_SIGNATURE)) == 0) {
475 mlog(ML_ERROR, "incompatible volume signature: %8s\n",
476 hdr->signature);
477 status = -EINVAL;
478 }
479 brelse(*bh);
480 *bh = NULL;
481 if (status < 0) {
482 mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be "
483 "upgraded before mounting with ocfs v2\n");
484 goto bail;
485 }
486
487 /*
488 * Now check at magic offset for 512, 1024, 2048, 4096
489 * blocksizes. 4096 is the maximum blocksize because it is
490 * the minimum clustersize.
491 */
492 status = -EINVAL;
493 for (blksize = *sector_size;
494 blksize <= OCFS2_MAX_BLOCKSIZE;
495 blksize <<= 1) {
496 tmpstat = ocfs2_get_sector(sb, bh,
497 OCFS2_SUPER_BLOCK_BLKNO,
498 blksize);
499 if (tmpstat < 0) {
500 status = tmpstat;
501 mlog_errno(status);
502 goto bail;
503 }
504 di = (struct ocfs2_dinode *) (*bh)->b_data;
505 status = ocfs2_verify_volume(di, *bh, blksize);
506 if (status >= 0)
507 goto bail;
508 brelse(*bh);
509 *bh = NULL;
510 if (status != -EAGAIN)
511 break;
512 }
513
514bail:
515 return status;
516}
517
518static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
519{
520 struct dentry *root;
521 int status, sector_size;
522 unsigned long parsed_opt;
523 struct inode *inode = NULL;
524 struct ocfs2_super *osb = NULL;
525 struct buffer_head *bh = NULL;
526
527 mlog_entry("%p, %p, %i", sb, data, silent);
528
529 /* for now we only have one cluster/node, make sure we see it
530 * in the heartbeat universe */
531 if (!o2hb_check_local_node_heartbeating()) {
532 status = -EINVAL;
533 goto read_super_error;
534 }
535
536 /* probe for superblock */
537 status = ocfs2_sb_probe(sb, &bh, &sector_size);
538 if (status < 0) {
539 mlog(ML_ERROR, "superblock probe failed!\n");
540 goto read_super_error;
541 }
542
543 status = ocfs2_initialize_super(sb, bh, sector_size);
544 osb = OCFS2_SB(sb);
545 if (status < 0) {
546 mlog_errno(status);
547 goto read_super_error;
548 }
549 brelse(bh);
550 bh = NULL;
551
552 if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
553 status = -EINVAL;
554 goto read_super_error;
555 }
556 osb->s_mount_opt = parsed_opt;
557
558 sb->s_magic = OCFS2_SUPER_MAGIC;
559
560 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
561 * heartbeat=none */
562 if (bdev_read_only(sb->s_bdev)) {
563 if (!(sb->s_flags & MS_RDONLY)) {
564 status = -EACCES;
565 mlog(ML_ERROR, "Readonly device detected but readonly "
566 "mount was not specified.\n");
567 goto read_super_error;
568 }
569
570 /* You should not be able to start a local heartbeat
571 * on a readonly device. */
572 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
573 status = -EROFS;
574 mlog(ML_ERROR, "Local heartbeat specified on readonly "
575 "device.\n");
576 goto read_super_error;
577 }
578
579 status = ocfs2_check_journals_nolocks(osb);
580 if (status < 0) {
581 if (status == -EROFS)
582 mlog(ML_ERROR, "Recovery required on readonly "
583 "file system, but write access is "
584 "unavailable.\n");
585 else
586 mlog_errno(status);
587 goto read_super_error;
588 }
589
590 ocfs2_set_ro_flag(osb, 1);
591
592 printk(KERN_NOTICE "Readonly device detected. No cluster "
593 "services will be utilized for this mount. Recovery "
594 "will be skipped.\n");
595 }
596
597 if (!ocfs2_is_hard_readonly(osb)) {
598 /* If this isn't a hard readonly mount, then we need
599 * to make sure that heartbeat is in a valid state,
600 * and that we mark ourselves soft readonly if -o ro
601 * was specified. */
602 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
603 mlog(ML_ERROR, "No heartbeat for device (%s)\n",
604 sb->s_id);
605 status = -EINVAL;
606 goto read_super_error;
607 }
608
609 if (sb->s_flags & MS_RDONLY)
610 ocfs2_set_ro_flag(osb, 0);
611 }
612
613 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
614 ocfs2_debugfs_root);
615 if (!osb->osb_debug_root) {
616 status = -EINVAL;
617 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
618 goto read_super_error;
619 }
620
621 status = ocfs2_mount_volume(sb);
622 if (osb->root_inode)
623 inode = igrab(osb->root_inode);
624
625 if (status < 0)
626 goto read_super_error;
627
628 if (!inode) {
629 status = -EIO;
630 mlog_errno(status);
631 goto read_super_error;
632 }
633
634 root = d_alloc_root(inode);
635 if (!root) {
636 status = -ENOMEM;
637 mlog_errno(status);
638 goto read_super_error;
639 }
640
641 sb->s_root = root;
642
643 ocfs2_complete_mount_recovery(osb);
644
645 printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
646 "data mode.\n",
647 MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
648 osb->slot_num,
649 osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
650 "ordered");
651
652 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
653 wake_up(&osb->osb_mount_event);
654
655 mlog_exit(status);
656 return status;
657
658read_super_error:
659 if (bh != NULL)
660 brelse(bh);
661
662 if (inode)
663 iput(inode);
664
665 if (osb) {
666 atomic_set(&osb->vol_state, VOLUME_DISABLED);
667 wake_up(&osb->osb_mount_event);
668 ocfs2_dismount_volume(sb, 1);
669 }
670
671 mlog_exit(status);
672 return status;
673}
674
675static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
676 int flags,
677 const char *dev_name,
678 void *data)
679{
680 return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
681}
682
683static struct file_system_type ocfs2_fs_type = {
684 .owner = THIS_MODULE,
685 .name = "ocfs2",
686 .get_sb = ocfs2_get_sb, /* is this called when we mount
687 * the fs? */
688 .kill_sb = kill_block_super, /* set to the generic one
689 * right now, but do we
690 * need to change that? */
691 .fs_flags = FS_REQUIRES_DEV,
692 .next = NULL
693};
694
695static int ocfs2_parse_options(struct super_block *sb,
696 char *options,
697 unsigned long *mount_opt,
698 int is_remount)
699{
700 int status;
701 char *p;
702
703 mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
704 options ? options : "(none)");
705
706 *mount_opt = 0;
707
708 if (!options) {
709 status = 1;
710 goto bail;
711 }
712
713 while ((p = strsep(&options, ",")) != NULL) {
714 int token, option;
715 substring_t args[MAX_OPT_ARGS];
716
717 if (!*p)
718 continue;
719
720 token = match_token(p, tokens, args);
721 switch (token) {
722 case Opt_hb_local:
723 *mount_opt |= OCFS2_MOUNT_HB_LOCAL;
724 break;
725 case Opt_hb_none:
726 *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL;
727 break;
728 case Opt_barrier:
729 if (match_int(&args[0], &option)) {
730 status = 0;
731 goto bail;
732 }
733 if (option)
734 *mount_opt |= OCFS2_MOUNT_BARRIER;
735 else
736 *mount_opt &= ~OCFS2_MOUNT_BARRIER;
737 break;
738 case Opt_intr:
739 *mount_opt &= ~OCFS2_MOUNT_NOINTR;
740 break;
741 case Opt_nointr:
742 *mount_opt |= OCFS2_MOUNT_NOINTR;
743 break;
744 case Opt_err_panic:
745 *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
746 break;
747 case Opt_err_ro:
748 *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
749 break;
750 case Opt_data_ordered:
751 *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
752 break;
753 case Opt_data_writeback:
754 *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
755 break;
756 default:
757 mlog(ML_ERROR,
758 "Unrecognized mount option \"%s\" "
759 "or missing value\n", p);
760 status = 0;
761 goto bail;
762 }
763 }
764
765 status = 1;
766
767bail:
768 mlog_exit(status);
769 return status;
770}
771
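So for a hypothetical option string such as "data=writeback,nointr,barrier=1", the loop above would leave OCFS2_MOUNT_DATA_WRITEBACK, OCFS2_MOUNT_NOINTR and OCFS2_MOUNT_BARRIER set in *mount_opt. Note the inverted return convention relative to most of this file: 1 means success and 0 means a parse failure, which the callers in ocfs2_fill_super() and ocfs2_remount() translate to -EINVAL.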
772static int __init ocfs2_init(void)
773{
774 int status;
775
776 mlog_entry_void();
777
778 ocfs2_print_version();
779
780 if (init_ocfs2_extent_maps())
781 return -ENOMEM;
782
783 status = init_ocfs2_uptodate_cache();
784 if (status < 0) {
785 mlog_errno(status);
786 goto leave;
787 }
788
789 status = ocfs2_initialize_mem_caches();
790 if (status < 0) {
791 mlog_errno(status);
792 goto leave;
793 }
794
795 ocfs2_wq = create_singlethread_workqueue("ocfs2_wq");
796 if (!ocfs2_wq) {
797 status = -ENOMEM;
798 goto leave;
799 }
800
801 spin_lock(&ocfs2_globals_lock);
802 osb_id = 0;
803 spin_unlock(&ocfs2_globals_lock);
804
805 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
806 if (!ocfs2_debugfs_root) {
807 status = -EFAULT;
808 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
809 }
810
811leave:
812 if (status < 0) {
813 ocfs2_free_mem_caches();
814 exit_ocfs2_uptodate_cache();
815 exit_ocfs2_extent_maps();
816 }
817
818 mlog_exit(status);
819
820 if (status >= 0) {
821 return register_filesystem(&ocfs2_fs_type);
822 } else
823 return status;
824}
825
826static void __exit ocfs2_exit(void)
827{
828 mlog_entry_void();
829
830 if (ocfs2_wq) {
831 flush_workqueue(ocfs2_wq);
832 destroy_workqueue(ocfs2_wq);
833 }
834
835 debugfs_remove(ocfs2_debugfs_root);
836
837 ocfs2_free_mem_caches();
838
839 unregister_filesystem(&ocfs2_fs_type);
840
841 exit_ocfs2_extent_maps();
842
843 exit_ocfs2_uptodate_cache();
844
845 mlog_exit_void();
846}
847
848static void ocfs2_put_super(struct super_block *sb)
849{
850 mlog_entry("(0x%p)\n", sb);
851
852 ocfs2_sync_blockdev(sb);
853 ocfs2_dismount_volume(sb, 0);
854
855 mlog_exit_void();
856}
857
858static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
859{
860 struct ocfs2_super *osb;
861 u32 numbits, freebits;
862 int status;
863 struct ocfs2_dinode *bm_lock;
864 struct buffer_head *bh = NULL;
865 struct inode *inode = NULL;
866
867 mlog_entry("(%p, %p)\n", sb, buf);
868
869 osb = OCFS2_SB(sb);
870
871 inode = ocfs2_get_system_file_inode(osb,
872 GLOBAL_BITMAP_SYSTEM_INODE,
873 OCFS2_INVALID_SLOT);
874 if (!inode) {
875 mlog(ML_ERROR, "failed to get bitmap inode\n");
876 status = -EIO;
877 goto bail;
878 }
879
880 status = ocfs2_meta_lock(inode, NULL, &bh, 0);
881 if (status < 0) {
882 mlog_errno(status);
883 goto bail;
884 }
885
886 bm_lock = (struct ocfs2_dinode *) bh->b_data;
887
888 numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total);
889 freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
890
891 buf->f_type = OCFS2_SUPER_MAGIC;
892 buf->f_bsize = sb->s_blocksize;
893 buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
894 buf->f_blocks = ((sector_t) numbits) *
895 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
896 buf->f_bfree = ((sector_t) freebits) *
897 (osb->s_clustersize >> osb->sb->s_blocksize_bits);
898 buf->f_bavail = buf->f_bfree;
899 buf->f_files = numbits;
900 buf->f_ffree = freebits;
901
902 brelse(bh);
903
904 ocfs2_meta_unlock(inode, 0);
905 status = 0;
906bail:
907 if (inode)
908 iput(inode);
909
910 mlog_exit(status);
911
912 return status;
913}
914
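The only non-obvious step above is the scaling: each bit in the global bitmap represents one cluster, so bit counts are multiplied by blocks-per-cluster (s_clustersize >> s_blocksize_bits) to get the block units statfs expects. A runnable example with hypothetical numbers:

#include <stdio.h>

/* Userspace model of the statfs scaling above: a hypothetical volume
 * with 4K filesystem blocks and 32K clusters. */
int main(void)
{
	unsigned int clustersize = 32768, blocksize_bits = 12;
	unsigned int blocks_per_cluster = clustersize >> blocksize_bits; /* 8 */
	unsigned long long numbits = 1000000, used = 250000;

	printf("f_blocks=%llu f_bfree=%llu\n",
	       numbits * blocks_per_cluster,		/* 8000000 */
	       (numbits - used) * blocks_per_cluster);	/* 6000000 */
	return 0;
}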
915static void ocfs2_inode_init_once(void *data,
916 kmem_cache_t *cachep,
917 unsigned long flags)
918{
919 struct ocfs2_inode_info *oi = data;
920
921 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
922 SLAB_CTOR_CONSTRUCTOR) {
923 oi->ip_flags = 0;
924 oi->ip_open_count = 0;
925 spin_lock_init(&oi->ip_lock);
926 ocfs2_extent_map_init(&oi->vfs_inode);
927 INIT_LIST_HEAD(&oi->ip_handle_list);
928 INIT_LIST_HEAD(&oi->ip_io_markers);
929 oi->ip_handle = NULL;
930 oi->ip_created_trans = 0;
931 oi->ip_last_trans = 0;
932 oi->ip_dir_start_lookup = 0;
933
934 init_rwsem(&oi->ip_alloc_sem);
935 init_MUTEX(&(oi->ip_io_sem));
936
937 oi->ip_blkno = 0ULL;
938 oi->ip_clusters = 0;
939
940 ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
941 ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
942 ocfs2_lock_res_init_once(&oi->ip_data_lockres);
943
944 ocfs2_metadata_cache_init(&oi->vfs_inode);
945
946 inode_init_once(&oi->vfs_inode);
947 }
948}
949
950static int ocfs2_initialize_mem_caches(void)
951{
952 ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache",
953 sizeof(struct ocfs2_inode_info),
954 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
955 ocfs2_inode_init_once, NULL);
956 if (!ocfs2_inode_cachep)
957 return -ENOMEM;
958
959 ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
960 sizeof(struct ocfs2_journal_lock),
961 0,
962 SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
963 NULL, NULL);
964 if (!ocfs2_lock_cache)
965 return -ENOMEM;
966
967 return 0;
968}
969
970static void ocfs2_free_mem_caches(void)
971{
972 if (ocfs2_inode_cachep)
973 kmem_cache_destroy(ocfs2_inode_cachep);
974 if (ocfs2_lock_cache)
975 kmem_cache_destroy(ocfs2_lock_cache);
976
977 ocfs2_inode_cachep = NULL;
978 ocfs2_lock_cache = NULL;
979}
980
981static int ocfs2_get_sector(struct super_block *sb,
982 struct buffer_head **bh,
983 int block,
984 int sect_size)
985{
986 if (!sb_set_blocksize(sb, sect_size)) {
987 mlog(ML_ERROR, "unable to set blocksize\n");
988 return -EIO;
989 }
990
991 *bh = sb_getblk(sb, block);
992 if (!*bh) {
993 mlog_errno(-EIO);
994 return -EIO;
995 }
996 lock_buffer(*bh);
997 if (!buffer_dirty(*bh))
998 clear_buffer_uptodate(*bh);
999 unlock_buffer(*bh);
1000 ll_rw_block(READ, 1, bh);
1001 wait_on_buffer(*bh);
1002 return 0;
1003}
1004
1005/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
1006static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
1007{
1008 int status;
1009
1010 /* XXX hold a ref on the node while mounted? easy enough, if
1011 * desirable. */
1012 osb->node_num = o2nm_this_node();
1013 if (osb->node_num == O2NM_MAX_NODES) {
1014 mlog(ML_ERROR, "could not find this host's node number\n");
1015 status = -ENOENT;
1016 goto bail;
1017 }
1018
1019 mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
1020
1021 status = 0;
1022bail:
1023 return status;
1024}
1025
1026static int ocfs2_mount_volume(struct super_block *sb)
1027{
1028 int status = 0;
1029 int unlock_super = 0;
1030 struct ocfs2_super *osb = OCFS2_SB(sb);
1031
1032 mlog_entry_void();
1033
1034 if (ocfs2_is_hard_readonly(osb))
1035 goto leave;
1036
1037 status = ocfs2_fill_local_node_info(osb);
1038 if (status < 0) {
1039 mlog_errno(status);
1040 goto leave;
1041 }
1042
1043 status = ocfs2_register_hb_callbacks(osb);
1044 if (status < 0) {
1045 mlog_errno(status);
1046 goto leave;
1047 }
1048
1049 status = ocfs2_dlm_init(osb);
1050 if (status < 0) {
1051 mlog_errno(status);
1052 goto leave;
1053 }
1054
1055 /* requires vote_thread to be running. */
1056 status = ocfs2_register_net_handlers(osb);
1057 if (status < 0) {
1058 mlog_errno(status);
1059 goto leave;
1060 }
1061
1062 status = ocfs2_super_lock(osb, 1);
1063 if (status < 0) {
1064 mlog_errno(status);
1065 goto leave;
1066 }
1067 unlock_super = 1;
1068
1069 /* This will load up the node map and add ourselves to it. */
1070 status = ocfs2_find_slot(osb);
1071 if (status < 0) {
1072 mlog_errno(status);
1073 goto leave;
1074 }
1075
1076 ocfs2_populate_mounted_map(osb);
1077
1078 /* load all node-local system inodes */
1079 status = ocfs2_init_local_system_inodes(osb);
1080 if (status < 0) {
1081 mlog_errno(status);
1082 goto leave;
1083 }
1084
1085 status = ocfs2_check_volume(osb);
1086 if (status < 0) {
1087 mlog_errno(status);
1088 goto leave;
1089 }
1090
1091 status = ocfs2_truncate_log_init(osb);
1092 if (status < 0) {
1093 mlog_errno(status);
1094 goto leave;
1095 }
1096
1097 /* This should be sent *after* we recovered our journal as it
1098 * will cause other nodes to unmark us as needing
1099 * recovery. However, we need to send it *before* dropping the
1100 * super block lock as otherwise their recovery threads might
1101 * try to clean us up while we're live! */
1102 status = ocfs2_request_mount_vote(osb);
1103 if (status < 0)
1104 mlog_errno(status);
1105
1106leave:
1107 if (unlock_super)
1108 ocfs2_super_unlock(osb, 1);
1109
1110 mlog_exit(status);
1111 return status;
1112}
1113
1114/* we can't grab the goofy sem lock from inside wait_event, so we use
1115 * memory barriers to make sure that we'll see the null task before
1116 * being woken up */
1117static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
1118{
1119 mb();
1120 return osb->recovery_thread_task != NULL;
1121}
1122
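For the mb() above to buy anything, the recovery thread has to publish its exit in the mirror order: clear the task pointer, issue a barrier, then wake the waiters. A sketch of that writer side, which is assumed here and is not part of this hunk:

/* Hypothetical writer side (the real exit path lives in the journal
 * code): clear the pointer, then barrier, then wake -- so a waiter
 * woken by wake_up() is guaranteed to observe the NULL task pointer. */
osb->recovery_thread_task = NULL;
mb();
wake_up(&osb->recovery_event);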
1123static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1124{
1125 int tmp;
1126 struct ocfs2_super *osb = NULL;
1127
1128 mlog_entry("(0x%p)\n", sb);
1129
1130 BUG_ON(!sb);
1131 osb = OCFS2_SB(sb);
1132 BUG_ON(!osb);
1133
1134 ocfs2_shutdown_local_alloc(osb);
1135
1136 ocfs2_truncate_log_shutdown(osb);
1137
1138 /* disable any new recovery threads and wait for any currently
1139 * running ones to exit. Do this before setting the vol_state. */
1140 down(&osb->recovery_lock);
1141 osb->disable_recovery = 1;
1142 up(&osb->recovery_lock);
1143 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1144
1145 /* At this point, we know that no more recovery threads can be
1146 * launched, so wait for any recovery completion work to
1147 * complete. */
1148 flush_workqueue(ocfs2_wq);
1149
1150 ocfs2_journal_shutdown(osb);
1151
1152 ocfs2_sync_blockdev(sb);
1153
1154 /* No dlm means we've failed during mount, so skip all the
1155 * steps which depended on that to complete. */
1156 if (osb->dlm) {
1157 tmp = ocfs2_super_lock(osb, 1);
1158 if (tmp < 0) {
1159 mlog_errno(tmp);
1160 return;
1161 }
1162
1163 tmp = ocfs2_request_umount_vote(osb);
1164 if (tmp < 0)
1165 mlog_errno(tmp);
1166
1167 if (osb->slot_num != OCFS2_INVALID_SLOT)
1168 ocfs2_put_slot(osb);
1169
1170 ocfs2_super_unlock(osb, 1);
1171 }
1172
1173 ocfs2_release_system_inodes(osb);
1174
1175 if (osb->dlm) {
1176 ocfs2_unregister_net_handlers(osb);
1177
1178 ocfs2_dlm_shutdown(osb);
1179 }
1180
1181 ocfs2_clear_hb_callbacks(osb);
1182
1183 debugfs_remove(osb->osb_debug_root);
1184
1185 if (!mnt_err)
1186 ocfs2_stop_heartbeat(osb);
1187
1188 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1189
1190 printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
1191 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
1192
1193 ocfs2_delete_osb(osb);
1194 kfree(osb);
1195 sb->s_dev = 0;
1196 sb->s_fs_info = NULL;
1197}
1198
1199static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid,
1200 unsigned uuid_bytes)
1201{
1202 int i, ret;
1203 char *ptr;
1204
1205 BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN);
1206
1207 osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL);
1208 if (osb->uuid_str == NULL)
1209 return -ENOMEM;
1210
1211 memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
1212
1213 for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
1214 /* print with null */
1215 ret = snprintf(ptr, 3, "%02X", uuid[i]);
1216 if (ret != 2) /* drop super cleans up */
1217 return -EINVAL;
1218 /* then only advance past the last char */
1219 ptr += 2;
1220 }
1221
1222 return 0;
1223}
1224
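The snprintf(ptr, 3, ...) in the loop above writes two hex digits plus a terminating NUL each pass, and advancing ptr by only two means each pass overwrites the previous NUL, leaving the string terminated after the final byte. A runnable userspace equivalent:

#include <stdio.h>

int main(void)
{
	const unsigned char uuid[16] = {
		0xde, 0xad, 0xbe, 0xef, 0, 1, 2, 3,
		4, 5, 6, 7, 8, 9, 10, 11 };
	char str[16 * 2 + 1], *ptr = str;
	int i;

	for (i = 0; i < 16; i++, ptr += 2)
		snprintf(ptr, 3, "%02X", uuid[i]);
	printf("%s\n", str);	/* DEADBEEF000102030405060708090A0B */
	return 0;
}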
1225static int ocfs2_initialize_super(struct super_block *sb,
1226 struct buffer_head *bh,
1227 int sector_size)
1228{
1229 int status = 0;
1230 int i;
1231 struct ocfs2_dinode *di = NULL;
1232 struct inode *inode = NULL;
1233 struct buffer_head *bitmap_bh = NULL;
1234 struct ocfs2_journal *journal;
1235 __le32 uuid_net_key;
1236 struct ocfs2_super *osb;
1237
1238 mlog_entry_void();
1239
1240 osb = kcalloc(1, sizeof(struct ocfs2_super), GFP_KERNEL);
1241 if (!osb) {
1242 status = -ENOMEM;
1243 mlog_errno(status);
1244 goto bail;
1245 }
1246
1247 sb->s_fs_info = osb;
1248 sb->s_op = &ocfs2_sops;
1249 sb->s_export_op = &ocfs2_export_ops;
1250 sb->s_flags |= MS_NOATIME;
1251 /* this is needed to support O_LARGEFILE */
1252 sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits);
1253
1254 osb->sb = sb;
1255 /* Save off for ocfs2_rw_direct */
1256 osb->s_sectsize_bits = blksize_bits(sector_size);
1257 if (!osb->s_sectsize_bits)
1258 BUG();
1259
1260 osb->net_response_ids = 0;
1261 spin_lock_init(&osb->net_response_lock);
1262 INIT_LIST_HEAD(&osb->net_response_list);
1263
1264 INIT_LIST_HEAD(&osb->osb_net_handlers);
1265 init_waitqueue_head(&osb->recovery_event);
1266 spin_lock_init(&osb->vote_task_lock);
1267 init_waitqueue_head(&osb->vote_event);
1268 osb->vote_work_sequence = 0;
1269 osb->vote_wake_sequence = 0;
1270 INIT_LIST_HEAD(&osb->blocked_lock_list);
1271 osb->blocked_lock_count = 0;
1272 INIT_LIST_HEAD(&osb->vote_list);
1273 spin_lock_init(&osb->osb_lock);
1274
1275 atomic_set(&osb->alloc_stats.moves, 0);
1276 atomic_set(&osb->alloc_stats.local_data, 0);
1277 atomic_set(&osb->alloc_stats.bitmap_data, 0);
1278 atomic_set(&osb->alloc_stats.bg_allocs, 0);
1279 atomic_set(&osb->alloc_stats.bg_extends, 0);
1280
1281 ocfs2_init_node_maps(osb);
1282
1283 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1284 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1285
1286 init_MUTEX(&osb->recovery_lock);
1287
1288 osb->disable_recovery = 0;
1289 osb->recovery_thread_task = NULL;
1290
1291 init_waitqueue_head(&osb->checkpoint_event);
1292 atomic_set(&osb->needs_checkpoint, 0);
1293
1294 osb->node_num = O2NM_INVALID_NODE_NUM;
1295 osb->slot_num = OCFS2_INVALID_SLOT;
1296
1297 osb->local_alloc_state = OCFS2_LA_UNUSED;
1298 osb->local_alloc_bh = NULL;
1299
1300 ocfs2_setup_hb_callbacks(osb);
1301
1302 init_waitqueue_head(&osb->osb_mount_event);
1303
1304 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
1305 if (!osb->vol_label) {
1306 mlog(ML_ERROR, "unable to alloc vol label\n");
1307 status = -ENOMEM;
1308 goto bail;
1309 }
1310
1311 osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
1312 if (!osb->uuid) {
1313 mlog(ML_ERROR, "unable to alloc uuid\n");
1314 status = -ENOMEM;
1315 goto bail;
1316 }
1317
1318 di = (struct ocfs2_dinode *)bh->b_data;
1319
1320 osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
1321 if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
1322 mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
1323 osb->max_slots);
1324 status = -EINVAL;
1325 goto bail;
1326 }
1327 mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
1328
1329 osb->s_feature_compat =
1330 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
1331 osb->s_feature_ro_compat =
1332 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat);
1333 osb->s_feature_incompat =
1334 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat);
1335
1336 if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) {
1337 mlog(ML_ERROR, "couldn't mount because of unsupported "
1338 "optional features (%x).\n", i);
1339 status = -EINVAL;
1340 goto bail;
1341 }
1342 if (!(osb->sb->s_flags & MS_RDONLY) &&
1343 (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) {
1344 mlog(ML_ERROR, "couldn't mount RDWR because of "
1345 "unsupported optional features (%x).\n", i);
1346 status = -EINVAL;
1347 goto bail;
1348 }
1349
1350 get_random_bytes(&osb->s_next_generation, sizeof(u32));
1351
1352 /* FIXME
1353 * This should be done in ocfs2_journal_init(), but unknown
1354 * ordering issues will cause the filesystem to crash.
1355 * If anyone wants to figure out what part of the code
1356 * refers to osb->journal before ocfs2_journal_init() is run,
1357 * be my guest.
1358 */
1359 /* initialize our journal structure */
1360
1361 journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL);
1362 if (!journal) {
1363 mlog(ML_ERROR, "unable to alloc journal\n");
1364 status = -ENOMEM;
1365 goto bail;
1366 }
1367 osb->journal = journal;
1368 journal->j_osb = osb;
1369
1370 atomic_set(&journal->j_num_trans, 0);
1371 init_rwsem(&journal->j_trans_barrier);
1372 init_waitqueue_head(&journal->j_checkpointed);
1373 spin_lock_init(&journal->j_lock);
1374 journal->j_trans_id = (unsigned long) 1;
1375 INIT_LIST_HEAD(&journal->j_la_cleanups);
1376 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb);
1377 journal->j_state = OCFS2_JOURNAL_FREE;
1378
1379 /* get some pseudo constants for clustersize bits */
1380 osb->s_clustersize_bits =
1381 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
1382 osb->s_clustersize = 1 << osb->s_clustersize_bits;
1383 mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
1384
1385 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
1386 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
1387 mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n",
1388 osb->s_clustersize);
1389 status = -EINVAL;
1390 goto bail;
1391 }
1392
1393 if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1)
1394 > (u32)~0UL) {
1395 mlog(ML_ERROR, "Volume might try to write to blocks beyond "
1396 "what jbd can address in 32 bits.\n");
1397 status = -EINVAL;
1398 goto bail;
1399 }
1400
1401 if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid,
1402 sizeof(di->id2.i_super.s_uuid))) {
1403 mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n");
1404 status = -ENOMEM;
1405 goto bail;
1406 }
1407
1408 memcpy(&uuid_net_key, osb->uuid, sizeof(osb->net_key));
1409 osb->net_key = le32_to_cpu(uuid_net_key);
1410
1411 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
1412 osb->vol_label[63] = '\0';
1413 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
1414 osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
1415 osb->first_cluster_group_blkno =
1416 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
1417 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
1418 mlog(0, "vol_label: %s\n", osb->vol_label);
1419 mlog(0, "uuid: %s\n", osb->uuid_str);
1420 mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n",
1421 osb->root_blkno, osb->system_dir_blkno);
1422
1423 osb->osb_dlm_debug = ocfs2_new_dlm_debug();
1424 if (!osb->osb_dlm_debug) {
1425 status = -ENOMEM;
1426 mlog_errno(status);
1427 goto bail;
1428 }
1429
1430 atomic_set(&osb->vol_state, VOLUME_INIT);
1431
1432 /* load root, system_dir, and all global system inodes */
1433 status = ocfs2_init_global_system_inodes(osb);
1434 if (status < 0) {
1435 mlog_errno(status);
1436 goto bail;
1437 }
1438
1439 /*
1440 * global bitmap
1441 */
1442 inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
1443 OCFS2_INVALID_SLOT);
1444 if (!inode) {
1445 status = -EINVAL;
1446 mlog_errno(status);
1447 goto bail;
1448 }
1449
1450 osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno;
1451
1452 status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0,
1453 inode);
1454 iput(inode);
1455 if (status < 0) {
1456 mlog_errno(status);
1457 goto bail;
1458 }
1459
1460 di = (struct ocfs2_dinode *) bitmap_bh->b_data;
1461 osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
1462 osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total);
1463 brelse(bitmap_bh);
1464 mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n",
1465 osb->bitmap_blkno, osb->bitmap_cpg);
1466
1467 status = ocfs2_init_slot_info(osb);
1468 if (status < 0) {
1469 mlog_errno(status);
1470 goto bail;
1471 }
1472
1473 /* Link this osb onto the global linked list of all osb structures. */
1474 /* The global linked list is maintained for the whole driver. */
1475 spin_lock(&ocfs2_globals_lock);
1476 osb->osb_id = osb_id;
1477 if (osb_id < OCFS2_MAX_OSB_ID)
1478 osb_id++;
1479 else {
1480 mlog(ML_ERROR, "Too many volumes mounted\n");
1481 status = -ENOMEM;
1482 }
1483 spin_unlock(&ocfs2_globals_lock);
1484
1485bail:
1486 mlog_exit(status);
1487 return status;
1488}
1489
1490/*
1491 * will return: -EAGAIN if it is ok to keep searching for superblocks
1492 * -EINVAL if there is a bad superblock
1493 * 0 on success
1494 */
1495static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1496 struct buffer_head *bh,
1497 u32 blksz)
1498{
1499 int status = -EAGAIN;
1500
1501 mlog_entry_void();
1502
1503 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
1504 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
1505 status = -EINVAL;
1506 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
1507 mlog(ML_ERROR, "found superblock with incorrect block "
1508 "size: found %u, should be %u\n",
1509 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
1510 blksz);
1511 } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
1512 OCFS2_MAJOR_REV_LEVEL ||
1513 le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
1514 OCFS2_MINOR_REV_LEVEL) {
1515 mlog(ML_ERROR, "found superblock with bad version: "
1516 "found %u.%u, should be %u.%u\n",
1517 le16_to_cpu(di->id2.i_super.s_major_rev_level),
1518 le16_to_cpu(di->id2.i_super.s_minor_rev_level),
1519 OCFS2_MAJOR_REV_LEVEL,
1520 OCFS2_MINOR_REV_LEVEL);
1521 } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) {
1522 mlog(ML_ERROR, "bad block number on superblock: "
1523 "found %"MLFu64", should be %llu\n",
1524 di->i_blkno, (unsigned long long)bh->b_blocknr);
1525 } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 ||
1526 le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) {
1527 mlog(ML_ERROR, "bad cluster size found: %u\n",
1528 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits));
1529 } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) {
1530 mlog(ML_ERROR, "bad root_blkno: 0\n");
1531 } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) {
1532 mlog(ML_ERROR, "bad system_dir_blkno: 0\n");
1533 } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) {
1534 mlog(ML_ERROR,
1535 "Superblock slots found greater than file system "
1536 "maximum: found %u, max %u\n",
1537 le16_to_cpu(di->id2.i_super.s_max_slots),
1538 OCFS2_MAX_SLOTS);
1539 } else {
1540 /* found it! */
1541 status = 0;
1542 }
1543 }
1544
1545 mlog_exit(status);
1546 return status;
1547}
1548
1549static int ocfs2_check_volume(struct ocfs2_super *osb)
1550{
1551 int status = 0;
1552 int dirty;
1553 struct ocfs2_dinode *local_alloc = NULL; /* only used if we
1554 * recover
1555 * ourselves. */
1556
1557 mlog_entry_void();
1558
1559 /* Init our journal object. */
1560 status = ocfs2_journal_init(osb->journal, &dirty);
1561 if (status < 0) {
1562 mlog(ML_ERROR, "Could not initialize journal!\n");
1563 goto finally;
1564 }
1565
1566 /* If the journal was unmounted cleanly then we don't want to
1567 * recover anything. Otherwise, journal_load will do that
1568 * dirty work for us :) */
1569 if (!dirty) {
1570 status = ocfs2_journal_wipe(osb->journal, 0);
1571 if (status < 0) {
1572 mlog_errno(status);
1573 goto finally;
1574 }
1575 } else {
1576 mlog(ML_NOTICE, "File system was not unmounted cleanly, "
1577 "recovering volume.\n");
1578 }
1579
1580 /* will play back anything left in the journal. */
1581 ocfs2_journal_load(osb->journal);
1582
1583 if (dirty) {
1584 /* recover my local alloc if we didn't unmount cleanly. */
1585 status = ocfs2_begin_local_alloc_recovery(osb,
1586 osb->slot_num,
1587 &local_alloc);
1588 if (status < 0) {
1589 mlog_errno(status);
1590 goto finally;
1591 }
1592 /* we complete the recovery process after we've marked
1593 * ourselves as mounted. */
1594 }
1595
1596 mlog(0, "Journal loaded.\n");
1597
1598 status = ocfs2_load_local_alloc(osb);
1599 if (status < 0) {
1600 mlog_errno(status);
1601 goto finally;
1602 }
1603
1604 if (dirty) {
1605 /* Recovery will be completed after we've mounted the
1606 * rest of the volume. */
1607 osb->dirty = 1;
1608 osb->local_alloc_copy = local_alloc;
1609 local_alloc = NULL;
1610 }
1611
1612 /* go through each journal, trylock it and if you get the
1613 * lock, and it's marked as dirty, set the bit in the recover
1614 * map and launch a recovery thread for it. */
1615 status = ocfs2_mark_dead_nodes(osb);
1616 if (status < 0)
1617 mlog_errno(status);
1618
1619finally:
1620 if (local_alloc)
1621 kfree(local_alloc);
1622
1623 mlog_exit(status);
1624 return status;
1625}
1626
1627/*
1628 * The routine gets called from dismount or close whenever a dismount on
1629 * volume is requested and the osb open count becomes 1.
1630 * It will remove the osb from the global list and also free up all the
1631 * initialized resources and fileobject.
1632 */
1633static void ocfs2_delete_osb(struct ocfs2_super *osb)
1634{
1635 mlog_entry_void();
1636
1637 /* This function assumes that the caller has the main osb resource */
1638
1639 if (osb->slot_info)
1640 ocfs2_free_slot_info(osb->slot_info);
1641
1642 /* FIXME
1643 * This belongs in journal shutdown, but because we have to
1644 * allocate osb->journal at the start of ocfs2_initialize_super(),
1645 * we free it here.
1646 */
1647 kfree(osb->journal);
1648 if (osb->local_alloc_copy)
1649 kfree(osb->local_alloc_copy);
1650 kfree(osb->uuid_str);
1651 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
1652 memset(osb, 0, sizeof(struct ocfs2_super));
1653
1654 mlog_exit_void();
1655}
1656
1657/* Put OCFS2 into a readonly state, or (if the user specifies it),
1658 * panic(). We do not support continue-on-error operation. */
1659static void ocfs2_handle_error(struct super_block *sb)
1660{
1661 struct ocfs2_super *osb = OCFS2_SB(sb);
1662
1663 if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
1664 panic("OCFS2: (device %s): panic forced after error\n",
1665 sb->s_id);
1666
1667 ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
1668
1669 if (sb->s_flags & MS_RDONLY &&
1670 (ocfs2_is_soft_readonly(osb) ||
1671 ocfs2_is_hard_readonly(osb)))
1672 return;
1673
1674 printk(KERN_CRIT "File system is now read-only due to the potential "
1675 "of on-disk corruption. Please run fsck.ocfs2 once the file "
1676 "system is unmounted.\n");
1677 sb->s_flags |= MS_RDONLY;
1678 ocfs2_set_ro_flag(osb, 0);
1679}
1680
1681static char error_buf[1024];
1682
1683void __ocfs2_error(struct super_block *sb,
1684 const char *function,
1685 const char *fmt, ...)
1686{
1687 va_list args;
1688
1689 va_start(args, fmt);
1690 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
1691 va_end(args);
1692
1693 /* Not using mlog here because we want to show the actual
1694 * function the error came from. */
1695 printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n",
1696 sb->s_id, function, error_buf);
1697
1698 ocfs2_handle_error(sb);
1699}
1700
1701/* Handle critical errors. This is intentionally more drastic than
1702 * ocfs2_handle_error, so we only use for things like journal errors,
1703 * etc. */
1704void __ocfs2_abort(struct super_block* sb,
1705 const char *function,
1706 const char *fmt, ...)
1707{
1708 va_list args;
1709
1710 va_start(args, fmt);
1711 vsnprintf(error_buf, sizeof(error_buf), fmt, args);
1712 va_end(args);
1713
1714 printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n",
1715 sb->s_id, function, error_buf);
1716
1717 /* We don't have the cluster support yet to go straight to
1718 * hard readonly in here. Until then, we want to keep
1719 * ocfs2_abort() so that we can at least mark critical
1720 * errors.
1721 *
1722 * TODO: This should abort the journal and alert other nodes
1723 * that our slot needs recovery. */
1724
1725 /* Force a panic(). This stinks, but it's better than letting
1726 * things continue without having a proper hard readonly
1727 * here. */
1728 OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
1729 ocfs2_handle_error(sb);
1730}
1731
1732module_init(ocfs2_init);
1733module_exit(ocfs2_exit);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
new file mode 100644
index 000000000000..c564177dfbdc
--- /dev/null
+++ b/fs/ocfs2/super.h
@@ -0,0 +1,44 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * super.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_SUPER_H
27#define OCFS2_SUPER_H
28
29extern struct workqueue_struct *ocfs2_wq;
30
31int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
32 int node_num);
33
34void __ocfs2_error(struct super_block *sb,
35 const char *function,
36 const char *fmt, ...);
37#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
38
39void __ocfs2_abort(struct super_block *sb,
40 const char *function,
41 const char *fmt, ...);
42#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
43
44#endif /* OCFS2_SUPER_H */
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
new file mode 100644
index 000000000000..f6986bd79e75
--- /dev/null
+++ b/fs/ocfs2/symlink.c
@@ -0,0 +1,180 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * linux/cluster/ssi/cfs/symlink.c
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of
9 * the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE
14 * or NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net
22 *
23 * Copyright (C) 1992 Rick Sladkey
24 *
25 * Optimization changes Copyright (C) 1994 Florian La Roche
26 *
27 * Jun 7 1999, cache symlink lookups in the page cache. -DaveM
28 *
29 * Portions Copyright (C) 2001 Compaq Computer Corporation
30 *
31 * ocfs2 symlink handling code.
32 *
33 * Copyright (C) 2004, 2005 Oracle.
34 *
35 */
36
37#include <linux/fs.h>
38#include <linux/types.h>
39#include <linux/slab.h>
40#include <linux/pagemap.h>
41#include <linux/utsname.h>
42
43#define MLOG_MASK_PREFIX ML_NAMEI
44#include <cluster/masklog.h>
45
46#include "ocfs2.h"
47
48#include "alloc.h"
49#include "file.h"
50#include "inode.h"
51#include "journal.h"
52#include "symlink.h"
53
54#include "buffer_head_io.h"
55
56static char *ocfs2_page_getlink(struct dentry * dentry,
57 struct page **ppage);
58static char *ocfs2_fast_symlink_getlink(struct inode *inode,
59 struct buffer_head **bh);
60
61/* get the link contents into pagecache */
62static char *ocfs2_page_getlink(struct dentry * dentry,
63 struct page **ppage)
64{
65 struct page * page;
66 struct address_space *mapping = dentry->d_inode->i_mapping;
67 page = read_cache_page(mapping, 0,
68 (filler_t *)mapping->a_ops->readpage, NULL);
69 if (IS_ERR(page))
70 goto sync_fail;
71 wait_on_page_locked(page);
72 if (!PageUptodate(page))
73 goto async_fail;
74 *ppage = page;
75 return kmap(page);
76
77async_fail:
78 page_cache_release(page);
79 return ERR_PTR(-EIO);
80
81sync_fail:
82 return (char*)page;
83}
84
85static char *ocfs2_fast_symlink_getlink(struct inode *inode,
86 struct buffer_head **bh)
87{
88 int status;
89 char *link = NULL;
90 struct ocfs2_dinode *fe;
91
92 mlog_entry_void();
93
94 status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
95 OCFS2_I(inode)->ip_blkno,
96 bh,
97 OCFS2_BH_CACHED,
98 inode);
99 if (status < 0) {
100 mlog_errno(status);
101 link = ERR_PTR(status);
102 goto bail;
103 }
104
105 fe = (struct ocfs2_dinode *) (*bh)->b_data;
106 link = (char *) fe->id2.i_symlink;
107bail:
108 mlog_exit(status);
109
110 return link;
111}
112
113static int ocfs2_readlink(struct dentry *dentry,
114 char __user *buffer,
115 int buflen)
116{
117 int ret;
118 char *link;
119 struct buffer_head *bh = NULL;
120 struct inode *inode = dentry->d_inode;
121
122 mlog_entry_void();
123
124 link = ocfs2_fast_symlink_getlink(inode, &bh);
125 if (IS_ERR(link)) {
126 ret = PTR_ERR(link);
127 goto out;
128 }
129
130 ret = vfs_readlink(dentry, buffer, buflen, link);
131
132 brelse(bh);
133out:
134 mlog_exit(ret);
135 return ret;
136}
137
138static void *ocfs2_follow_link(struct dentry *dentry,
139 struct nameidata *nd)
140{
141 int status;
142 char *link;
143 struct inode *inode = dentry->d_inode;
144 struct page *page = NULL;
145 struct buffer_head *bh = NULL;
146
147 if (ocfs2_inode_is_fast_symlink(inode))
148 link = ocfs2_fast_symlink_getlink(inode, &bh);
149 else
150 link = ocfs2_page_getlink(dentry, &page);
151 if (IS_ERR(link)) {
152 status = PTR_ERR(link);
153 mlog_errno(status);
154 goto bail;
155 }
156
157 status = vfs_follow_link(nd, link);
158 if (status)
159 mlog_errno(status);
160bail:
161 if (page) {
162 kunmap(page);
163 page_cache_release(page);
164 }
165 if (bh)
166 brelse(bh);
167
168 return ERR_PTR(status);
169}
170
171struct inode_operations ocfs2_symlink_inode_operations = {
172 .readlink = page_readlink,
173 .follow_link = ocfs2_follow_link,
174 .getattr = ocfs2_getattr,
175};
176struct inode_operations ocfs2_fast_symlink_inode_operations = {
177 .readlink = ocfs2_readlink,
178 .follow_link = ocfs2_follow_link,
179 .getattr = ocfs2_getattr,
180};
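Which of these two operation tables a symlink inode receives is decided at inode-setup time (in inode.c, which is not part of this hunk) via the ocfs2_inode_is_fast_symlink() test declared in symlink.h below. A sketch of that selection, with an illustrative function name:

/* Sketch: fast symlinks keep their target inline in the dinode
 * (i_blocks == 0) and read it via ocfs2_readlink(); page symlinks go
 * through the page cache via page_readlink(). */
static void example_install_symlink_ops(struct inode *inode)
{
	if (ocfs2_inode_is_fast_symlink(inode))
		inode->i_op = &ocfs2_fast_symlink_inode_operations;
	else
		inode->i_op = &ocfs2_symlink_inode_operations;
}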
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h
new file mode 100644
index 000000000000..1ea9e4d9e9eb
--- /dev/null
+++ b/fs/ocfs2/symlink.h
@@ -0,0 +1,42 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * symlink.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_SYMLINK_H
27#define OCFS2_SYMLINK_H
28
29extern struct inode_operations ocfs2_symlink_inode_operations;
30extern struct inode_operations ocfs2_fast_symlink_inode_operations;
31
32/*
33 * Test whether an inode is a fast symlink.
34 */
35static inline int ocfs2_inode_is_fast_symlink(struct inode *inode)
36{
37 return (S_ISLNK(inode->i_mode) &&
38 inode->i_blocks == 0);
39}
40
41
42#endif /* OCFS2_SYMLINK_H */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
new file mode 100644
index 000000000000..600a8bc5b541
--- /dev/null
+++ b/fs/ocfs2/sysfile.c
@@ -0,0 +1,131 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sysfile.c
5 *
6 * Initialize, read, write, etc. system files.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30
31#include "ocfs2.h"
32
33#define MLOG_MASK_PREFIX ML_INODE
34#include <cluster/masklog.h>
35
36#include "alloc.h"
37#include "dir.h"
38#include "inode.h"
39#include "journal.h"
40#include "sysfile.h"
41
42#include "buffer_head_io.h"
43
44static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
45 int type,
46 u32 slot);
47
48static inline int is_global_system_inode(int type);
49static inline int is_in_system_inode_array(struct ocfs2_super *osb,
50 int type,
51 u32 slot);
52
53static inline int is_global_system_inode(int type)
54{
55 return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE &&
56 type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE;
57}
58
59static inline int is_in_system_inode_array(struct ocfs2_super *osb,
60 int type,
61 u32 slot)
62{
63 return slot == osb->slot_num || is_global_system_inode(type);
64}
65
66struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
67 int type,
68 u32 slot)
69{
70 struct inode *inode = NULL;
71 struct inode **arr = NULL;
72
73 /* avoid the lookup if cached in local system file array */
74 if (is_in_system_inode_array(osb, type, slot))
75 arr = &(osb->system_inodes[type]);
76
77 if (arr && ((inode = *arr) != NULL)) {
78 /* get a ref in addition to the array ref */
79 inode = igrab(inode);
80 if (!inode)
81 BUG();
82
83 return inode;
84 }
85
86 /* this gets one ref thru iget */
87 inode = _ocfs2_get_system_file_inode(osb, type, slot);
88
89 /* add one more if putting into array for first time */
90 if (arr && inode) {
91 *arr = igrab(inode);
92 if (!*arr)
93 BUG();
94 }
95 return inode;
96}
97
98static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
99 int type,
100 u32 slot)
101{
102 char namebuf[40];
103 struct inode *inode = NULL;
104 u64 blkno;
105 struct buffer_head *dirent_bh = NULL;
106 struct ocfs2_dir_entry *de = NULL;
107 int status = 0;
108
109 ocfs2_sprintf_system_inode_name(namebuf,
110 sizeof(namebuf),
111 type, slot);
112
113 status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
114 &blkno, osb->sys_root_inode,
115 &dirent_bh, &de);
116 if (status < 0) {
117 goto bail;
118 }
119
120 inode = ocfs2_iget(osb, blkno);
121 if (IS_ERR(inode)) {
122 mlog_errno(PTR_ERR(inode));
123 inode = NULL;
124 goto bail;
125 }
126bail:
127 if (dirent_bh)
128 brelse(dirent_bh);
129 return inode;
130}
131
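Whatever ocfs2_get_system_file_inode() returns carries a reference that the caller must drop with iput(). A hedged usage sketch; LOCAL_ALLOC_SYSTEM_INODE is assumed to be one of the per-slot system inode types from ocfs2_fs.h:

/* Illustrative only: look up this node's local alloc system file. */
static int example_with_local_alloc(struct ocfs2_super *osb)
{
	struct inode *inode;

	inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
					    osb->slot_num);
	if (!inode)
		return -EINVAL;

	/* ... read or update the file ... */

	iput(inode);	/* drop the reference taken above */
	return 0;
}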
diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h
new file mode 100644
index 000000000000..cc9ea661ffc1
--- /dev/null
+++ b/fs/ocfs2/sysfile.h
@@ -0,0 +1,33 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * sysfile.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_SYSFILE_H
27#define OCFS2_SYSFILE_H
28
29struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb,
30 int type,
31 u32 slot);
32
33#endif /* OCFS2_SYSFILE_H */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
new file mode 100644
index 000000000000..3a0458fd3e1b
--- /dev/null
+++ b/fs/ocfs2/uptodate.c
@@ -0,0 +1,544 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * uptodate.c
5 *
6 * Tracking the up-to-date-ness of a local buffer_head with respect to
7 * the cluster.
8 *
9 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
25 *
26 * Standard buffer head caching flags (uptodate, etc) are insufficient
27 * in a clustered environment - a buffer may be marked up to date on
28 * our local node but could have been modified by another cluster
29 * member. As a result, an additional (and performant) caching scheme
30 * is required. A further requirement is that we consume as little
31 * memory as possible - we never pin buffer_head structures in order
32 * to cache them.
33 *
34 * We track the existence of up to date buffers on the inodes which
35 * are associated with them. Because we don't want to pin
36 * buffer_heads, this is only a (strong) hint and several other checks
37 * are made in the I/O path to ensure that we don't use a stale or
38 * invalid buffer without going to disk:
39 * - buffer_jbd is used liberally - if a bh is in the journal on
40 * this node then it *must* be up to date.
41 * - the standard buffer_uptodate() macro is used to detect buffers
42 * which may be invalid (even if we have an up to date tracking
43 * item for them)
44 *
45 * For a full understanding of how this code works together, one
46 * should read the callers in dlmglue.c, the I/O functions in
47 * buffer_head_io.c and ocfs2_journal_access in journal.c
48 */
49
50#include <linux/fs.h>
51#include <linux/types.h>
52#include <linux/slab.h>
53#include <linux/highmem.h>
54#include <linux/buffer_head.h>
55#include <linux/rbtree.h>
56#include <linux/jbd.h>
57
58#define MLOG_MASK_PREFIX ML_UPTODATE
59
60#include <cluster/masklog.h>
61
62#include "ocfs2.h"
63
64#include "inode.h"
65#include "uptodate.h"
66
67struct ocfs2_meta_cache_item {
68 struct rb_node c_node;
69 sector_t c_block;
70};
71
72static kmem_cache_t *ocfs2_uptodate_cachep = NULL;
73
74void ocfs2_metadata_cache_init(struct inode *inode)
75{
76 struct ocfs2_inode_info *oi = OCFS2_I(inode);
77 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
78
79 oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
80 ci->ci_num_cached = 0;
81}
82
83/* No lock taken here as 'root' is not expected to be visible to other
84 * processes. */
85static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
86{
87 unsigned int purged = 0;
88 struct rb_node *node;
89 struct ocfs2_meta_cache_item *item;
90
91 while ((node = rb_last(root)) != NULL) {
92 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
93
94 mlog(0, "Purge item %llu\n",
95 (unsigned long long) item->c_block);
96
97 rb_erase(&item->c_node, root);
98 kmem_cache_free(ocfs2_uptodate_cachep, item);
99
100 purged++;
101 }
102 return purged;
103}
104
105/* Called from locking and called from ocfs2_clear_inode. Dump the
106 * cache for a given inode.
107 *
108 * This function is a few lines longer than necessary due to some
109 * accounting done here, but I think it's worth tracking down those
110 * bugs sooner -- Mark */
111void ocfs2_metadata_cache_purge(struct inode *inode)
112{
113 struct ocfs2_inode_info *oi = OCFS2_I(inode);
114 unsigned int tree, to_purge, purged;
115 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
116 struct rb_root root = RB_ROOT;
117
118 spin_lock(&oi->ip_lock);
119 tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
120 to_purge = ci->ci_num_cached;
121
122 mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
123	     tree ? "tree" : "array", oi->ip_blkno);
124
125 /* If we're a tree, save off the root so that we can safely
126 * initialize the cache. We do the work to free tree members
127 * without the spinlock. */
128 if (tree)
129 root = ci->ci_cache.ci_tree;
130
131 ocfs2_metadata_cache_init(inode);
132 spin_unlock(&oi->ip_lock);
133
134 purged = ocfs2_purge_copied_metadata_tree(&root);
135 /* If possible, track the number wiped so that we can more
136 * easily detect counting errors. Unfortunately, this is only
137 * meaningful for trees. */
138 if (tree && purged != to_purge)
139 mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
140 oi->ip_blkno, to_purge, purged);
141}
142
143/* Returns the index in the cache array, -1 if not found.
144 * Requires ip_lock. */
145static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
146 sector_t item)
147{
148 int i;
149
150 for (i = 0; i < ci->ci_num_cached; i++) {
151 if (item == ci->ci_cache.ci_array[i])
152 return i;
153 }
154
155 return -1;
156}
157
158/* Returns the cache item if found, otherwise NULL.
159 * Requires ip_lock. */
160static struct ocfs2_meta_cache_item *
161ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
162 sector_t block)
163{
164 struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
165 struct ocfs2_meta_cache_item *item = NULL;
166
167 while (n) {
168 item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
169
170 if (block < item->c_block)
171 n = n->rb_left;
172 else if (block > item->c_block)
173 n = n->rb_right;
174 else
175 return item;
176 }
177
178 return NULL;
179}
180
181static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
182 struct buffer_head *bh)
183{
184 int index = -1;
185 struct ocfs2_meta_cache_item *item = NULL;
186
187 spin_lock(&oi->ip_lock);
188
189 mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
190 oi->ip_blkno, (unsigned long long) bh->b_blocknr,
191 !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
192
193 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
194 index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
195 bh->b_blocknr);
196 else
197 item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
198 bh->b_blocknr);
199
200 spin_unlock(&oi->ip_lock);
201
202 mlog(0, "index = %d, item = %p\n", index, item);
203
204 return (index != -1) || (item != NULL);
205}
206
207/* Warning: even if it returns true, this does *not* guarantee that
208 * the block is stored in our inode metadata cache. */
209int ocfs2_buffer_uptodate(struct inode *inode,
210 struct buffer_head *bh)
211{
212 /* Doesn't matter if the bh is in our cache or not -- if it's
213 * not marked uptodate then we know it can't have correct
214 * data. */
215 if (!buffer_uptodate(bh))
216 return 0;
217
218 /* OCFS2 does not allow multiple nodes to be changing the same
219 * block at the same time. */
220 if (buffer_jbd(bh))
221 return 1;
222
223 /* Ok, locally the buffer is marked as up to date, now search
224 * our cache to see if we can trust that. */
225 return ocfs2_buffer_cached(OCFS2_I(inode), bh);
226}
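A sketch of how an I/O path might consult ocfs2_buffer_uptodate() before trusting a buffer. The real logic lives in buffer_head_io.c; this helper is illustrative, assumes 2.6-era submit_bh() semantics, and assumes the caller serializes insertions via ip_io_sem as described further down in this file:

/* Illustrative read path: only skip the disk when the cluster-aware
 * check passes; otherwise re-read and re-mark the buffer. */
static int example_read_block(struct inode *inode, struct buffer_head *bh)
{
	if (ocfs2_buffer_uptodate(inode, bh))
		return 0;	/* contents can be trusted */

	lock_buffer(bh);
	clear_buffer_uptodate(bh);
	bh->b_end_io = end_buffer_read_sync;	/* unlocks on completion */
	submit_bh(READ, bh);
	wait_on_buffer(bh);

	if (!buffer_uptodate(bh))
		return -EIO;
	ocfs2_set_buffer_uptodate(inode, bh);	/* record in our cache */
	return 0;
}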
227
228/* Requires ip_lock */
229static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
230 sector_t block)
231{
232 BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
233
234 mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
235 ci->ci_num_cached);
236
237 ci->ci_cache.ci_array[ci->ci_num_cached] = block;
238 ci->ci_num_cached++;
239}
240
241/* By now the caller should have checked that the item does *not*
242 * exist in the tree.
243 * Requires ip_lock. */
244static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
245 struct ocfs2_meta_cache_item *new)
246{
247 sector_t block = new->c_block;
248 struct rb_node *parent = NULL;
249 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
250 struct ocfs2_meta_cache_item *tmp;
251
252 mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
253 ci->ci_num_cached);
254
255 while(*p) {
256 parent = *p;
257
258 tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
259
260 if (block < tmp->c_block)
261 p = &(*p)->rb_left;
262 else if (block > tmp->c_block)
263 p = &(*p)->rb_right;
264 else {
265 /* This should never happen! */
266 mlog(ML_ERROR, "Duplicate block %llu cached!\n",
267 (unsigned long long) block);
268 BUG();
269 }
270 }
271
272 rb_link_node(&new->c_node, parent, p);
273 rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
274 ci->ci_num_cached++;
275}
276
277static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
278 struct ocfs2_caching_info *ci)
279{
280 assert_spin_locked(&oi->ip_lock);
281
282 return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
283 (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
284}
285
286/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
287 * pointers in tree after we use them - this allows caller to detect
288 * when to free in case of error. */
289static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
290 struct ocfs2_meta_cache_item **tree)
291{
292 int i;
293 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
294
295 mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
296 "Inode %"MLFu64", num cached = %u, should be %u\n",
297 oi->ip_blkno, ci->ci_num_cached,
298 OCFS2_INODE_MAX_CACHE_ARRAY);
299 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
300 "Inode %"MLFu64" not marked as inline anymore!\n",
301 oi->ip_blkno);
302 assert_spin_locked(&oi->ip_lock);
303
304 /* Be careful to initialize the tree members *first* because
305 * once the ci_tree is used, the array is junk... */
306 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
307 tree[i]->c_block = ci->ci_cache.ci_array[i];
308
309 oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
310 ci->ci_cache.ci_tree = RB_ROOT;
311 /* this will be set again by __ocfs2_insert_cache_tree */
312 ci->ci_num_cached = 0;
313
314 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
315 __ocfs2_insert_cache_tree(ci, tree[i]);
316 tree[i] = NULL;
317 }
318
319 mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
320 oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
321}
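The expansion above is the heart of the two-tier design: a tiny inline array for the common case, promoted wholesale to an rbtree once it fills. A toy userspace model of the same promotion, runnable on its own; the names and sizes are illustrative, and a sorted array stands in for the kernel's rbtree:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define INLINE_MAX 2	/* stands in for OCFS2_INODE_MAX_CACHE_ARRAY */

struct toy_cache {
	int inline_mode;		/* 1 = small array, 0 = promoted */
	unsigned int num, cap;
	unsigned long long array[INLINE_MAX];
	unsigned long long *sorted;	/* promoted storage */
};

static int cmp_u64(const void *a, const void *b)
{
	unsigned long long x = *(const unsigned long long *)a;
	unsigned long long y = *(const unsigned long long *)b;
	return (x > y) - (x < y);
}

static void toy_insert(struct toy_cache *c, unsigned long long blk)
{
	if (c->inline_mode && c->num < INLINE_MAX) {
		c->array[c->num++] = blk;	/* fast path */
		return;
	}
	if (c->inline_mode) {
		/* Promote: carry the existing members over first, as
		 * ocfs2_expand_cache() does, then retire the array. */
		c->cap = INLINE_MAX * 2;
		c->sorted = malloc(c->cap * sizeof(*c->sorted));
		if (!c->sorted)
			exit(1);
		memcpy(c->sorted, c->array, c->num * sizeof(*c->sorted));
		c->inline_mode = 0;
	}
	if (c->num == c->cap) {
		c->cap *= 2;
		c->sorted = realloc(c->sorted, c->cap * sizeof(*c->sorted));
		if (!c->sorted)
			exit(1);
	}
	c->sorted[c->num++] = blk;
	qsort(c->sorted, c->num, sizeof(*c->sorted), cmp_u64);
}

int main(void)
{
	struct toy_cache c = { .inline_mode = 1 };
	unsigned long long blocks[] = { 42, 7, 19, 3 };
	unsigned int i;

	for (i = 0; i < 4; i++)
		toy_insert(&c, blocks[i]);
	for (i = 0; i < c.num; i++)
		printf("%llu\n", c.sorted[i]);	/* 3 7 19 42 */
	free(c.sorted);
	return 0;
}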
322
323/* Slow path function - memory allocation is necessary. See the
324 * comment above ocfs2_set_buffer_uptodate for more information. */
325static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
326 sector_t block,
327 int expand_tree)
328{
329 int i;
330 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
331 struct ocfs2_meta_cache_item *new = NULL;
332 struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
333 { NULL, };
334
335 mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
336 oi->ip_blkno, (unsigned long long) block, expand_tree);
337
338 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
339 if (!new) {
340 mlog_errno(-ENOMEM);
341 return;
342 }
343 new->c_block = block;
344
345 if (expand_tree) {
346 /* Do *not* allocate an array here - the removal code
347 * has no way of tracking that. */
348 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
349 tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
350 GFP_KERNEL);
351 if (!tree[i]) {
352 mlog_errno(-ENOMEM);
353 goto out_free;
354 }
355
356 /* These are initialized in ocfs2_expand_cache! */
357 }
358 }
359
360 spin_lock(&oi->ip_lock);
361 if (ocfs2_insert_can_use_array(oi, ci)) {
362 mlog(0, "Someone cleared the tree underneath us\n");
363 /* Ok, items were removed from the cache in between
364 * locks. Detect this and revert back to the fast path */
365 ocfs2_append_cache_array(ci, block);
366 spin_unlock(&oi->ip_lock);
367 goto out_free;
368 }
369
370 if (expand_tree)
371 ocfs2_expand_cache(oi, tree);
372
373 __ocfs2_insert_cache_tree(ci, new);
374 spin_unlock(&oi->ip_lock);
375
376 new = NULL;
377out_free:
378 if (new)
379 kmem_cache_free(ocfs2_uptodate_cachep, new);
380
381 /* If these were used, then ocfs2_expand_cache re-set them to
382 * NULL for us. */
383 if (tree[0]) {
384 for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
385 if (tree[i])
386 kmem_cache_free(ocfs2_uptodate_cachep,
387 tree[i]);
388 }
389}
390
391/* Item insertion is guarded by ip_io_sem, so the insertion path takes
392 * advantage of this by not rechecking for a duplicate insert during
393 * the slow case. Additionally, if the cache needs to be bumped up to
394 * a tree, the code will not recheck after acquiring the lock --
395 * multiple paths cannot be expanding to a tree at the same time.
396 *
397 * The slow path takes into account that items can be removed
398 * (including the whole tree wiped and reset) while this process is out
399 * allocating memory. In those cases, it reverts back to the fast
400 * path.
401 *
402 * Note that this function may actually fail to insert the block if
403 * memory cannot be allocated. This is not fatal however (but may
404 * result in a performance penalty) */
405void ocfs2_set_buffer_uptodate(struct inode *inode,
406 struct buffer_head *bh)
407{
408 int expand;
409 struct ocfs2_inode_info *oi = OCFS2_I(inode);
410 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
411
412 /* The block may very well exist in our cache already, so avoid
413 * doing any more work in that case. */
414 if (ocfs2_buffer_cached(oi, bh))
415 return;
416
417 mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
418 (unsigned long long) bh->b_blocknr);
419
420 /* No need to recheck under spinlock - insertion is guarded by
421 * ip_io_sem */
422 spin_lock(&oi->ip_lock);
423 if (ocfs2_insert_can_use_array(oi, ci)) {
424 /* Fast case - it's an array and there's a free
425 * spot. */
426 ocfs2_append_cache_array(ci, bh->b_blocknr);
427 spin_unlock(&oi->ip_lock);
428 return;
429 }
430
431 expand = 0;
432 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
433 /* We need to bump things up to a tree. */
434 expand = 1;
435 }
436 spin_unlock(&oi->ip_lock);
437
438 __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
439}
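The slow path above follows a classic locking pattern: allocate while the spinlock is dropped, retake the lock, recheck the condition, and throw the allocation away if another thread got there first. Stripped of the ocfs2 specifics, a sketch of the pattern with hypothetical types:

struct example_item {			/* hypothetical payload */
	u64 block;
};

struct example_cache {			/* hypothetical container */
	spinlock_t lock;
	struct example_item *item;
};

static struct example_item *example_ensure_item(struct example_cache *c)
{
	struct example_item *new;

	new = kmalloc(sizeof(*new), GFP_KERNEL);	/* may sleep */
	if (!new)
		return NULL;

	spin_lock(&c->lock);
	if (!c->item) {			/* recheck: did someone beat us? */
		c->item = new;
		new = NULL;		/* ownership transferred */
	}
	spin_unlock(&c->lock);

	kfree(new);			/* no-op if we installed it */
	return c->item;
}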
440
441/* Called against a newly allocated buffer. Most likely nobody should
442 * be able to read this sort of metadata while it's still being
443 * allocated, but this is careful to take ip_io_sem anyway. */
444void ocfs2_set_new_buffer_uptodate(struct inode *inode,
445 struct buffer_head *bh)
446{
447 struct ocfs2_inode_info *oi = OCFS2_I(inode);
448
449 /* This should definitely *not* exist in our cache */
450 BUG_ON(ocfs2_buffer_cached(oi, bh));
451
452 set_buffer_uptodate(bh);
453
454 down(&oi->ip_io_sem);
455 ocfs2_set_buffer_uptodate(inode, bh);
456 up(&oi->ip_io_sem);
457}
458
459/* Requires ip_lock. */
460static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
461 int index)
462{
463 sector_t *array = ci->ci_cache.ci_array;
464 int bytes;
465
466 BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
467 BUG_ON(index >= ci->ci_num_cached);
468 BUG_ON(!ci->ci_num_cached);
469
470	mlog(0, "remove index %d (num_cached = %u)\n", index,
471 ci->ci_num_cached);
472
473 ci->ci_num_cached--;
474
475 /* don't need to copy if the array is now empty, or if we
476 * removed at the tail */
477 if (ci->ci_num_cached && index < ci->ci_num_cached) {
478 bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
479 memmove(&array[index], &array[index + 1], bytes);
480 }
481}
482
483/* Requires ip_lock. */
484static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
485 struct ocfs2_meta_cache_item *item)
486{
487 mlog(0, "remove block %llu from tree\n",
488 (unsigned long long) item->c_block);
489
490 rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
491 ci->ci_num_cached--;
492}
493
494/* Called when we remove a chunk of metadata from an inode. We don't
495 * bother reverting things to an inlined array in the case of a remove
496 * which moves us back under the limit. */
497void ocfs2_remove_from_cache(struct inode *inode,
498 struct buffer_head *bh)
499{
500 int index;
501 sector_t block = bh->b_blocknr;
502 struct ocfs2_meta_cache_item *item = NULL;
503 struct ocfs2_inode_info *oi = OCFS2_I(inode);
504 struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
505
506 spin_lock(&oi->ip_lock);
507 mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
508 oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
509 oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
510
511 if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
512 index = ocfs2_search_cache_array(ci, block);
513 if (index != -1)
514 ocfs2_remove_metadata_array(ci, index);
515 } else {
516 item = ocfs2_search_cache_tree(ci, block);
517 if (item)
518 ocfs2_remove_metadata_tree(ci, item);
519 }
520 spin_unlock(&oi->ip_lock);
521
522 if (item)
523 kmem_cache_free(ocfs2_uptodate_cachep, item);
524}
525
526int __init init_ocfs2_uptodate_cache(void)
527{
528 ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
529 sizeof(struct ocfs2_meta_cache_item),
530 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
531 if (!ocfs2_uptodate_cachep)
532 return -ENOMEM;
533
534 mlog(0, "%u inlined cache items per inode.\n",
535 OCFS2_INODE_MAX_CACHE_ARRAY);
536
537 return 0;
538}
539
540void __exit exit_ocfs2_uptodate_cache(void)
541{
542 if (ocfs2_uptodate_cachep)
543 kmem_cache_destroy(ocfs2_uptodate_cachep);
544}
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
new file mode 100644
index 000000000000..e5aacdf4eabf
--- /dev/null
+++ b/fs/ocfs2/uptodate.h
@@ -0,0 +1,44 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * uptodate.h
5 *
6 * Cluster uptodate tracking
7 *
8 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_UPTODATE_H
27#define OCFS2_UPTODATE_H
28
29int __init init_ocfs2_uptodate_cache(void);
30void __exit exit_ocfs2_uptodate_cache(void);
31
32void ocfs2_metadata_cache_init(struct inode *inode);
33void ocfs2_metadata_cache_purge(struct inode *inode);
34
35int ocfs2_buffer_uptodate(struct inode *inode,
36 struct buffer_head *bh);
37void ocfs2_set_buffer_uptodate(struct inode *inode,
38 struct buffer_head *bh);
39void ocfs2_set_new_buffer_uptodate(struct inode *inode,
40 struct buffer_head *bh);
41void ocfs2_remove_from_cache(struct inode *inode,
42 struct buffer_head *bh);
43
44#endif /* OCFS2_UPTODATE_H */
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
new file mode 100644
index 000000000000..5405ce121c99
--- /dev/null
+++ b/fs/ocfs2/ver.c
@@ -0,0 +1,43 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.c
5 *
6 * version string
7 *
8 * Copyright (C) 2002, 2005 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/module.h>
27#include <linux/string.h>
28#include <linux/kernel.h>
29
30#include "ver.h"
31
32#define OCFS2_BUILD_VERSION "1.3.3"
33
34#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
35
36void ocfs2_print_version(void)
37{
38 printk(KERN_INFO "%s\n", VERSION_STR);
39}
40
41MODULE_DESCRIPTION(VERSION_STR);
42
43MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
new file mode 100644
index 000000000000..d7395cb91d2f
--- /dev/null
+++ b/fs/ocfs2/ver.h
@@ -0,0 +1,31 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ver.h
5 *
6 * Function prototypes
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#ifndef OCFS2_VER_H
27#define OCFS2_VER_H
28
29void ocfs2_print_version(void);
30
31#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
new file mode 100644
index 000000000000..021978e0576b
--- /dev/null
+++ b/fs/ocfs2/vote.c
@@ -0,0 +1,1202 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.c
5 *
6 * Cluster-wide vote request and response handling.
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/smp_lock.h>
30#include <linux/kthread.h>
31
32#include <cluster/heartbeat.h>
33#include <cluster/nodemanager.h>
34#include <cluster/tcp.h>
35
36#include <dlm/dlmapi.h>
37
38#define MLOG_MASK_PREFIX ML_VOTE
39#include <cluster/masklog.h>
40
41#include "ocfs2.h"
42
43#include "alloc.h"
44#include "dlmglue.h"
45#include "extent_map.h"
46#include "heartbeat.h"
47#include "inode.h"
48#include "journal.h"
49#include "slot_map.h"
50#include "vote.h"
51
52#include "buffer_head_io.h"
53
54#define OCFS2_MESSAGE_TYPE_VOTE (0x1)
55#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2)
56struct ocfs2_msg_hdr
57{
58	__be32 h_response_id; /* used to look up the message handle on
59			       * the sending node. */
60 __be32 h_request;
61 __be64 h_blkno;
62 __be32 h_generation;
63 __be32 h_node_num; /* node sending this particular message. */
64};
65
66/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
67 * for the network. */
68#define OCFS2_VOTE_FILENAME_LEN 256
69struct ocfs2_vote_msg
70{
71 struct ocfs2_msg_hdr v_hdr;
72 union {
73 __be32 v_generic1;
74 __be32 v_orphaned_slot; /* Used during delete votes */
75 __be32 v_nlink; /* Used during unlink votes */
76	} md1;			/* Message type dependent 1 */
77 __be32 v_unlink_namelen;
78 __be64 v_unlink_parent;
79 u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN];
80};
81
82/* Responses are given these values to maintain backwards
83 * compatibility with older ocfs2 versions */
84#define OCFS2_RESPONSE_OK (0)
85#define OCFS2_RESPONSE_BUSY (-16)
86#define OCFS2_RESPONSE_BAD_MSG (-22)
87
88struct ocfs2_response_msg
89{
90 struct ocfs2_msg_hdr r_hdr;
91 __be32 r_response;
92 __be32 r_orphaned_slot;
93};
94
95struct ocfs2_vote_work {
96 struct list_head w_list;
97 struct ocfs2_vote_msg w_msg;
98};
99
100enum ocfs2_vote_request {
101 OCFS2_VOTE_REQ_INVALID = 0,
102 OCFS2_VOTE_REQ_DELETE,
103 OCFS2_VOTE_REQ_UNLINK,
104 OCFS2_VOTE_REQ_RENAME,
105 OCFS2_VOTE_REQ_MOUNT,
106 OCFS2_VOTE_REQ_UMOUNT,
107 OCFS2_VOTE_REQ_LAST
108};
109
110static inline int ocfs2_is_valid_vote_request(int request)
111{
112 return OCFS2_VOTE_REQ_INVALID < request &&
113 request < OCFS2_VOTE_REQ_LAST;
114}
115
116typedef void (*ocfs2_net_response_callback)(void *priv,
117 struct ocfs2_response_msg *resp);
118struct ocfs2_net_response_cb {
119 ocfs2_net_response_callback rc_cb;
120 void *rc_priv;
121};
122
123struct ocfs2_net_wait_ctxt {
124 struct list_head n_list;
125 u32 n_response_id;
126 wait_queue_head_t n_event;
127 struct ocfs2_node_map n_node_map;
128	int n_response; /* an aggregate response. 0 if
129 * all nodes are go, < 0 on any
130 * negative response from any
131 * node or network error. */
132 struct ocfs2_net_response_cb *n_callback;
133};
134
135static void ocfs2_process_mount_request(struct ocfs2_super *osb,
136 unsigned int node_num)
137{
138 mlog(0, "MOUNT vote from node %u\n", node_num);
139	/* The other node only sends us this message when it holds an EX
140	 * lock on the superblock, so any recovery threads we may have
141	 * launched are waiting on it. */
142 ocfs2_recovery_map_clear(osb, node_num);
143 ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num);
144
145 /* We clear the umount map here because a node may have been
146 * previously mounted, safely unmounted but never stopped
147 * heartbeating - in which case we'd have a stale entry. */
148 ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
149}
150
151static void ocfs2_process_umount_request(struct ocfs2_super *osb,
152 unsigned int node_num)
153{
154 mlog(0, "UMOUNT vote from node %u\n", node_num);
155 ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num);
156 ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
157}
158
159void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
160{
161 struct ocfs2_inode_info *oi = OCFS2_I(inode);
162
163 assert_spin_locked(&oi->ip_lock);
164 /* We set the SKIP_DELETE flag on the inode so we don't try to
165 * delete it in delete_inode ourselves, thus avoiding
166	 * unnecessary lock pinging. If the other node failed to wipe
167 * the inode as a result of a crash, then recovery will pick
168 * up the slack. */
169 oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
170}
171
172static int ocfs2_process_delete_request(struct inode *inode,
173 int *orphaned_slot)
174{
175 int response = OCFS2_RESPONSE_BUSY;
176
177 mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
178 inode->i_ino, inode->i_nlink, *orphaned_slot);
179
180 spin_lock(&OCFS2_I(inode)->ip_lock);
181
182 /* Whatever our vote response is, we want to make sure that
183 * the orphaned slot is recorded properly on this node *and*
184 * on the requesting node. Technically, if the requesting node
185 * did not know which slot the inode is orphaned in but we
186	 * respond with BUSY, it doesn't actually need the orphaned
187 * slot, but it doesn't hurt to do it here anyway. */
188 if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
189 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
190 OCFS2_INVALID_SLOT &&
191 OCFS2_I(inode)->ip_orphaned_slot !=
192 (*orphaned_slot),
193 "Inode %"MLFu64": This node thinks it's "
194 "orphaned in slot %d, messaged it's in %d\n",
195 OCFS2_I(inode)->ip_blkno,
196 OCFS2_I(inode)->ip_orphaned_slot,
197 *orphaned_slot);
198
199 mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n",
200 OCFS2_I(inode)->ip_blkno, *orphaned_slot);
201
202 OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
203 } else {
204 mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n",
205 OCFS2_I(inode)->ip_orphaned_slot,
206 OCFS2_I(inode)->ip_blkno);
207
208 *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
209 }
210
211 /* vote no if the file is still open. */
212 if (OCFS2_I(inode)->ip_open_count) {
213 mlog(0, "open count = %u\n",
214 OCFS2_I(inode)->ip_open_count);
215 spin_unlock(&OCFS2_I(inode)->ip_lock);
216 goto done;
217 }
218 spin_unlock(&OCFS2_I(inode)->ip_lock);
219
220 /* directories are a bit ugly... What if someone is sitting in
221 * it? We want to make sure the inode is removed completely as
222 * a result of the iput in process_vote. */
223 if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
224 mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
225 goto done;
226 }
227
228 if (filemap_fdatawrite(inode->i_mapping)) {
229 mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n",
230 OCFS2_I(inode)->ip_blkno);
231 goto done;
232 }
233 sync_mapping_buffers(inode->i_mapping);
234 truncate_inode_pages(inode->i_mapping, 0);
235 ocfs2_extent_map_trunc(inode, 0);
236
237 spin_lock(&OCFS2_I(inode)->ip_lock);
238 /* double check open count - someone might have raced this
239 * thread into ocfs2_file_open while we were writing out
240 * data. If we're to allow a wipe of this inode now, we *must*
241 * hold the spinlock until we've marked it. */
242 if (OCFS2_I(inode)->ip_open_count) {
243 mlog(0, "Raced to wipe! open count = %u\n",
244 OCFS2_I(inode)->ip_open_count);
245 spin_unlock(&OCFS2_I(inode)->ip_lock);
246 goto done;
247 }
248
249 /* Mark the inode as being wiped from disk. */
250 ocfs2_mark_inode_remotely_deleted(inode);
251 spin_unlock(&OCFS2_I(inode)->ip_lock);
252
253 /* Not sure this is necessary anymore. */
254 d_prune_aliases(inode);
255
256 /* If we get here, then we're voting 'yes', so commit the
257 * delete on our side. */
258 response = OCFS2_RESPONSE_OK;
259done:
260 return response;
261}
262
263static int ocfs2_match_dentry(struct dentry *dentry,
264 u64 parent_blkno,
265 unsigned int namelen,
266 const char *name)
267{
268 struct inode *parent;
269
270 if (!dentry->d_parent) {
271 mlog(0, "Detached from parent.\n");
272 return 0;
273 }
274
275 parent = dentry->d_parent->d_inode;
276 /* Negative parent dentry? */
277 if (!parent)
278 return 0;
279
280 /* Name is in a different directory. */
281 if (OCFS2_I(parent)->ip_blkno != parent_blkno)
282 return 0;
283
284 if (dentry->d_name.len != namelen)
285 return 0;
286
287 /* comparison above guarantees this is safe. */
288 if (memcmp(dentry->d_name.name, name, namelen))
289 return 0;
290
291 return 1;
292}
293
294static void ocfs2_process_dentry_request(struct inode *inode,
295 int rename,
296 unsigned int new_nlink,
297 u64 parent_blkno,
298 unsigned int namelen,
299 const char *name)
300{
301 struct dentry *dentry = NULL;
302 struct list_head *p;
303 struct ocfs2_inode_info *oi = OCFS2_I(inode);
304
305 mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno,
306 namelen, namelen, name);
307
308 spin_lock(&dcache_lock);
309
310 /* Another node is removing this name from the system. It is
311 * up to us to find the corresponding dentry and if it exists,
312 * unhash it from the dcache. */
313 list_for_each(p, &inode->i_dentry) {
314 dentry = list_entry(p, struct dentry, d_alias);
315
316 if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) {
317 mlog(0, "dentry found: %.*s\n",
318 dentry->d_name.len, dentry->d_name.name);
319
320 dget_locked(dentry);
321 break;
322 }
323
324 dentry = NULL;
325 }
326
327 spin_unlock(&dcache_lock);
328
329 if (dentry) {
330 d_delete(dentry);
331 dput(dentry);
332 }
333
334 /* rename votes don't send link counts */
335 if (!rename) {
336 mlog(0, "new_nlink = %u\n", new_nlink);
337
338 /* We don't have the proper locks here to directly
339 * change i_nlink and besides, the vote is sent
340 * *before* the operation so it may have failed on the
341 * other node. This passes a hint to ocfs2_drop_inode
342 * to force ocfs2_delete_inode, who will take the
343 * proper cluster locks to sort things out. */
344 if (new_nlink == 0) {
345 spin_lock(&oi->ip_lock);
346 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
347			spin_unlock(&oi->ip_lock);
348 }
349 }
350}
351
352static void ocfs2_process_vote(struct ocfs2_super *osb,
353 struct ocfs2_vote_msg *msg)
354{
355 int net_status, vote_response;
356 int orphaned_slot = 0;
357 int rename = 0;
358 unsigned int node_num, generation, new_nlink, namelen;
359 u64 blkno, parent_blkno;
360 enum ocfs2_vote_request request;
361 struct inode *inode = NULL;
362 struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
363 struct ocfs2_response_msg response;
364
365 /* decode the network mumbo jumbo into local variables. */
366 request = be32_to_cpu(hdr->h_request);
367 blkno = be64_to_cpu(hdr->h_blkno);
368 generation = be32_to_cpu(hdr->h_generation);
369 node_num = be32_to_cpu(hdr->h_node_num);
370 if (request == OCFS2_VOTE_REQ_DELETE)
371 orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
372
373 mlog(0, "processing vote: request = %u, blkno = %"MLFu64", "
374 "generation = %u, node_num = %u, priv1 = %u\n", request,
375 blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1));
376
377 if (!ocfs2_is_valid_vote_request(request)) {
378 mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
379 request, node_num);
380 vote_response = OCFS2_RESPONSE_BAD_MSG;
381 goto respond;
382 }
383
384 vote_response = OCFS2_RESPONSE_OK;
385
386 switch (request) {
387 case OCFS2_VOTE_REQ_UMOUNT:
388 ocfs2_process_umount_request(osb, node_num);
389 goto respond;
390 case OCFS2_VOTE_REQ_MOUNT:
391 ocfs2_process_mount_request(osb, node_num);
392 goto respond;
393 default:
394 /* avoids a gcc warning */
395 break;
396 }
397
398 /* We cannot process the remaining message types before we're
399 * fully mounted. It's perfectly safe however to send a 'yes'
400 * response as we can't possibly have any of the state they're
401 * asking us to modify yet. */
402 if (atomic_read(&osb->vol_state) == VOLUME_INIT)
403 goto respond;
404
405 /* If we get here, then the request is against an inode. */
406 inode = ocfs2_ilookup_for_vote(osb, blkno,
407 request == OCFS2_VOTE_REQ_DELETE);
408
409 /* Not finding the inode is perfectly valid - it means we're
410 * not interested in what the other node is about to do to it
411 * so in those cases we automatically respond with an
412 * affirmative. Cluster locking ensures that we won't race
413 * interest in the inode with this vote request. */
414 if (!inode)
415 goto respond;
416
417 /* Check generation values. It's possible for us to get a
418 * request against a stale inode. If so then we proceed as if
419 * we had not found an inode in the first place. */
420 if (inode->i_generation != generation) {
421 mlog(0, "generation passed %u != inode generation = %u, "
422 "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", "
423 "i_count = %u, message type = %u\n",
424 generation, inode->i_generation, OCFS2_I(inode)->ip_flags,
425 OCFS2_I(inode)->ip_blkno, blkno,
426 atomic_read(&inode->i_count), request);
427 iput(inode);
428 inode = NULL;
429 goto respond;
430 }
431
432 switch (request) {
433 case OCFS2_VOTE_REQ_DELETE:
434 vote_response = ocfs2_process_delete_request(inode,
435 &orphaned_slot);
436 break;
437 case OCFS2_VOTE_REQ_RENAME:
438 rename = 1;
439 /* fall through */
440 case OCFS2_VOTE_REQ_UNLINK:
441 parent_blkno = be64_to_cpu(msg->v_unlink_parent);
442 namelen = be32_to_cpu(msg->v_unlink_namelen);
443 /* new_nlink will be ignored in case of a rename vote */
444 new_nlink = be32_to_cpu(msg->md1.v_nlink);
445 ocfs2_process_dentry_request(inode, rename, new_nlink,
446 parent_blkno, namelen,
447 msg->v_unlink_dirent);
448 break;
449 default:
450 mlog(ML_ERROR, "node %u, invalid request: %u\n",
451 node_num, request);
452 vote_response = OCFS2_RESPONSE_BAD_MSG;
453 }
454
455respond:
456	/* Response structure is small so we just put it on the stack
457 * and stuff it inline. */
458 memset(&response, 0, sizeof(struct ocfs2_response_msg));
459 response.r_hdr.h_response_id = hdr->h_response_id;
460 response.r_hdr.h_blkno = hdr->h_blkno;
461 response.r_hdr.h_generation = hdr->h_generation;
462 response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
463 response.r_response = cpu_to_be32(vote_response);
464 response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
465
466 net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
467 osb->net_key,
468 &response,
469 sizeof(struct ocfs2_response_msg),
470 node_num,
471 NULL);
472 /* We still want to error print for ENOPROTOOPT here. The
473	 * sending node shouldn't have unregistered its net handler
474	 * without sending an unmount vote first. */
475 if (net_status < 0
476 && net_status != -ETIMEDOUT
477 && net_status != -ENOTCONN)
478 mlog(ML_ERROR, "message to node %u fails with error %d!\n",
479 node_num, net_status);
480
481 if (inode)
482 iput(inode);
483}
484
485static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
486{
487 unsigned long processed;
488 struct ocfs2_lock_res *lockres;
489 struct ocfs2_vote_work *work;
490
491 mlog_entry_void();
492
493 spin_lock(&osb->vote_task_lock);
494 /* grab this early so we know to try again if a state change and
495 * wake happens part-way through our work */
496 osb->vote_work_sequence = osb->vote_wake_sequence;
497
498 processed = osb->blocked_lock_count;
499 while (processed) {
500 BUG_ON(list_empty(&osb->blocked_lock_list));
501
502 lockres = list_entry(osb->blocked_lock_list.next,
503 struct ocfs2_lock_res, l_blocked_list);
504 list_del_init(&lockres->l_blocked_list);
505 osb->blocked_lock_count--;
506 spin_unlock(&osb->vote_task_lock);
507
508 BUG_ON(!processed);
509 processed--;
510
511 ocfs2_process_blocked_lock(osb, lockres);
512
513 spin_lock(&osb->vote_task_lock);
514 }
515
516 while (osb->vote_count) {
517 BUG_ON(list_empty(&osb->vote_list));
518 work = list_entry(osb->vote_list.next,
519 struct ocfs2_vote_work, w_list);
520 list_del(&work->w_list);
521 osb->vote_count--;
522 spin_unlock(&osb->vote_task_lock);
523
524 ocfs2_process_vote(osb, &work->w_msg);
525 kfree(work);
526
527 spin_lock(&osb->vote_task_lock);
528 }
529 spin_unlock(&osb->vote_task_lock);
530
531 mlog_exit_void();
532}
533
534static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb)
535{
536 int empty = 0;
537
538 spin_lock(&osb->vote_task_lock);
539 if (list_empty(&osb->blocked_lock_list) &&
540 list_empty(&osb->vote_list))
541 empty = 1;
542
543 spin_unlock(&osb->vote_task_lock);
544 return empty;
545}
546
547static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb)
548{
549 int should_wake = 0;
550
551 spin_lock(&osb->vote_task_lock);
552 if (osb->vote_work_sequence != osb->vote_wake_sequence)
553 should_wake = 1;
554 spin_unlock(&osb->vote_task_lock);
555
556 return should_wake;
557}
558
559int ocfs2_vote_thread(void *arg)
560{
561 int status = 0;
562 struct ocfs2_super *osb = arg;
563
564 /* only quit once we've been asked to stop and there is no more
565 * work available */
566 while (!(kthread_should_stop() &&
567 ocfs2_vote_thread_lists_empty(osb))) {
568
569 wait_event_interruptible(osb->vote_event,
570 ocfs2_vote_thread_should_wake(osb) ||
571 kthread_should_stop());
572
573 mlog(0, "vote_thread: awoken\n");
574
575 ocfs2_vote_thread_do_work(osb);
576 }
577
578 osb->vote_task = NULL;
579 return status;
580}
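For completeness, a sketch of how the vote thread would be started at mount and torn down at dismount; the real calls live in the mount and dismount paths outside this hunk, and the thread name here is illustrative:

/* kthread_stop() makes kthread_should_stop() return true and then
 * waits for ocfs2_vote_thread() to drain its lists and exit. */
static int example_start_vote_thread(struct ocfs2_super *osb)
{
	struct task_struct *task;

	task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
	if (IS_ERR(task))
		return PTR_ERR(task);
	osb->vote_task = task;
	return 0;
}

static void example_stop_vote_thread(struct ocfs2_super *osb)
{
	if (osb->vote_task)
		kthread_stop(osb->vote_task);
}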
581
582static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id)
583{
584 struct ocfs2_net_wait_ctxt *w;
585
586 w = kcalloc(1, sizeof(*w), GFP_KERNEL);
587 if (!w) {
588 mlog_errno(-ENOMEM);
589 goto bail;
590 }
591
592 INIT_LIST_HEAD(&w->n_list);
593 init_waitqueue_head(&w->n_event);
594 ocfs2_node_map_init(&w->n_node_map);
595 w->n_response_id = response_id;
596 w->n_callback = NULL;
597bail:
598 return w;
599}
600
601static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb)
602{
603 unsigned int ret;
604
605 spin_lock(&osb->net_response_lock);
606 ret = ++osb->net_response_ids;
607 spin_unlock(&osb->net_response_lock);
608
609 return ret;
610}
611
612static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb,
613 struct ocfs2_net_wait_ctxt *w)
614{
615 spin_lock(&osb->net_response_lock);
616 list_del(&w->n_list);
617 spin_unlock(&osb->net_response_lock);
618}
619
620static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb,
621 struct ocfs2_net_wait_ctxt *w)
622{
623 spin_lock(&osb->net_response_lock);
624 list_add_tail(&w->n_list,
625 &osb->net_response_list);
626 spin_unlock(&osb->net_response_lock);
627}
628
629static void __ocfs2_mark_node_responded(struct ocfs2_super *osb,
630 struct ocfs2_net_wait_ctxt *w,
631 int node_num)
632{
633 assert_spin_locked(&osb->net_response_lock);
634
635 ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num);
636 if (ocfs2_node_map_is_empty(osb, &w->n_node_map))
637 wake_up(&w->n_event);
638}
639
640 /* Intended to be called from the node-down callback; we artificially
641  * mark the dead node as responded in all of our response contexts. */
642void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
643 int node_num)
644{
645 struct list_head *p;
646 struct ocfs2_net_wait_ctxt *w = NULL;
647
648 spin_lock(&osb->net_response_lock);
649
650 list_for_each(p, &osb->net_response_list) {
651 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
652
653 __ocfs2_mark_node_responded(osb, w, node_num);
654 }
655
656 spin_unlock(&osb->net_response_lock);
657}
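The handler that consumes ocfs2_response_msg on the requesting node is not part of this hunk. A hedged sketch of how such a handler could tie the context list and the helpers above together; the function name and the first-error policy are assumptions, only the field usage follows the definitions at the top of this file:

static void example_handle_response(struct ocfs2_super *osb,
				    struct ocfs2_response_msg *resp)
{
	struct list_head *p;
	struct ocfs2_net_wait_ctxt *w;
	u32 id = be32_to_cpu(resp->r_hdr.h_response_id);
	int node = be32_to_cpu(resp->r_hdr.h_node_num);
	int status = be32_to_cpu(resp->r_response);

	spin_lock(&osb->net_response_lock);
	list_for_each(p, &osb->net_response_list) {
		w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
		if (w->n_response_id != id)
			continue;
		if (status < 0 && !w->n_response)
			w->n_response = status;	/* keep the first error */
		if (w->n_callback)
			w->n_callback->rc_cb(w->n_callback->rc_priv, resp);
		__ocfs2_mark_node_responded(osb, w, node);
		break;
	}
	spin_unlock(&osb->net_response_lock);
}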
658
659static int ocfs2_broadcast_vote(struct ocfs2_super *osb,
660 struct ocfs2_vote_msg *request,
661 unsigned int response_id,
662 int *response,
663 struct ocfs2_net_response_cb *callback)
664{
665 int status, i, remote_err;
666 struct ocfs2_net_wait_ctxt *w = NULL;
667 int dequeued = 0;
668
669 mlog_entry_void();
670
671 w = ocfs2_new_net_wait_ctxt(response_id);
672 if (!w) {
673 status = -ENOMEM;
674 mlog_errno(status);
675 goto bail;
676 }
677 w->n_callback = callback;
678
679 /* we're pretty much ready to go at this point, and this fills
680 * in n_response which we need anyway... */
681 ocfs2_queue_net_wait_ctxt(osb, w);
682
683 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0);
684
685 while (i != O2NM_INVALID_NODE_NUM) {
686 if (i != osb->node_num) {
687 mlog(0, "trying to send request to node %i\n", i);
688 ocfs2_node_map_set_bit(osb, &w->n_node_map, i);
689
690 remote_err = 0;
691 status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE,
692 osb->net_key,
693 request,
694 sizeof(*request),
695 i,
696 &remote_err);
697 if (status == -ETIMEDOUT) {
698 mlog(0, "remote node %d timed out!\n", i);
699 status = -EAGAIN;
700 goto bail;
701 }
702 if (remote_err < 0) {
703 status = remote_err;
704 mlog(0, "remote error %d on node %d!\n",
705 remote_err, i);
706 mlog_errno(status);
707 goto bail;
708 }
709 if (status < 0) {
710 mlog_errno(status);
711 goto bail;
712 }
713 }
714 i++;
715 i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i);
716 mlog(0, "next is %d, i am %d\n", i, osb->node_num);
717 }
718 mlog(0, "done sending, now waiting on responses...\n");
719
720 wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map));
721
722 ocfs2_dequeue_net_wait_ctxt(osb, w);
723 dequeued = 1;
724
725 *response = w->n_response;
726 status = 0;
727bail:
728 if (w) {
729 if (!dequeued)
730 ocfs2_dequeue_net_wait_ctxt(osb, w);
731 kfree(w);
732 }
733
734 mlog_exit(status);
735 return status;
736}
737
738static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
739 u64 blkno,
740 unsigned int generation,
741 enum ocfs2_vote_request type,
742 u32 priv)
743{
744 struct ocfs2_vote_msg *request;
745 struct ocfs2_msg_hdr *hdr;
746
747 BUG_ON(!ocfs2_is_valid_vote_request(type));
748
749 request = kcalloc(1, sizeof(*request), GFP_KERNEL);
750 if (!request) {
751 mlog_errno(-ENOMEM);
752 } else {
753 hdr = &request->v_hdr;
754 hdr->h_node_num = cpu_to_be32(osb->node_num);
755 hdr->h_request = cpu_to_be32(type);
756 hdr->h_blkno = cpu_to_be64(blkno);
757 hdr->h_generation = cpu_to_be32(generation);
758
759 request->md1.v_generic1 = cpu_to_be32(priv);
760 }
761
762 return request;
763}
764
765/* Complete the buildup of a new vote request and process the
766 * broadcast return value. */
767static int ocfs2_do_request_vote(struct ocfs2_super *osb,
768 struct ocfs2_vote_msg *request,
769 struct ocfs2_net_response_cb *callback)
770{
771 int status, response;
772 unsigned int response_id;
773 struct ocfs2_msg_hdr *hdr;
774
775 response_id = ocfs2_new_response_id(osb);
776
777 hdr = &request->v_hdr;
778 hdr->h_response_id = cpu_to_be32(response_id);
779
780 status = ocfs2_broadcast_vote(osb, request, response_id, &response,
781 callback);
782 if (status < 0) {
783 mlog_errno(status);
784 goto bail;
785 }
786
787 status = response;
788bail:
789
790 return status;
791}
792
793static int ocfs2_request_vote(struct inode *inode,
794 struct ocfs2_vote_msg *request,
795 struct ocfs2_net_response_cb *callback)
796{
797 int status;
798 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
799
800 if (ocfs2_inode_is_new(inode))
801 return 0;
802
803 status = -EAGAIN;
804 while (status == -EAGAIN) {
805 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
806 signal_pending(current))
807 return -ERESTARTSYS;
808
809 status = ocfs2_super_lock(osb, 0);
810 if (status < 0) {
811 mlog_errno(status);
812 break;
813 }
814
815 status = 0;
816 if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
817 osb->node_num))
818 status = ocfs2_do_request_vote(osb, request, callback);
819
820 ocfs2_super_unlock(osb, 0);
821 }
822 return status;
823}
824
825static void ocfs2_delete_response_cb(void *priv,
826 struct ocfs2_response_msg *resp)
827{
828 int orphaned_slot, node;
829 struct inode *inode = priv;
830
831 orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
832 node = be32_to_cpu(resp->r_hdr.h_node_num);
833 mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot "
834 "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot);
835
836 /* The other node may not actually know which slot the inode
837 * is orphaned in. */
838 if (orphaned_slot == OCFS2_INVALID_SLOT)
839 return;
840
841 /* Ok, the responding node knows which slot this inode is
842 * orphaned in. We verify that the information is correct and
843 * then record this in the inode. ocfs2_delete_inode will use
844 * this information to determine which lock to take. */
845 spin_lock(&OCFS2_I(inode)->ip_lock);
846 mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
847 OCFS2_I(inode)->ip_orphaned_slot
848 != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d "
849 "says it's orphaned in slot %d, we think it's in %d\n",
850 OCFS2_I(inode)->ip_blkno,
851 be32_to_cpu(resp->r_hdr.h_node_num),
852 orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
853
854 OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
855 spin_unlock(&OCFS2_I(inode)->ip_lock);
856}
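/* Protocol note: each responding node reports the slot it believes the
 * inode is orphaned in, with OCFS2_INVALID_SLOT meaning "don't know".
 * All definite answers must agree (hence the mlog_bug_on_msg above), so
 * whichever response arrives first can safely seed ip_orphaned_slot. */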
857
858int ocfs2_request_delete_vote(struct inode *inode)
859{
860 int orphaned_slot, status;
861 struct ocfs2_net_response_cb delete_cb;
862 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
863 struct ocfs2_vote_msg *request;
864
865 spin_lock(&OCFS2_I(inode)->ip_lock);
866 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
867 spin_unlock(&OCFS2_I(inode)->ip_lock);
868
869 delete_cb.rc_cb = ocfs2_delete_response_cb;
870 delete_cb.rc_priv = inode;
871
872 mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n",
873 OCFS2_I(inode)->ip_blkno, orphaned_slot);
874
875 status = -ENOMEM;
876 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
877 inode->i_generation,
878 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
879 if (request) {
880 status = ocfs2_request_vote(inode, request, &delete_cb);
881
882 kfree(request);
883 }
884
885 return status;
886}
887
888static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request,
889 struct dentry *dentry)
890{
891 struct inode *parent = dentry->d_parent->d_inode;
892
893 /* We need some values which will uniquely identify a dentry
894 * on the other nodes so that they can find it and run
895 * d_delete against it. Parent directory block and full name
896 * should suffice. */
897
898 mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n",
899 OCFS2_I(parent)->ip_blkno, dentry->d_name.len,
900 dentry->d_name.name);
901
902 request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno);
903 request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len);
904 memcpy(request->v_unlink_dirent, dentry->d_name.name,
905 dentry->d_name.len);
906}
907
908int ocfs2_request_unlink_vote(struct inode *inode,
909 struct dentry *dentry,
910 unsigned int nlink)
911{
912 int status;
913 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
914 struct ocfs2_vote_msg *request;
915
916 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
917 return -ENAMETOOLONG;
918
919 status = -ENOMEM;
920 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
921 inode->i_generation,
922 OCFS2_VOTE_REQ_UNLINK, nlink);
923 if (request) {
924 ocfs2_setup_unlink_vote(request, dentry);
925
926 status = ocfs2_request_vote(inode, request, NULL);
927
928 kfree(request);
929 }
930 return status;
931}
932
933int ocfs2_request_rename_vote(struct inode *inode,
934 struct dentry *dentry)
935{
936 int status;
937 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
938 struct ocfs2_vote_msg *request;
939
940 if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN)
941 return -ENAMETOOLONG;
942
943 status = -ENOMEM;
944 request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
945 inode->i_generation,
946 OCFS2_VOTE_REQ_RENAME, 0);
947 if (request) {
948 ocfs2_setup_unlink_vote(request, dentry);
949
950 status = ocfs2_request_vote(inode, request, NULL);
951
952 kfree(request);
953 }
954 return status;
955}
956
957int ocfs2_request_mount_vote(struct ocfs2_super *osb)
958{
959 int status;
960 struct ocfs2_vote_msg *request = NULL;
961
962 request = ocfs2_new_vote_request(osb, 0ULL, 0,
963 OCFS2_VOTE_REQ_MOUNT, 0);
964 if (!request) {
965 status = -ENOMEM;
966 goto bail;
967 }
968
969 status = -EAGAIN;
970 while (status == -EAGAIN) {
971 if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
972 signal_pending(current)) {
973 status = -ERESTARTSYS;
974 goto bail;
975 }
976
977 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
978 osb->node_num)) {
979 status = 0;
980 goto bail;
981 }
982
983 status = ocfs2_do_request_vote(osb, request, NULL);
984 }
985
986bail:
987 if (request)
988 kfree(request);
989
990 return status;
991}
992
993int ocfs2_request_umount_vote(struct ocfs2_super *osb)
994{
995 int status;
996 struct ocfs2_vote_msg *request = NULL;
997
998 request = ocfs2_new_vote_request(osb, 0ULL, 0,
999 OCFS2_VOTE_REQ_UMOUNT, 0);
1000 if (!request) {
1001 status = -ENOMEM;
1002 goto bail;
1003 }
1004
1005 status = -EAGAIN;
1006 while (status == -EAGAIN) {
1007 /* Do not check signals on this vote... We really want
1008 * this one to go all the way through. */
1009
1010 if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
1011 osb->node_num)) {
1012 status = 0;
1013 goto bail;
1014 }
1015
1016 status = ocfs2_do_request_vote(osb, request, NULL);
1017 }
1018
1019bail:
1020 if (request)
1021 kfree(request);
1022
1023 return status;
1024}
1025
1026/* TODO: This should eventually be a hash table! */
1027static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb,
1028 u32 response_id)
1029{
1030 struct list_head *p;
1031 struct ocfs2_net_wait_ctxt *w = NULL;
1032
1033 list_for_each(p, &osb->net_response_list) {
1034 w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list);
1035 if (response_id == w->n_response_id)
1036 break;
1037 w = NULL;
1038 }
1039
1040 return w;
1041}
1042
1043/* Translate response codes into local node errno values */
1044static inline int ocfs2_translate_response(int response)
1045{
1046 int ret;
1047
1048 switch (response) {
1049 case OCFS2_RESPONSE_OK:
1050 ret = 0;
1051 break;
1052
1053 case OCFS2_RESPONSE_BUSY:
1054 ret = -EBUSY;
1055 break;
1056
1057 default:
1058 ret = -EINVAL;
1059 }
1060
1061 return ret;
1062}
1063
1064static int ocfs2_handle_response_message(struct o2net_msg *msg,
1065 u32 len,
1066 void *data)
1067{
1068 unsigned int response_id, node_num;
1069 int response_status;
1070 struct ocfs2_super *osb = data;
1071 struct ocfs2_response_msg *resp;
1072 struct ocfs2_net_wait_ctxt * w;
1073 struct ocfs2_net_response_cb *resp_cb;
1074
1075 resp = (struct ocfs2_response_msg *) msg->buf;
1076
1077 response_id = be32_to_cpu(resp->r_hdr.h_response_id);
1078 node_num = be32_to_cpu(resp->r_hdr.h_node_num);
1079 response_status =
1080 ocfs2_translate_response(be32_to_cpu(resp->r_response));
1081
1082 mlog(0, "received response message:\n");
1083 mlog(0, "h_response_id = %u\n", response_id);
1084 mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request));
1085 mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno));
1086 mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation));
1087 mlog(0, "h_node_num = %u\n", node_num);
1088 mlog(0, "r_response = %d\n", response_status);
1089
1090 spin_lock(&osb->net_response_lock);
1091 w = __ocfs2_find_net_wait_ctxt(osb, response_id);
1092 if (!w) {
1093 mlog(0, "request not found!\n");
1094 goto bail;
1095 }
1096 resp_cb = w->n_callback;
1097
1098 if (response_status && (!w->n_response)) {
1099 /* we only really need one negative response so don't
1100 * set it twice. */
1101 w->n_response = response_status;
1102 }
1103
1104 if (resp_cb) {
1105 spin_unlock(&osb->net_response_lock);
1106
1107 resp_cb->rc_cb(resp_cb->rc_priv, resp);
1108
1109 spin_lock(&osb->net_response_lock);
1110 }
1111
1112 __ocfs2_mark_node_responded(osb, w, node_num);
1113bail:
1114 spin_unlock(&osb->net_response_lock);
1115
1116 return 0;
1117}
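/* net_response_lock is dropped around rc_cb above because the callback
 * may take other locks (the delete callback takes ip_lock).  This
 * node's bit in n_node_map is deliberately cleared only afterwards, so
 * the waiter normally cannot tear the context down mid-callback. */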
1118
1119static int ocfs2_handle_vote_message(struct o2net_msg *msg,
1120 u32 len,
1121 void *data)
1122{
1123 int status;
1124 struct ocfs2_super *osb = data;
1125 struct ocfs2_vote_work *work;
1126
1127 work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL);
1128 if (!work) {
1129 status = -ENOMEM;
1130 mlog_errno(status);
1131 goto bail;
1132 }
1133
1134 INIT_LIST_HEAD(&work->w_list);
1135 memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg));
1136
1137 mlog(0, "scheduling vote request:\n");
1138 mlog(0, "h_response_id = %u\n",
1139 be32_to_cpu(work->w_msg.v_hdr.h_response_id));
1140 mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request));
1141 mlog(0, "h_blkno = %"MLFu64"\n",
1142 be64_to_cpu(work->w_msg.v_hdr.h_blkno));
1143 mlog(0, "h_generation = %u\n",
1144 be32_to_cpu(work->w_msg.v_hdr.h_generation));
1145 mlog(0, "h_node_num = %u\n",
1146 be32_to_cpu(work->w_msg.v_hdr.h_node_num));
1147 mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
1148
1149 spin_lock(&osb->vote_task_lock);
1150 list_add_tail(&work->w_list, &osb->vote_list);
1151 osb->vote_count++;
1152 spin_unlock(&osb->vote_task_lock);
1153
1154 ocfs2_kick_vote_thread(osb);
1155
1156 status = 0;
1157bail:
1158 return status;
1159}
1160
1161void ocfs2_unregister_net_handlers(struct ocfs2_super *osb)
1162{
1163 if (!osb->net_key)
1164 return;
1165
1166 o2net_unregister_handler_list(&osb->osb_net_handlers);
1167
1168 if (!list_empty(&osb->net_response_list))
1169 mlog(ML_ERROR, "net response list not empty!\n");
1170
1171 osb->net_key = 0;
1172}
1173
1174int ocfs2_register_net_handlers(struct ocfs2_super *osb)
1175{
1176 int status = 0;
1177
1178 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE,
1179 osb->net_key,
1180 sizeof(struct ocfs2_response_msg),
1181 ocfs2_handle_response_message,
1182 osb, &osb->osb_net_handlers);
1183 if (status) {
1184 mlog_errno(status);
1185 goto bail;
1186 }
1187
1188 status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE,
1189 osb->net_key,
1190 sizeof(struct ocfs2_vote_msg),
1191 ocfs2_handle_vote_message,
1192 osb, &osb->osb_net_handlers);
1193 if (status) {
1194 mlog_errno(status);
1195 goto bail;
1196 }
1197bail:
1198 if (status < 0)
1199 ocfs2_unregister_net_handlers(osb);
1200
1201 return status;
1202}
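The broadcast machinery above reduces to a small reusable pattern: record the set of peers you expect to hear from, send to each, then sleep until the set drains (responses and node deaths both clear bits). A minimal userspace sketch of that pattern, with invented names (wait_ctxt, pending, mark_responded) standing in for the ocfs2 node map, w->n_event and o2net:

	#include <pthread.h>
	#include <stdint.h>

	struct wait_ctxt {
		pthread_mutex_t lock;
		pthread_cond_t  done;    /* plays the role of w->n_event    */
		uint64_t        pending; /* plays the role of w->n_node_map */
	};

	/* called for a response *or* a node death, exactly like
	 * __ocfs2_mark_node_responded() above */
	static void mark_responded(struct wait_ctxt *w, int node)
	{
		pthread_mutex_lock(&w->lock);
		w->pending &= ~(1ULL << node);
		if (w->pending == 0)
			pthread_cond_broadcast(&w->done);
		pthread_mutex_unlock(&w->lock);
	}

	static void wait_for_all(struct wait_ctxt *w)
	{
		pthread_mutex_lock(&w->lock);
		while (w->pending)       /* the wait_event() equivalent */
			pthread_cond_wait(&w->done, &w->lock);
		pthread_mutex_unlock(&w->lock);
	}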
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
new file mode 100644
index 000000000000..9cce60703466
--- /dev/null
+++ b/fs/ocfs2/vote.h
@@ -0,0 +1,56 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * vote.h
5 *
6 * Vote request and network message handler declarations for OCFS2.
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 */
25
26
27#ifndef VOTE_H
28#define VOTE_H
29
30int ocfs2_vote_thread(void *arg);
31static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
32{
33 spin_lock(&osb->vote_task_lock);
34 /* make sure the voting thread gets a swipe at whatever changes
35 * the caller may have made to the voting state */
36 osb->vote_wake_sequence++;
37 spin_unlock(&osb->vote_task_lock);
38 wake_up(&osb->vote_event);
39}
40
41int ocfs2_request_delete_vote(struct inode *inode);
42int ocfs2_request_unlink_vote(struct inode *inode,
43 struct dentry *dentry,
44 unsigned int nlink);
45int ocfs2_request_rename_vote(struct inode *inode,
46 struct dentry *dentry);
47int ocfs2_request_mount_vote(struct ocfs2_super *osb);
48int ocfs2_request_umount_vote(struct ocfs2_super *osb);
49int ocfs2_register_net_handlers(struct ocfs2_super *osb);
50void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
51
52void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
53
54void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
55 int node_num);
56#endif
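ocfs2_kick_vote_thread() above is a lost-wakeup guard: the producer bumps vote_wake_sequence under the lock before waking, and the worker samples the sequence before processing, so a kick that lands mid-processing is never slept through. A hedged sketch of the consumer side (hypothetical names; the real consumer is ocfs2_vote_thread(), declared above):

	#include <linux/kthread.h>
	#include <linux/spinlock.h>
	#include <linux/wait.h>

	static spinlock_t lock;
	static wait_queue_head_t event;
	static unsigned long wake_sequence;

	static int worker(void *unused)
	{
		unsigned long seen;

		while (!kthread_should_stop()) {
			spin_lock(&lock);
			seen = wake_sequence;	/* sample before working */
			spin_unlock(&lock);

			process_pending();	/* hypothetical: drain the queue */

			/* sleep only if nobody bumped the sequence meanwhile */
			wait_event_interruptible(event,
				wake_sequence != seen || kthread_should_stop());
		}
		return 0;
	}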
diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig
index 656bc43431b9..e227a04261ab 100644
--- a/fs/partitions/Kconfig
+++ b/fs/partitions/Kconfig
@@ -85,7 +85,7 @@ config ATARI_PARTITION
 
 config IBM_PARTITION
 	bool "IBM disk label and partition support"
-	depends on PARTITION_ADVANCED && ARCH_S390
+	depends on PARTITION_ADVANCED && S390
 	help
 	  Say Y here if you would like to be able to read the hard disk
 	  partition table format used by IBM DASD disks operating under CMS.
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 6327bcb2d73d..78010ad60e47 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -56,7 +56,10 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	struct hd_geometry *geo;
 	char type[5] = {0,};
 	char name[7] = {0,};
-	struct vtoc_volume_label *vlabel;
+	union label_t {
+		struct vtoc_volume_label vol;
+		struct vtoc_cms_label cms;
+	} *label;
 	unsigned char *data;
 	Sector sect;
 
@@ -64,9 +67,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		goto out_noinfo;
 	if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL)
 		goto out_nogeo;
-	if ((vlabel = kmalloc(sizeof(struct vtoc_volume_label),
-			GFP_KERNEL)) == NULL)
-		goto out_novlab;
+	if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL)
+		goto out_nolab;
 
 	if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 ||
 	    ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
@@ -87,7 +89,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		strncpy(name, data + 8, 6);
 	else
 		strncpy(name, data + 4, 6);
-	memcpy (vlabel, data, sizeof(struct vtoc_volume_label));
+	memcpy(label, data, sizeof(union label_t));
 	put_dev_sector(sect);
 
 	EBCASC(type, 4);
@@ -100,14 +102,12 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		/*
 		 * VM style CMS1 labeled disk
 		 */
-		int *label = (int *) vlabel;
-
-		if (label[13] != 0) {
+		if (label->cms.disk_offset != 0) {
 			printk("CMS1/%8s(MDSK):", name);
 			/* disk is reserved minidisk */
-			blocksize = label[3];
-			offset = label[13];
-			size = (label[7] - 1)*(blocksize >> 9);
+			blocksize = label->cms.block_size;
+			offset = label->cms.disk_offset;
+			size = (label->cms.block_count - 1) * (blocksize >> 9);
 		} else {
 			printk("CMS1/%8s:", name);
 			offset = (info->label_block + 1);
@@ -126,7 +126,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 		printk("VOL1/%8s:", name);
 
 		/* get block number and read then go through format1 labels */
-		blk = cchhb2blk(&vlabel->vtoc, geo) + 1;
+		blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
 		counter = 0;
 		while ((data = read_dev_sector(bdev, blk*(blocksize/512),
 				&sect)) != NULL) {
@@ -174,7 +174,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	}
 
 	printk("\n");
-	kfree(vlabel);
+	kfree(label);
 	kfree(geo);
 	kfree(info);
 	return 1;
@@ -182,8 +182,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 out_readerr:
 out_badsect:
 out_noioctl:
-	kfree(vlabel);
-out_novlab:
+	kfree(label);
+out_nolab:
 	kfree(geo);
 out_nogeo:
 	kfree(info);
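The union rewrite replaces bare int indexing (label[3], label[7], label[13]) with named CMS1 fields. Roughly, the layout it relies on looks like the sketch below; field names follow the patch, but the offsets and surrounding members are illustrative only — the authoritative definitions are struct vtoc_volume_label and struct vtoc_cms_label in the s390 vtoc headers:

	#include <stdint.h>

	/* illustrative shape only, not the real asm-s390 definition */
	struct vtoc_cms_label_sketch {
		uint8_t  label_id[4];	/* "CMS1" in EBCDIC             */
		uint8_t  vol_id[6];
		uint16_t version;
		uint32_t block_size;	/* was label[3]                 */
		uint32_t origin_ptr;
		uint32_t usable_count;
		uint32_t formatted_count;
		uint32_t block_count;	/* was label[7]                 */
		uint32_t used_count;
		uint32_t fst_size;
		uint32_t fst_count;
		uint8_t  format_date[6];
		uint8_t  reserved1[2];
		uint32_t disk_offset;	/* was label[13]; nonzero on a  */
					/* reserved minidisk            */
	};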
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3e1239e4b303..5e9251f65317 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,7 @@ int proc_pid_status(struct task_struct *task, char * buffer)
 	buffer = task_sig(task, buffer);
 	buffer = task_cap(task, buffer);
 	buffer = cpuset_task_status_allowed(task, buffer);
-#if defined(CONFIG_ARCH_S390)
+#if defined(CONFIG_S390)
 	buffer = task_show_regs(task, buffer);
 #endif
 	return buffer - orig;
diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile
index f096f3007091..5a0236e02ee1 100644
--- a/fs/ramfs/Makefile
+++ b/fs/ramfs/Makefile
@@ -4,4 +4,6 @@
 
 obj-$(CONFIG_RAMFS) += ramfs.o
 
-ramfs-objs := inode.o
+file-mmu-y := file-nommu.o
+file-mmu-$(CONFIG_MMU) := file-mmu.o
+ramfs-objs += inode.o $(file-mmu-y)
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
new file mode 100644
index 000000000000..2115383dcc8d
--- /dev/null
+++ b/fs/ramfs/file-mmu.c
@@ -0,0 +1,57 @@
1/* file-mmu.c: ramfs MMU-based file operations
2 *
3 * Resizable simple ram filesystem for Linux.
4 *
5 * Copyright (C) 2000 Linus Torvalds.
6 * 2000 Transmeta Corp.
7 *
8 * Usage limits added by David Gibson, Linuxcare Australia.
9 * This file is released under the GPL.
10 */
11
12/*
13 * NOTE! This filesystem is probably most useful
14 * not as a real filesystem, but as an example of
15 * how virtual filesystems can be written.
16 *
17 * It doesn't get much simpler than this. Consider
18 * that this file implements the full semantics of
19 * a POSIX-compliant read-write filesystem.
20 *
21 * Note in particular how the filesystem does not
22 * need to implement any data structures of its own
23 * to keep track of the virtual data: using the VFS
24 * caches is sufficient.
25 */
26
27#include <linux/module.h>
28#include <linux/fs.h>
29#include <linux/pagemap.h>
30#include <linux/highmem.h>
31#include <linux/init.h>
32#include <linux/string.h>
33#include <linux/smp_lock.h>
34#include <linux/backing-dev.h>
35#include <linux/ramfs.h>
36
37#include <asm/uaccess.h>
38#include "internal.h"
39
40struct address_space_operations ramfs_aops = {
41 .readpage = simple_readpage,
42 .prepare_write = simple_prepare_write,
43 .commit_write = simple_commit_write
44};
45
46struct file_operations ramfs_file_operations = {
47 .read = generic_file_read,
48 .write = generic_file_write,
49 .mmap = generic_file_mmap,
50 .fsync = simple_sync_file,
51 .sendfile = generic_file_sendfile,
52 .llseek = generic_file_llseek,
53};
54
55struct inode_operations ramfs_file_inode_operations = {
56 .getattr = simple_getattr,
57};
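Every method wired up above is a stock VFS/libfs helper — the "NOTE!" block is making exactly that point: the page cache carries all the data, so ramfs needs no bookkeeping of its own. For flavor, the read side bottoms out in something like this paraphrase of fs/libfs.c:simple_readpage() from this era (see the real source for the authoritative version):

	/* a ramfs page with no data yet simply reads back as zeroes */
	static int simple_readpage_sketch(struct file *file, struct page *page)
	{
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}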
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
new file mode 100644
index 000000000000..3f810acd0bfa
--- /dev/null
+++ b/fs/ramfs/file-nommu.c
@@ -0,0 +1,292 @@
1/* file-nommu.c: no-MMU version of ramfs
2 *
3 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/fs.h>
14#include <linux/pagemap.h>
15#include <linux/highmem.h>
16#include <linux/init.h>
17#include <linux/string.h>
18#include <linux/smp_lock.h>
19#include <linux/backing-dev.h>
20#include <linux/ramfs.h>
21#include <linux/quotaops.h>
22#include <linux/pagevec.h>
23#include <linux/mman.h>
24
25#include <asm/uaccess.h>
26#include "internal.h"
27
28static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
29
30struct address_space_operations ramfs_aops = {
31 .readpage = simple_readpage,
32 .prepare_write = simple_prepare_write,
33 .commit_write = simple_commit_write
34};
35
36struct file_operations ramfs_file_operations = {
37 .mmap = ramfs_nommu_mmap,
38 .get_unmapped_area = ramfs_nommu_get_unmapped_area,
39 .read = generic_file_read,
40 .write = generic_file_write,
41 .fsync = simple_sync_file,
42 .sendfile = generic_file_sendfile,
43 .llseek = generic_file_llseek,
44};
45
46struct inode_operations ramfs_file_inode_operations = {
47 .setattr = ramfs_nommu_setattr,
48 .getattr = simple_getattr,
49};
50
51/*****************************************************************************/
52/*
53 * add a contiguous set of pages into a ramfs inode when it's truncated from
54 * size 0 on the assumption that it's going to be used for an mmap of shared
55 * memory
56 */
57static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
58{
59 struct pagevec lru_pvec;
60 unsigned long npages, xpages, loop, limit;
61 struct page *pages;
62 unsigned order;
63 void *data;
64 int ret;
65
66 /* make various checks */
67 order = get_order(newsize);
68 if (unlikely(order >= MAX_ORDER))
69 goto too_big;
70
71 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
72 if (limit != RLIM_INFINITY && newsize > limit)
73 goto fsize_exceeded;
74
75 if (newsize > inode->i_sb->s_maxbytes)
76 goto too_big;
77
78 i_size_write(inode, newsize);
79
80 /* allocate enough contiguous pages to be able to satisfy the
81 * request */
82 pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
83 if (!pages)
84 return -ENOMEM;
85
86 /* split the high-order page into an array of single pages */
87 xpages = 1UL << order;
88 npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
89
90 for (loop = 0; loop < npages; loop++)
91 set_page_count(pages + loop, 1);
92
93 /* trim off any pages we don't actually require */
94 for (loop = npages; loop < xpages; loop++)
95 __free_page(pages + loop);
96
97 /* clear the memory we allocated */
98 newsize = PAGE_SIZE * npages;
99 data = page_address(pages);
100 memset(data, 0, newsize);
101
102 /* attach all the pages to the inode's address space */
103 pagevec_init(&lru_pvec, 0);
104 for (loop = 0; loop < npages; loop++) {
105 struct page *page = pages + loop;
106
107 ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
108 if (ret < 0)
109 goto add_error;
110
111 if (!pagevec_add(&lru_pvec, page))
112 __pagevec_lru_add(&lru_pvec);
113
114 unlock_page(page);
115 }
116
117 pagevec_lru_add(&lru_pvec);
118 return 0;
119
120 fsize_exceeded:
121 send_sig(SIGXFSZ, current, 0);
122 too_big:
123 return -EFBIG;
124
125 add_error:
126 page_cache_release(pages + loop);
127 for (loop++; loop < npages; loop++)
128 __free_page(pages + loop);
129 return ret;
130}
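/* Worked example for the trim logic above, assuming 4KiB pages: a
 * newsize of 20KiB gives order = get_order(20480) = 3, so xpages = 8
 * pages are allocated while npages = 5 are needed; pages 5..7 are
 * freed again.  The survivors stay physically contiguous, which is
 * what the no-MMU mmap path below depends on. */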
131
132/*****************************************************************************/
133/*
134 * check that file shrinkage doesn't leave any VMAs dangling in midair
135 */
136static int ramfs_nommu_check_mappings(struct inode *inode,
137 size_t newsize, size_t size)
138{
139 struct vm_area_struct *vma;
140 struct prio_tree_iter iter;
141
142 /* search for VMAs that fall within the dead zone */
143 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
144 newsize >> PAGE_SHIFT,
145 (size + PAGE_SIZE - 1) >> PAGE_SHIFT
146 ) {
147 /* found one - only interested if it's shared out of the page
148 * cache */
149 if (vma->vm_flags & VM_SHARED)
150 return -ETXTBSY; /* not quite true, but near enough */
151 }
152
153 return 0;
154}
155
156/*****************************************************************************/
157/*
158 * resize a ramfs inode, expanding it for a shared mapping or shrinking it safely
159 */
160static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
161{
162 int ret;
163
164 /* assume a truncate from zero size is going to be for the purposes of
165 * shared mmap */
166 if (size == 0) {
167 if (unlikely(newsize >> 32))
168 return -EFBIG;
169
170 return ramfs_nommu_expand_for_mapping(inode, newsize);
171 }
172
173 /* check that a decrease in size doesn't cut off any shared mappings */
174 if (newsize < size) {
175 ret = ramfs_nommu_check_mappings(inode, newsize, size);
176 if (ret < 0)
177 return ret;
178 }
179
180 ret = vmtruncate(inode, size);
181
182 return ret;
183}
184
185/*****************************************************************************/
186/*
187 * handle a change of attributes
188 * - we're specifically interested in a change of size
189 */
190static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
191{
192 struct inode *inode = dentry->d_inode;
193 unsigned int old_ia_valid = ia->ia_valid;
194 int ret = 0;
195
196 /* by providing our own setattr() method, we skip this quotaism */
197 if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
198 (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
199 ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
200
201 /* pick out size-changing events */
202 if (ia->ia_valid & ATTR_SIZE) {
203 loff_t size = i_size_read(inode);
204 if (ia->ia_size != size) {
205 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
206 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
207 goto out;
208 } else {
209 /* we skipped the truncate but must still update
210 * timestamps
211 */
212 ia->ia_valid |= ATTR_MTIME|ATTR_CTIME;
213 }
214 }
215
216 ret = inode_setattr(inode, ia);
217 out:
218 ia->ia_valid = old_ia_valid;
219 return ret;
220}
221
222/*****************************************************************************/
223/*
224 * try to determine where a shared mapping can be made
225 * - we require that:
226 * - the pages to be mapped must exist
227 * - the pages be physically contiguous in sequence
228 */
229unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
230 unsigned long addr, unsigned long len,
231 unsigned long pgoff, unsigned long flags)
232{
233 unsigned long maxpages, lpages, nr, loop, ret;
234 struct inode *inode = file->f_dentry->d_inode;
235 struct page **pages = NULL, **ptr, *page;
236 loff_t isize;
237
238 if (!(flags & MAP_SHARED))
239 return addr;
240
241 /* the mapping mustn't extend beyond the EOF */
242 lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
243 isize = i_size_read(inode);
244
245 ret = -EINVAL;
246 maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
247 if (pgoff >= maxpages)
248 goto out;
249
250 if (maxpages - pgoff < lpages)
251 goto out;
252
253 /* gang-find the pages */
254 ret = -ENOMEM;
255 pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
256 if (!pages)
257 goto out;
258
259 nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
260 if (nr != lpages)
261 goto out; /* leave if some pages were missing */
262
263 /* check the pages for physical adjacency */
264 ptr = pages;
265 page = *ptr++;
266 page++;
267 for (loop = lpages; loop > 1; loop--)
268 if (*ptr++ != page++)
269 goto out;
270
271 /* okay - all conditions fulfilled */
272 ret = (unsigned long) page_address(pages[0]);
273
274 out:
275 if (pages) {
276 ptr = pages;
277 for (loop = lpages; loop > 0; loop--)
278 put_page(*ptr++);
279 kfree(pages);
280 }
281
282 return ret;
283}
284
285/*****************************************************************************/
286/*
287 * set up a mapping
288 */
289int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
290{
291 return 0;
292}
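The adjacency test in ramfs_nommu_get_unmapped_area() leans on mem_map layout: struct page entries for physically consecutive pages occupy consecutive array slots, so a pointer increment per page is a sufficient contiguity check. Distilled into a self-contained sketch with hypothetical types:

	#include <stdbool.h>
	#include <stddef.h>

	struct fake_page { char pad[32]; };	/* stand-in for struct page */

	/* true iff the descriptors are consecutive array slots, i.e. the
	 * underlying pages are physically contiguous */
	static bool pages_contiguous(struct fake_page **pages, size_t n)
	{
		size_t i;

		for (i = 1; i < n; i++)
			if (pages[i] != pages[i - 1] + 1)
				return false;
		return true;
	}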
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0a88917605ae..c66bd5e4c05c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,13 +34,12 @@
 #include <linux/ramfs.h>
 
 #include <asm/uaccess.h>
+#include "internal.h"
 
 /* some random number */
 #define RAMFS_MAGIC	0x858458f6
 
 static struct super_operations ramfs_ops;
-static struct address_space_operations ramfs_aops;
-static struct inode_operations ramfs_file_inode_operations;
 static struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
@@ -142,25 +141,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
 	return error;
 }
 
-static struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.prepare_write	= simple_prepare_write,
-	.commit_write	= simple_commit_write
-};
-
-struct file_operations ramfs_file_operations = {
-	.read		= generic_file_read,
-	.write		= generic_file_write,
-	.mmap		= generic_file_mmap,
-	.fsync		= simple_sync_file,
-	.sendfile	= generic_file_sendfile,
-	.llseek		= generic_file_llseek,
-};
-
-static struct inode_operations ramfs_file_inode_operations = {
-	.getattr	= simple_getattr,
-};
-
 static struct inode_operations ramfs_dir_inode_operations = {
 	.create		= ramfs_create,
 	.lookup		= simple_lookup,
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
new file mode 100644
index 000000000000..272c8a7120b0
--- /dev/null
+++ b/fs/ramfs/internal.h
@@ -0,0 +1,15 @@
1/* internal.h: ramfs internal definitions
2 *
3 * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12
13extern struct address_space_operations ramfs_aops;
14extern struct file_operations ramfs_file_operations;
15extern struct inode_operations ramfs_file_inode_operations;
diff --git a/include/asm-alpha/atomic.h b/include/asm-alpha/atomic.h
index 6183eab006d4..cb03bbe92cdf 100644
--- a/include/asm-alpha/atomic.h
+++ b/include/asm-alpha/atomic.h
@@ -216,4 +216,5 @@ static __inline__ long atomic64_sub_return(long i, atomic64_t * v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* _ALPHA_ATOMIC_H */
diff --git a/include/asm-alpha/dma-mapping.h b/include/asm-alpha/dma-mapping.h
index 680f7ecbb28f..9dc7256cf979 100644
--- a/include/asm-alpha/dma-mapping.h
+++ b/include/asm-alpha/dma-mapping.h
@@ -16,7 +16,7 @@
 #define dma_free_coherent(dev, size, va, addr)		\
 		pci_free_consistent(alpha_gendev_to_pci(dev), size, va, addr)
 #define dma_map_page(dev, page, off, size, dir)		\
-		pci_map_single(alpha_gendev_to_pci(dev), page, off, size, dir)
+		pci_map_page(alpha_gendev_to_pci(dev), page, off, size, dir)
 #define dma_unmap_page(dev, addr, size, dir)		\
 		pci_unmap_page(alpha_gendev_to_pci(dev), addr, size, dir)
 #define dma_map_sg(dev, sg, nents, dir)			\
diff --git a/include/asm-alpha/hardirq.h b/include/asm-alpha/hardirq.h
index c0593f9b21e1..7bb6a36c96a1 100644
--- a/include/asm-alpha/hardirq.h
+++ b/include/asm-alpha/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
 
 #include <linux/irq_cpustat.h>	/* Standard mappings for irq_cpustat_t above */
 
+void ack_bad_irq(unsigned int irq);
+
 #define HARDIRQ_BITS	12
 
 /*
diff --git a/include/asm-alpha/mman.h b/include/asm-alpha/mman.h
index eb9c279045ef..f6439532a262 100644
--- a/include/asm-alpha/mman.h
+++ b/include/asm-alpha/mman.h
@@ -42,6 +42,7 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define	MADV_SPACEAVAIL	5		/* ensure resources are available */
 #define MADV_DONTNEED	6		/* don't need these pages */
+#define MADV_REMOVE	7		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-arm/atomic.h b/include/asm-arm/atomic.h
index d586f65c8228..f72b63309bc5 100644
--- a/include/asm-arm/atomic.h
+++ b/include/asm-arm/atomic.h
@@ -205,5 +205,6 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
 #endif
diff --git a/include/asm-arm/mman.h b/include/asm-arm/mman.h
index 8e4f69c4fa5f..f0bebca2ac21 100644
--- a/include/asm-arm/mman.h
+++ b/include/asm-arm/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-arm26/atomic.h b/include/asm-arm26/atomic.h
index a47cadc59686..3074b0e76343 100644
--- a/include/asm-arm26/atomic.h
+++ b/include/asm-arm26/atomic.h
@@ -118,5 +118,6 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
 #endif
diff --git a/include/asm-arm26/mman.h b/include/asm-arm26/mman.h
index cc27b8240265..0ed7780541fa 100644
--- a/include/asm-arm26/mman.h
+++ b/include/asm-arm26/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-cris/atomic.h b/include/asm-cris/atomic.h
index 683b05a57d88..2df2c7aa19b7 100644
--- a/include/asm-cris/atomic.h
+++ b/include/asm-cris/atomic.h
@@ -156,4 +156,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()    barrier()
 #define smp_mb__after_atomic_inc()     barrier()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-cris/mman.h b/include/asm-cris/mman.h
index 8570e72b9502..5a382b8bf3f7 100644
--- a/include/asm-cris/mman.h
+++ b/include/asm-cris/mman.h
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-frv/atomic.h b/include/asm-frv/atomic.h
index f6539ff569c5..3f54fea2b051 100644
--- a/include/asm-frv/atomic.h
+++ b/include/asm-frv/atomic.h
@@ -426,4 +426,5 @@ extern uint32_t __cmpxchg_32(uint32_t *v, uint32_t test, uint32_t new);
 })
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_ATOMIC_H */
diff --git a/include/asm-frv/futex.h b/include/asm-frv/futex.h
index 9feff4ce1424..fca9d90e32c9 100644
--- a/include/asm-frv/futex.h
+++ b/include/asm-frv/futex.h
@@ -7,47 +7,7 @@
 #include <asm/errno.h>
 #include <asm/uaccess.h>
 
-static inline int
-futex_atomic_op_inuser (int encoded_op, int __user *uaddr)
-{
-	int op = (encoded_op >> 28) & 7;
-	int cmp = (encoded_op >> 24) & 15;
-	int oparg = (encoded_op << 8) >> 20;
-	int cmparg = (encoded_op << 20) >> 20;
-	int oldval = 0, ret;
-	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-		oparg = 1 << oparg;
-
-	if (! access_ok (VERIFY_WRITE, uaddr, sizeof(int)))
-		return -EFAULT;
-
-	inc_preempt_count();
-
-	switch (op) {
-	case FUTEX_OP_SET:
-	case FUTEX_OP_ADD:
-	case FUTEX_OP_OR:
-	case FUTEX_OP_ANDN:
-	case FUTEX_OP_XOR:
-	default:
-		ret = -ENOSYS;
-	}
-
-	dec_preempt_count();
-
-	if (!ret) {
-		switch (cmp) {
-		case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break;
-		case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break;
-		case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break;
-		case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break;
-		case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break;
-		case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break;
-		default: ret = -ENOSYS;
-		}
-	}
-	return ret;
-}
+extern int futex_atomic_op_inuser(int encoded_op, int __user *uaddr);
 
 #endif
 #endif
diff --git a/include/asm-frv/mman.h b/include/asm-frv/mman.h
index c684720dfbdd..8af4a41c255e 100644
--- a/include/asm-frv/mman.h
+++ b/include/asm-frv/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-frv/signal.h b/include/asm-frv/signal.h
index d407bde57eca..67366894780f 100644
--- a/include/asm-frv/signal.h
+++ b/include/asm-frv/signal.h
@@ -151,7 +151,6 @@ typedef struct sigaltstack {
 	size_t ss_size;
 } stack_t;
 
-extern int do_signal(struct pt_regs *regs, sigset_t *oldset);
 #define ptrace_signal_deliver(regs, cookie) do { } while (0)
 
 #ifdef __KERNEL__
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
new file mode 100644
index 000000000000..e0a28b925ef0
--- /dev/null
+++ b/include/asm-generic/atomic.h
@@ -0,0 +1,116 @@
1#ifndef _ASM_GENERIC_ATOMIC_H
2#define _ASM_GENERIC_ATOMIC_H
3/*
4 * Copyright (C) 2005 Silicon Graphics, Inc.
5 * Christoph Lameter <clameter@sgi.com>
6 *
7 * Allows arch-independent atomic definitions to be provided without the
8 * need to edit every arch-specific atomic.h file.
9 */
10
11
12/*
13 * Support for atomic_long_t
14 *
15 * Casts for parameters are avoided for existing atomic functions in order to
16 * avoid issues with cast-as-lval under gcc 4.x and other limitations that the
17 * macros of a platform may have.
18 */
19
20#if BITS_PER_LONG == 64
21
22typedef atomic64_t atomic_long_t;
23
24#define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i)
25
26static inline long atomic_long_read(atomic_long_t *l)
27{
28 atomic64_t *v = (atomic64_t *)l;
29
30 return (long)atomic64_read(v);
31}
32
33static inline void atomic_long_set(atomic_long_t *l, long i)
34{
35 atomic64_t *v = (atomic64_t *)l;
36
37	atomic64_set(v, i);
38}
39
40static inline void atomic_long_inc(atomic_long_t *l)
41{
42 atomic64_t *v = (atomic64_t *)l;
43
44 atomic64_inc(v);
45}
46
47static inline void atomic_long_dec(atomic_long_t *l)
48{
49 atomic64_t *v = (atomic64_t *)l;
50
51 atomic64_dec(v);
52}
53
54static inline void atomic_long_add(long i, atomic_long_t *l)
55{
56 atomic64_t *v = (atomic64_t *)l;
57
58 atomic64_add(i, v);
59}
60
61static inline void atomic_long_sub(long i, atomic_long_t *l)
62{
63 atomic64_t *v = (atomic64_t *)l;
64
65 atomic64_sub(i, v);
66}
67
68#else
69
70typedef atomic_t atomic_long_t;
71
72#define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i)
73static inline long atomic_long_read(atomic_long_t *l)
74{
75 atomic_t *v = (atomic_t *)l;
76
77 return (long)atomic_read(v);
78}
79
80static inline void atomic_long_set(atomic_long_t *l, long i)
81{
82 atomic_t *v = (atomic_t *)l;
83
84 atomic_set(v, i);
85}
86
87static inline void atomic_long_inc(atomic_long_t *l)
88{
89 atomic_t *v = (atomic_t *)l;
90
91 atomic_inc(v);
92}
93
94static inline void atomic_long_dec(atomic_long_t *l)
95{
96 atomic_t *v = (atomic_t *)l;
97
98 atomic_dec(v);
99}
100
101static inline void atomic_long_add(long i, atomic_long_t *l)
102{
103 atomic_t *v = (atomic_t *)l;
104
105 atomic_add(i, v);
106}
107
108static inline void atomic_long_sub(long i, atomic_long_t *l)
109{
110 atomic_t *v = (atomic_t *)l;
111
112 atomic_sub(i, v);
113}
114
115#endif
116#endif
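The point of the new header is that generic code can keep a native-word-sized counter with no BITS_PER_LONG #ifdefs: the same source compiles to atomic64_t operations on 64-bit and atomic_t operations on 32-bit. A minimal, hypothetical user:

	#include <asm/atomic.h>

	static atomic_long_t nr_widgets = ATOMIC_LONG_INIT(0);

	static void widget_created(void)
	{
		atomic_long_inc(&nr_widgets);	/* safe from any context */
	}

	static long widgets_outstanding(void)
	{
		return atomic_long_read(&nr_widgets);
	}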
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 094d4917c1a9..35de20cf8fac 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -10,6 +10,8 @@
 #define ALIGN_FUNCTION()  . = ALIGN(8)
 
 #define RODATA								\
+	. = ALIGN(4096);						\
+	__start_rodata = .;						\
 	.rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {		\
 		*(.rodata) *(.rodata.*)					\
 		*(__vermagic)		/* Kernel version magic */	\
@@ -74,6 +76,8 @@
 	__ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
 	}								\
+	__end_rodata = .;						\
+	. = ALIGN(4096);						\
 									\
 	/* Built-in module parameters. */				\
 	__param : AT(ADDR(__param) - LOAD_OFFSET) {			\
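Page-aligning the section and exporting __start_rodata/__end_rodata is what makes the CONFIG_DEBUG_RODATA hook (declared in the cacheflush.h hunk below) workable: whole pages can then be remapped read-only. On i386 the implementation is essentially a change_page_attr() walk; sketched from the same merge, not verbatim:

	extern char __start_rodata[], __end_rodata[];

	void mark_rodata_ro(void)
	{
		unsigned long addr = (unsigned long)__start_rodata;

		for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
			change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);

		global_flush_tlb();	/* drop any stale writable mappings */
	}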
diff --git a/include/asm-h8300/atomic.h b/include/asm-h8300/atomic.h
index f23d86819ea8..d891541e89c3 100644
--- a/include/asm-h8300/atomic.h
+++ b/include/asm-h8300/atomic.h
@@ -137,4 +137,5 @@ static __inline__ void atomic_set_mask(unsigned long mask, unsigned long *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ARCH_H8300_ATOMIC __ */
diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h
index 5027181ed067..73065f5bda0e 100644
--- a/include/asm-h8300/irq.h
+++ b/include/asm-h8300/irq.h
@@ -61,11 +61,6 @@ static __inline__ int irq_canonicalize(int irq)
 
 extern void enable_irq(unsigned int);
 extern void disable_irq(unsigned int);
-
-/*
- * Some drivers want these entry points
- */
-#define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
 struct irqaction;
diff --git a/include/asm-h8300/mman.h b/include/asm-h8300/mman.h
index 63f727a59850..744a8fb485c2 100644
--- a/include/asm-h8300/mman.h
+++ b/include/asm-h8300/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-i386/atomic.h b/include/asm-i386/atomic.h
index c68557aa04b2..7a5472d77091 100644
--- a/include/asm-i386/atomic.h
+++ b/include/asm-i386/atomic.h
@@ -254,4 +254,5 @@ __asm__ __volatile__(LOCK "orl %0,%1" \
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index 4807aa1d2e3d..65679aca4b22 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -332,9 +332,9 @@ static inline unsigned long __ffs(unsigned long word)
  * Returns the bit-number of the first set bit, not the number of the byte
  * containing a bit.
  */
-static inline int find_first_bit(const unsigned long *addr, unsigned size)
+static inline unsigned find_first_bit(const unsigned long *addr, unsigned size)
 {
-	int x = 0;
+	unsigned x = 0;
 
 	while (x < size) {
 		unsigned long val = *addr++;
@@ -367,11 +367,6 @@ static inline unsigned long ffz(unsigned long word)
 	return word;
 }
 
-/*
- * fls: find last bit set.
- */
-
-#define fls(x) generic_fls(x)
 #define fls64(x)   generic_fls64(x)
 
 #ifdef __KERNEL__
@@ -415,6 +410,23 @@ static inline int ffs(int x)
 }
 
 /**
+ * fls - find last bit set
+ * @x: the word to search
+ *
+ * This is defined the same way as ffs.
+ */
+static inline int fls(int x)
+{
+	int r;
+
+	__asm__("bsrl %1,%0\n\t"
+		"jnz 1f\n\t"
+		"movl $-1,%0\n"
+		"1:" : "=r" (r) : "rm" (x));
+	return r+1;
+}
+
+/**
  * hweightN - returns the hamming weight of a N-bit word
  * @x: the word to weigh
  *
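The new inline fls() maps straight onto bsrl, with the jnz sequence covering the one input (zero) whose bsrl result is undefined. The semantics are worth pinning down with examples, since off-by-ones are easy here: fls(0) == 0, fls(1) == 1, fls(0x80000000) == 32 — the 1-based index of the highest set bit. A portable rendition for comparison:

	/* portable equivalent of the bsrl-based fls() above */
	static inline int fls_portable(unsigned int x)
	{
		int r = 0;

		while (x) {
			x >>= 1;	/* count shifts needed to empty the word */
			r++;
		}
		return r;
	}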
diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h
index ea54540638d2..50233e0345fb 100644
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -8,9 +8,6 @@
  *	<rreilova@ececs.uc.edu>
  *	- Channing Corn (tests & fixes),
  *	- Andrew D. Balsa (code cleanup).
- *
- * Pentium III FXSR, SSE support
- * Gareth Hughes <gareth@valinux.com>, May 2000
  */
 
 /*
@@ -76,25 +73,7 @@ static void __init check_fpu(void)
 		return;
 	}
 
-/* Enable FXSR and company _before_ testing for FP problems. */
-	/*
-	 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
-	 */
-	if (offsetof(struct task_struct, thread.i387.fxsave) & 15) {
-		extern void __buggy_fxsr_alignment(void);
-		__buggy_fxsr_alignment();
-	}
-	if (cpu_has_fxsr) {
-		printk(KERN_INFO "Enabling fast FPU save and restore... ");
-		set_in_cr4(X86_CR4_OSFXSR);
-		printk("done.\n");
-	}
-	if (cpu_has_xmm) {
-		printk(KERN_INFO "Enabling unmasked SIMD FPU exception support... ");
-		set_in_cr4(X86_CR4_OSXMMEXCPT);
-		printk("done.\n");
-	}
-
+/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
 	/* Test for the divl bug.. */
 	__asm__("fninit\n\t"
 		"fldl %1\n\t"
diff --git a/include/asm-i386/cacheflush.h b/include/asm-i386/cacheflush.h
index 2ea36dea37d9..7199f7b326f1 100644
--- a/include/asm-i386/cacheflush.h
+++ b/include/asm-i386/cacheflush.h
@@ -31,4 +31,8 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot);
 void kernel_map_pages(struct page *page, int numpages, int enable);
 #endif
 
+#ifdef CONFIG_DEBUG_RODATA
+void mark_rodata_ro(void);
+#endif
+
 #endif /* _I386_CACHEFLUSH_H */
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index 29b851a18c6e..494e73bca095 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -15,9 +15,6 @@
 #include <asm/mmu.h>
 
 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
-DECLARE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
-
-#define get_cpu_gdt_table(_cpu) (per_cpu(cpu_gdt_table,_cpu))
 
 DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
 
@@ -29,6 +26,11 @@ struct Xgt_desc_struct {
 
 extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];
 
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+	return ((struct desc_struct *)cpu_gdt_descr[cpu].address);
+}
+
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
 #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
 
diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h
index ba936d4daedb..18b19a773440 100644
--- a/include/asm-i386/mach-bigsmp/mach_apic.h
+++ b/include/asm-i386/mach-bigsmp/mach_apic.h
@@ -1,17 +1,10 @@
 #ifndef __ASM_MACH_APIC_H
 #define __ASM_MACH_APIC_H
-#include <asm/smp.h>
+
 
-#define SEQUENTIAL_APICID
-#ifdef SEQUENTIAL_APICID
-#define xapic_phys_to_log_apicid(phys_apic) ( (1ul << ((phys_apic) & 0x3)) |\
-		((phys_apic<<2) & (~0xf)) )
-#elif CLUSTERED_APICID
-#define xapic_phys_to_log_apicid(phys_apic) ( (1ul << ((phys_apic) & 0x3)) |\
-		((phys_apic) & (~0xf)) )
-#endif
-
-#define NO_BALANCE_IRQ (1)
+extern u8 bios_cpu_apicid[];
+
+#define xapic_phys_to_log_apicid(cpu) (bios_cpu_apicid[cpu])
 #define esr_disable (1)
 
 static inline int apic_id_registered(void)
@@ -19,7 +12,6 @@ static inline int apic_id_registered(void)
 	return (1);
 }
 
-#define APIC_DFR_VALUE	(APIC_DFR_CLUSTER)
 /* Round robin the irqs among the online cpus */
 static inline cpumask_t target_cpus(void)
 {
@@ -32,29 +24,34 @@ static inline cpumask_t target_cpus(void)
 	} while (cpu >= NR_CPUS);
 	return cpumask_of_cpu(cpu);
 }
-#define TARGET_CPUS	(target_cpus())
 
-#define INT_DELIVERY_MODE	dest_Fixed
-#define INT_DEST_MODE		1 /* logical delivery broadcast to all procs */
+#undef APIC_DEST_LOGICAL
+#define APIC_DEST_LOGICAL	0
+#define TARGET_CPUS		(target_cpus())
+#define APIC_DFR_VALUE		(APIC_DFR_FLAT)
+#define INT_DELIVERY_MODE	(dest_Fixed)
+#define INT_DEST_MODE		(0)    /* phys delivery to target proc */
+#define NO_BALANCE_IRQ		(0)
+#define WAKE_SECONDARY_VIA_INIT
+
 
 static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
 {
-	return 0;
+	return (0);
 }
 
-/* we don't use the phys_cpu_present_map to indicate apicid presence */
 static inline unsigned long check_apicid_present(int bit)
 {
-	return 1;
+	return (1);
 }
 
-#define apicid_cluster(apicid) (apicid & 0xF0)
-
-static inline unsigned long calculate_ldr(unsigned long old)
+static inline unsigned long calculate_ldr(int cpu)
 {
-	unsigned long id;
-	id = xapic_phys_to_log_apicid(hard_smp_processor_id());
-	return ((old & ~APIC_LDR_MASK) | SET_APIC_LOGICAL_ID(id));
+	unsigned long val, id;
+	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+	id = xapic_phys_to_log_apicid(cpu);
+	val |= SET_APIC_LOGICAL_ID(id);
+	return val;
 }
 
 /*
@@ -67,37 +64,35 @@ static inline unsigned long calculate_ldr(unsigned long old)
 static inline void init_apic_ldr(void)
 {
 	unsigned long val;
+	int cpu = smp_processor_id();
 
 	apic_write_around(APIC_DFR, APIC_DFR_VALUE);
-	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
-	val = calculate_ldr(val);
+	val = calculate_ldr(cpu);
 	apic_write_around(APIC_LDR, val);
 }
 
 static inline void clustered_apic_check(void)
 {
 	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
-			"Cluster", nr_ioapics);
+			"Physflat", nr_ioapics);
 }
 
 static inline int multi_timer_check(int apic, int irq)
 {
-	return 0;
+	return (0);
 }
 
 static inline int apicid_to_node(int logical_apicid)
 {
-	return 0;
+	return (0);
 }
 
-extern u8 bios_cpu_apicid[];
-
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < NR_CPUS)
-		return (int)bios_cpu_apicid[mps_cpu];
-	else
-		return BAD_APICID;
+		return (int) bios_cpu_apicid[mps_cpu];
+
+	return BAD_APICID;
 }
 
 static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
@@ -109,10 +104,10 @@ extern u8 cpu_2_logical_apicid[];
 /* Mapping from cpu number to logical apicid */
 static inline int cpu_to_logical_apicid(int cpu)
 {
 	if (cpu >= NR_CPUS)
 		return BAD_APICID;
-	return (int)cpu_2_logical_apicid[cpu];
- }
+	return cpu_physical_id(cpu);
+}
 
 static inline int mpc_apic_id(struct mpc_config_processor *m,
 	struct mpc_config_translation *translation_record)
@@ -128,11 +123,9 @@ static inline int mpc_apic_id(struct mpc_config_processor *m,
 static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
 {
 	/* For clustered we don't have a good way to do this yet - hack */
-	return physids_promote(0xFUL);
+	return physids_promote(0xFFL);
 }
 
-#define WAKE_SECONDARY_VIA_INIT
-
 static inline void setup_portio_remap(void)
 {
 }
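
The net effect of the mach_apic.h hunks above: bigsmp stops pretending to do clustered logical delivery and instead programs each CPU's LDR with its BIOS-reported APIC id in flat format. A minimal standalone sketch of the arithmetic calculate_ldr() now performs; the mask and shift mirror the usual apicdef.h values (APIC_LDR_MASK = 0xFF<<24, SET_APIC_LOGICAL_ID(x) = (x)<<24), which should be treated as assumptions here:

#include <stdio.h>

#define APIC_LDR_MASK		(0xFFu << 24)	/* assumed apicdef.h value */
#define SET_APIC_LOGICAL_ID(x)	((unsigned)(x) << 24)

static unsigned calculate_ldr(unsigned old_ldr, unsigned char logical_id)
{
	unsigned val = old_ldr & ~APIC_LDR_MASK;   /* keep the low, reserved bits */
	return val | SET_APIC_LOGICAL_ID(logical_id);
}

int main(void)
{
	/* e.g. BIOS-assigned APIC id 3: the logical id lands in bits 31..24 */
	printf("LDR = %#010x\n", calculate_ldr(0, 3));	/* -> 0x03000000 */
	return 0;
}
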
diff --git a/include/asm-i386/mach-bigsmp/mach_apicdef.h b/include/asm-i386/mach-bigsmp/mach_apicdef.h
index 23e58b317c79..a58ab5a75c8c 100644
--- a/include/asm-i386/mach-bigsmp/mach_apicdef.h
+++ b/include/asm-i386/mach-bigsmp/mach_apicdef.h
@@ -1,11 +1,11 @@
 #ifndef __ASM_MACH_APICDEF_H
 #define __ASM_MACH_APICDEF_H
 
-#define APIC_ID_MASK (0x0F<<24)
+#define APIC_ID_MASK (0xFF<<24)
 
 static inline unsigned get_apic_id(unsigned long x)
 {
-	return (((x)>>24)&0x0F);
+	return (((x)>>24)&0xFF);
 }
 
 #define GET_APIC_ID(x) get_apic_id(x)
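
The apicdef change widens the APIC id field from 4 bits to 8, matching the 0xFF physid mask now used in ioapic_phys_id_map() above. A small compilable sketch of what the wider mask buys (the register value is invented for illustration):

#include <stdio.h>

static unsigned get_apic_id_old(unsigned long x) { return (x >> 24) & 0x0F; }
static unsigned get_apic_id_new(unsigned long x) { return (x >> 24) & 0xFF; }

int main(void)
{
	unsigned long apic_id_reg = 0x21UL << 24;	/* a CPU with APIC id 33 */
	printf("old mask: %u, new mask: %u\n",
	       get_apic_id_old(apic_id_reg),	/* 1  (silently truncated) */
	       get_apic_id_new(apic_id_reg));	/* 33 (correct)            */
	return 0;
}
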
diff --git a/include/asm-i386/mman.h b/include/asm-i386/mman.h
index 196619a83854..ba4941e6f643 100644
--- a/include/asm-i386/mman.h
+++ b/include/asm-i386/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
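
MADV_REMOVE, added here for every architecture in this merge, frees both the page range and its backing store, punching a hole in tmpfs-backed mappings. A minimal userspace sketch, assuming a libc that exposes MADV_REMOVE and a hole-punch-capable filesystem such as tmpfs (error handling trimmed to essentials):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 4096 * 4;
	int fd = open("/dev/shm/madv_remove_demo", O_CREAT | O_RDWR, 0600);
	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;

	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, len);

	/* Drop the pages *and* their backing store for the middle two pages. */
	if (madvise(p + 4096, 4096 * 2, MADV_REMOVE) < 0)
		perror("madvise(MADV_REMOVE)");

	printf("byte after hole punch: %#x\n", p[4096]);	/* reads back 0 */
	munmap(p, len);
	close(fd);
	unlink("/dev/shm/madv_remove_demo");
	return 0;
}
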
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 620a90641ea8..74f595d80579 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -76,11 +76,6 @@ static inline int pfn_to_nid(unsigned long pfn)
  * Following are macros that each numa implmentation must define.
  */
 
-/*
- * Given a kernel address, find the home node of the underlying memory.
- */
-#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
-
 #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid) \
 ({ \
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h
index eb7f2b4234aa..424661d25bd3 100644
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -52,8 +52,10 @@ struct mod_arch_specific
 #define MODULE_PROC_FAMILY "CYRIXIII "
 #elif defined CONFIG_MVIAC3_2
 #define MODULE_PROC_FAMILY "VIAC3-2 "
-#elif CONFIG_MGEODEGX1
+#elif defined CONFIG_MGEODEGX1
 #define MODULE_PROC_FAMILY "GEODEGX1 "
+#elif defined CONFIG_MGEODE_LX
+#define MODULE_PROC_FAMILY "GEODE "
 #else
 #error unknown processor family
 #endif
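
Kernel CONFIG_* macros are defined to 1 when a feature is enabled and left undefined otherwise, so the bare "#elif CONFIG_MGEODEGX1" only happened to work because an undefined identifier evaluates to 0 inside #if/#elif; "#elif defined ..." states the actual intent and keeps -Wundef quiet. A toy illustration (CONFIG_DEMO_OPTION is an invented name):

#include <stdio.h>

#if defined CONFIG_DEMO_OPTION
#define FAMILY "demo "
#else
#define FAMILY "fallback "
#endif

int main(void)
{
	/* Build with -DCONFIG_DEMO_OPTION=1 to flip the branch. */
	printf("MODULE_PROC_FAMILY would be \"%s\"\n", FAMILY);
	return 0;
}
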
diff --git a/include/asm-i386/mpspec_def.h b/include/asm-i386/mpspec_def.h
index a961093dbf88..76feedf85a8a 100644
--- a/include/asm-i386/mpspec_def.h
+++ b/include/asm-i386/mpspec_def.h
@@ -75,7 +75,7 @@ struct mpc_config_bus
 {
 	unsigned char mpc_type;
 	unsigned char mpc_busid;
-	unsigned char mpc_bustype[6] __attribute((packed));
+	unsigned char mpc_bustype[6];
 };
 
 /* List of Bus Type string values, Intel MP Spec. */
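
Dropping __attribute((packed)) from mpc_bustype is safe: a struct whose members are all chars has no internal padding for packing to remove, and some gcc versions warn that an attribute in that position is ignored. A quick check one can compile (the struct names here are invented):

#include <stdio.h>

struct bus_plain {
	unsigned char mpc_type;
	unsigned char mpc_busid;
	unsigned char mpc_bustype[6];
};

struct bus_packed {
	unsigned char mpc_type;
	unsigned char mpc_busid;
	unsigned char mpc_bustype[6];
} __attribute__((packed));

int main(void)
{
	printf("plain: %zu bytes, packed: %zu bytes\n",
	       sizeof(struct bus_plain), sizeof(struct bus_packed));
	return 0;	/* both print 8: the layouts are identical */
}
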
diff --git a/include/asm-i386/segment.h b/include/asm-i386/segment.h
index bb5ff5b2c02e..faf995307b9e 100644
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -91,6 +91,20 @@
 #define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
 #define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
 
+/* The PnP BIOS entries in the GDT */
+#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
+#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
+#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
+#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
+#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
+
+/* The PnP BIOS selectors */
+#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8)	/* segment for calling fn */
+#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8)	/* code segment for BIOS */
+#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8)	/* data segment for BIOS */
+#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8)	/* transfer data segment */
+#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8)	/* another data segment */
+
 /*
  * The interrupt descriptor table has room for 256 idt's,
  * the global descriptor table is dependent on the number
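
The new PNP_* selectors follow the usual x86 rule that a selector is the GDT index shifted left by 3, with the table indicator and RPL bits left at 0 — hence the "* 8". A tiny sketch of the arithmetic; the value 18 for GDT_ENTRY_PNPBIOS_BASE is an assumption made purely for illustration, the real value comes from earlier in this header:

#include <stdio.h>

#define GDT_ENTRY_PNPBIOS_BASE	18	/* assumed value, for illustration */
#define GDT_ENTRY_PNPBIOS_CS32	(GDT_ENTRY_PNPBIOS_BASE + 0)
#define PNP_CS32		(GDT_ENTRY_PNPBIOS_CS32 * 8)

int main(void)
{
	printf("PNP_CS32 selector = %#x (GDT index %d, TI=0, RPL=0)\n",
	       PNP_CS32, PNP_CS32 >> 3);
	return 0;
}
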
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 772f85da1206..9c0593b7a94e 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -54,23 +54,7 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" \
 	); } while(0)
 
 #define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) )
-#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1)>>12 )
-
-static inline unsigned long _get_base(char * addr)
-{
-	unsigned long __base;
-	__asm__("movb %3,%%dh\n\t"
-		"movb %2,%%dl\n\t"
-		"shll $16,%%edx\n\t"
-		"movw %1,%%dx"
-		:"=&d" (__base)
-		:"m" (*((addr)+2)),
-		 "m" (*((addr)+4)),
-		 "m" (*((addr)+7)));
-	return __base;
-}
-
-#define get_base(ldt) _get_base( ((char *)&(ldt)) )
+#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1) )
 
 /*
  * Load a segment. Fall back on loading the zero
@@ -140,6 +124,19 @@ static inline unsigned long _get_base(char * addr)
 		:"=r" (__dummy)); \
 	__dummy; \
 })
+
+#define read_cr4_safe() ({ \
+	unsigned int __dummy; \
+	/* This could fault if %cr4 does not exist */ \
+	__asm__("1: movl %%cr4, %0 \n" \
+		"2: \n" \
+		".section __ex_table,\"a\" \n" \
+		".long 1b,2b \n" \
+		".previous \n" \
+		: "=r" (__dummy): "0" (0)); \
+	__dummy; \
+})
+
 #define write_cr4(x) \
 	__asm__ __volatile__("movl %0,%%cr4": :"r" (x));
 #define stts() write_cr0(8 | read_cr0())
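
read_cr4_safe() leans on the kernel's __ex_table fixup: if the mov from %cr4 faults (386/486-class CPUs have no CR4), execution resumes at label 2 and the preloaded 0 is returned. Userspace cannot read %cr4, but the probe-and-recover shape can be sketched with a signal handler and sigsetjmp; x86 and a SIGILL-raising ud2 are assumed:

#include <stdio.h>
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf fixup;

static void on_sigill(int sig)
{
	(void)sig;
	siglongjmp(fixup, 1);	/* the "exception table entry" of this sketch */
}

int main(void)
{
	unsigned int value = 0;	/* like the "0" (0) input of read_cr4_safe */

	signal(SIGILL, on_sigill);
	if (sigsetjmp(fixup, 1) == 0)
		__asm__ volatile ("ud2");	/* guaranteed-invalid opcode */
	printf("survived the fault, value = %u\n", value);
	return 0;
}
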
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 0f92e78dfea1..fe38b9a96233 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -256,7 +256,7 @@
 #define __NR_io_submit		248
 #define __NR_io_cancel		249
 #define __NR_fadvise64		250
-#define __NR_set_zone_reclaim	251
+/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
 #define __NR_exit_group		252
 #define __NR_lookup_dcookie	253
 #define __NR_epoll_create	254
diff --git a/include/asm-ia64/atomic.h b/include/asm-ia64/atomic.h
index 2fbebf85c31d..15cf7984c48e 100644
--- a/include/asm-ia64/atomic.h
+++ b/include/asm-ia64/atomic.h
@@ -192,4 +192,5 @@ atomic64_add_negative (__s64 i, atomic64_t *v)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_IA64_ATOMIC_H */
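
The <asm-generic/atomic.h> include being added across architectures in this merge supplies the atomic_long_t wrappers, which pick atomic_t or atomic64_t to match BITS_PER_LONG. A rough userspace model of that facade using C11 atomics; the kernel-style names are kept, but the plumbing here is illustrative only:

#include <stdio.h>
#include <stdatomic.h>

typedef struct { atomic_long counter; } atomic_long_t;

#define ATOMIC_LONG_INIT(i)	{ (i) }

static long atomic_long_read(atomic_long_t *l)
{
	return atomic_load(&l->counter);
}

static void atomic_long_inc(atomic_long_t *l)
{
	atomic_fetch_add(&l->counter, 1);
}

int main(void)
{
	atomic_long_t nr_events = ATOMIC_LONG_INIT(0);

	atomic_long_inc(&nr_events);
	printf("nr_events = %ld\n", atomic_long_read(&nr_events));
	return 0;
}
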
diff --git a/include/asm-ia64/mman.h b/include/asm-ia64/mman.h
index 1c0a73af1461..828beb24a20e 100644
--- a/include/asm-ia64/mman.h
+++ b/include/asm-ia64/mman.h
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 6d96a67439be..2bf543493cb8 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -265,7 +265,7 @@
 #define __NR_keyctl			1273
 #define __NR_ioprio_set			1274
 #define __NR_ioprio_get			1275
-#define __NR_set_zone_reclaim		1276
+/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
diff --git a/include/asm-m32r/assembler.h b/include/asm-m32r/assembler.h
index e1dff9d6baad..b7f4d8aaeb46 100644
--- a/include/asm-m32r/assembler.h
+++ b/include/asm-m32r/assembler.h
@@ -52,7 +52,7 @@
 	or3 \reg, \reg, #low(\x)
 	.endm
 
-#if !defined(CONFIG_CHIP_M32102)
+#if !(defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_M32104))
 #define STI(reg) STI_M reg
 	.macro STI_M reg
 	setpsw #0x40 -> nop
@@ -64,7 +64,7 @@
 	clrpsw #0x40 -> nop
 	; WORKAROUND: "-> nop" is a workaround for the M32700(TS1).
 	.endm
-#else /* CONFIG_CHIP_M32102 */
+#else /* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 #define STI(reg) STI_M reg
 	.macro STI_M reg
 	mvfc \reg, psw
@@ -191,12 +191,12 @@
 	and \reg, sp
 	.endm
 
-#if !defined(CONFIG_CHIP_M32102)
+#if !(defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_M32104))
 	.macro SWITCH_TO_KERNEL_STACK
 	; switch to kernel stack (spi)
 	clrpsw #0x80 -> nop
 	.endm
-#else /* CONFIG_CHIP_M32102 */
+#else /* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 	.macro SWITCH_TO_KERNEL_STACK
 	push r0  ; save r0 for working
 	mvfc r0, psw
@@ -218,7 +218,7 @@
 	.fillinsn
 2:
 	.endm
-#endif /* CONFIG_CHIP_M32102 */
+#endif /* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/include/asm-m32r/atomic.h b/include/asm-m32r/atomic.h
index ef1fb8ea4726..70761278b6cb 100644
--- a/include/asm-m32r/atomic.h
+++ b/include/asm-m32r/atomic.h
@@ -313,4 +313,5 @@ static __inline__ void atomic_set_mask(unsigned long mask, atomic_t *addr)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_M32R_ATOMIC_H */
diff --git a/include/asm-m32r/cacheflush.h b/include/asm-m32r/cacheflush.h
index 46fc4c325108..e57427b6e249 100644
--- a/include/asm-m32r/cacheflush.h
+++ b/include/asm-m32r/cacheflush.h
@@ -7,7 +7,7 @@
 extern void _flush_cache_all(void);
 extern void _flush_cache_copyback_all(void);
 
-#if defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_OPSP)
+#if defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 #define flush_cache_all()			do { } while (0)
 #define flush_cache_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
diff --git a/include/asm-m32r/irq.h b/include/asm-m32r/irq.h
index 8ed77968ecb4..ca943954572a 100644
--- a/include/asm-m32r/irq.h
+++ b/include/asm-m32r/irq.h
@@ -65,6 +65,22 @@
 #define NR_IRQS \
 	(OPSPUT_NUM_CPU_IRQ + OPSPUT_NUM_PLD_IRQ \
 	+ OPSPUT_NUM_LCD_PLD_IRQ + OPSPUT_NUM_LAN_PLD_IRQ)
+
+#elif defined(CONFIG_PLAT_M32104UT)
+/*
+ * IRQ definitions for M32104UT
+ *  M32104 Chip: 64 interrupts
+ *  ICU of M32104UT-on-board PLD: 32 interrupts cascaded to INT1# chip pin
+ */
+#define M32104UT_NUM_CPU_IRQ (64)
+#define M32104UT_NUM_PLD_IRQ (32)
+#define M32104UT_IRQ_BASE 0
+#define M32104UT_CPU_IRQ_BASE M32104UT_IRQ_BASE
+#define M32104UT_PLD_IRQ_BASE (M32104UT_CPU_IRQ_BASE + M32104UT_NUM_CPU_IRQ)
+
+#define NR_IRQS \
+	(M32104UT_NUM_CPU_IRQ + M32104UT_NUM_PLD_IRQ)
+
 #else
 #define NR_IRQS 64
 #endif
diff --git a/include/asm-m32r/m32102.h b/include/asm-m32r/m32102.h
index cb98101f4f6e..a1f0d1fe9eb8 100644
--- a/include/asm-m32r/m32102.h
+++ b/include/asm-m32r/m32102.h
@@ -11,7 +11,11 @@
 /*======================================================================*
  * Special Function Register
  *======================================================================*/
+#if !defined(CONFIG_CHIP_M32104)
 #define M32R_SFR_OFFSET  (0x00E00000)  /* 0x00E00000-0x00EFFFFF 1[MB] */
+#else
+#define M32R_SFR_OFFSET  (0x00700000)  /* 0x00700000-0x007FFFFF 1[MB] */
+#endif
 
 /*
  * Clock and Power Management registers.
@@ -100,7 +104,7 @@
 #define M32R_MFT5RLD_PORTL (0x0C+M32R_MFT5_OFFSET) /* MFT4 reload */
 #define M32R_MFT5CMPRLD_PORTL (0x10+M32R_MFT5_OFFSET) /* MFT4 compare reload */
 
-#ifdef CONFIG_CHIP_M32700
+#if defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_M32104)
 #define M32R_MFTCR_MFT0MSK (1UL<<31) /* b0 */
 #define M32R_MFTCR_MFT1MSK (1UL<<30) /* b1 */
 #define M32R_MFTCR_MFT2MSK (1UL<<29) /* b2 */
@@ -113,7 +117,7 @@
 #define M32R_MFTCR_MFT3EN (1UL<<20) /* b11 */
 #define M32R_MFTCR_MFT4EN (1UL<<19) /* b12 */
 #define M32R_MFTCR_MFT5EN (1UL<<18) /* b13 */
-#else /* not CONFIG_CHIP_M32700 */
+#else /* not CONFIG_CHIP_M32700 && not CONFIG_CHIP_M32104 */
 #define M32R_MFTCR_MFT0MSK (1UL<<15) /* b16 */
 #define M32R_MFTCR_MFT1MSK (1UL<<14) /* b17 */
 #define M32R_MFTCR_MFT2MSK (1UL<<13) /* b18 */
@@ -126,7 +130,7 @@
 #define M32R_MFTCR_MFT3EN (1UL<<4) /* b27 */
 #define M32R_MFTCR_MFT4EN (1UL<<3) /* b28 */
 #define M32R_MFTCR_MFT5EN (1UL<<2) /* b29 */
-#endif /* not CONFIG_CHIP_M32700 */
+#endif /* not CONFIG_CHIP_M32700 && not CONFIG_CHIP_M32104 */
 
 #define M32R_MFTMOD_CC_MASK (1UL<<15) /* b16 */
 #define M32R_MFTMOD_TCCR (1UL<<13) /* b18 */
@@ -241,8 +245,24 @@
 #define M32R_IRQ_MFT1 (17) /* MFT1 */
 #define M32R_IRQ_MFT2 (18) /* MFT2 */
 #define M32R_IRQ_MFT3 (19) /* MFT3 */
-#define M32R_IRQ_MFT4 (20) /* MFT4 */
-#define M32R_IRQ_MFT5 (21) /* MFT5 */
+#ifdef CONFIG_CHIP_M32104
+#define M32R_IRQ_MFTX0 (24) /* MFTX0 */
+#define M32R_IRQ_MFTX1 (25) /* MFTX1 */
+#define M32R_IRQ_DMA0 (32) /* DMA0 */
+#define M32R_IRQ_DMA1 (33) /* DMA1 */
+#define M32R_IRQ_DMA2 (34) /* DMA2 */
+#define M32R_IRQ_DMA3 (35) /* DMA3 */
+#define M32R_IRQ_SIO0_R (40) /* SIO0 send */
+#define M32R_IRQ_SIO0_S (41) /* SIO0 receive */
+#define M32R_IRQ_SIO1_R (42) /* SIO1 send */
+#define M32R_IRQ_SIO1_S (43) /* SIO1 receive */
+#define M32R_IRQ_SIO2_R (44) /* SIO2 send */
+#define M32R_IRQ_SIO2_S (45) /* SIO2 receive */
+#define M32R_IRQ_SIO3_R (46) /* SIO3 send */
+#define M32R_IRQ_SIO3_S (47) /* SIO3 receive */
+#define M32R_IRQ_ADC (56) /* ADC */
+#define M32R_IRQ_PC (57) /* PC */
+#else /* ! M32104 */
 #define M32R_IRQ_DMA0 (32) /* DMA0 */
 #define M32R_IRQ_DMA1 (33) /* DMA1 */
 #define M32R_IRQ_SIO0_R (48) /* SIO0 send */
@@ -255,6 +275,7 @@
 #define M32R_IRQ_SIO3_S (55) /* SIO3 receive */
 #define M32R_IRQ_SIO4_R (56) /* SIO4 send */
 #define M32R_IRQ_SIO4_S (57) /* SIO4 receive */
+#endif /* ! M32104 */
 
 #ifdef CONFIG_SMP
 #define M32R_IRQ_IPI0 (56)
@@ -281,15 +302,12 @@
 #define M32R_FPGA_VERSION0_PORTL (0x30+M32R_FPGA_TOP)
 #define M32R_FPGA_VERSION1_PORTL (0x34+M32R_FPGA_TOP)
 
+#endif /* CONFIG_SMP */
+
 #ifndef __ASSEMBLY__
-/* For NETDEV WATCHDOG */
 typedef struct {
 	unsigned long icucr; /* ICU Control Register */
 } icu_data_t;
-
-extern icu_data_t icu_data[];
 #endif
 
-#endif /* CONFIG_SMP */
-
 #endif /* _M32102_H_ */
diff --git a/include/asm-m32r/m32104ut/m32104ut_pld.h b/include/asm-m32r/m32104ut/m32104ut_pld.h
new file mode 100644
index 000000000000..a4eac20553df
--- /dev/null
+++ b/include/asm-m32r/m32104ut/m32104ut_pld.h
@@ -0,0 +1,163 @@
+/*
+ * include/asm/m32104ut/m32104ut_pld.h
+ *
+ * Definitions for Programable Logic Device(PLD) on M32104UT board.
+ * Based on m32700ut_pld.h
+ *
+ * Copyright (c) 2002 Takeo Takahashi
+ * Copyright (c) 2005 Naoto Sugai
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file "COPYING" in the main directory of
+ * this archive for more details.
+ */
+
+#ifndef _M32104UT_M32104UT_PLD_H
+#define _M32104UT_M32104UT_PLD_H
+
+#include <linux/config.h>
+
+#if defined(CONFIG_PLAT_M32104UT)
+#define PLD_PLAT_BASE 0x02c00000
+#else
+#error "no platform configuration"
+#endif
+
+#ifndef __ASSEMBLY__
+/*
+ * C functions use non-cache address.
+ */
+#define PLD_BASE (PLD_PLAT_BASE /* + NONCACHE_OFFSET */)
+#define __reg8 (volatile unsigned char *)
+#define __reg16 (volatile unsigned short *)
+#define __reg32 (volatile unsigned int *)
+#else
+#define PLD_BASE (PLD_PLAT_BASE + NONCACHE_OFFSET)
+#define __reg8
+#define __reg16
+#define __reg32
+#endif /* __ASSEMBLY__ */
+
+/* CFC */
+#define PLD_CFRSTCR __reg16(PLD_BASE + 0x0000)
+#define PLD_CFSTS __reg16(PLD_BASE + 0x0002)
+#define PLD_CFIMASK __reg16(PLD_BASE + 0x0004)
+#define PLD_CFBUFCR __reg16(PLD_BASE + 0x0006)
+
+/* MMC */
+#define PLD_MMCCR __reg16(PLD_BASE + 0x4000)
+#define PLD_MMCMOD __reg16(PLD_BASE + 0x4002)
+#define PLD_MMCSTS __reg16(PLD_BASE + 0x4006)
+#define PLD_MMCBAUR __reg16(PLD_BASE + 0x400a)
+#define PLD_MMCCMDBCUT __reg16(PLD_BASE + 0x400c)
+#define PLD_MMCCDTBCUT __reg16(PLD_BASE + 0x400e)
+#define PLD_MMCDET __reg16(PLD_BASE + 0x4010)
+#define PLD_MMCWP __reg16(PLD_BASE + 0x4012)
+#define PLD_MMCWDATA __reg16(PLD_BASE + 0x5000)
+#define PLD_MMCRDATA __reg16(PLD_BASE + 0x6000)
+#define PLD_MMCCMDDATA __reg16(PLD_BASE + 0x7000)
+#define PLD_MMCRSPDATA __reg16(PLD_BASE + 0x7006)
+
+/* ICU
+ *  ICUISTS: status register
+ *  ICUIREQ0: request register
+ *  ICUIREQ1: request register
+ *  ICUCR3: control register for CFIREQ# interrupt
+ *  ICUCR4: control register for CFC Card insert interrupt
+ *  ICUCR5: control register for CFC Card eject interrupt
+ *  ICUCR6: control register for external interrupt
+ *  ICUCR11: control register for MMC Card insert/eject interrupt
+ *  ICUCR13: control register for SC error interrupt
+ *  ICUCR14: control register for SC receive interrupt
+ *  ICUCR15: control register for SC send interrupt
+ */
+
+#define PLD_IRQ_INT0 (M32104UT_PLD_IRQ_BASE + 0) /* None */
+#define PLD_IRQ_CFIREQ (M32104UT_PLD_IRQ_BASE + 3) /* CF IREQ */
+#define PLD_IRQ_CFC_INSERT (M32104UT_PLD_IRQ_BASE + 4) /* CF Insert */
+#define PLD_IRQ_CFC_EJECT (M32104UT_PLD_IRQ_BASE + 5) /* CF Eject */
+#define PLD_IRQ_EXINT (M32104UT_PLD_IRQ_BASE + 6) /* EXINT */
+#define PLD_IRQ_MMCCARD (M32104UT_PLD_IRQ_BASE + 11) /* MMC Insert/Eject */
+#define PLD_IRQ_SC_ERROR (M32104UT_PLD_IRQ_BASE + 13) /* SC error */
+#define PLD_IRQ_SC_RCV (M32104UT_PLD_IRQ_BASE + 14) /* SC receive */
+#define PLD_IRQ_SC_SND (M32104UT_PLD_IRQ_BASE + 15) /* SC send */
+
+#define PLD_ICUISTS __reg16(PLD_BASE + 0x8002)
+#define PLD_ICUISTS_VECB_MASK (0xf000)
+#define PLD_ICUISTS_VECB(x) ((x) & PLD_ICUISTS_VECB_MASK)
+#define PLD_ICUISTS_ISN_MASK (0x07c0)
+#define PLD_ICUISTS_ISN(x) ((x) & PLD_ICUISTS_ISN_MASK)
+#define PLD_ICUCR3 __reg16(PLD_BASE + 0x8104)
+#define PLD_ICUCR4 __reg16(PLD_BASE + 0x8106)
+#define PLD_ICUCR5 __reg16(PLD_BASE + 0x8108)
+#define PLD_ICUCR6 __reg16(PLD_BASE + 0x810a)
+#define PLD_ICUCR11 __reg16(PLD_BASE + 0x8114)
+#define PLD_ICUCR13 __reg16(PLD_BASE + 0x8118)
+#define PLD_ICUCR14 __reg16(PLD_BASE + 0x811a)
+#define PLD_ICUCR15 __reg16(PLD_BASE + 0x811c)
+#define PLD_ICUCR_IEN (0x1000)
+#define PLD_ICUCR_IREQ (0x0100)
+#define PLD_ICUCR_ISMOD00 (0x0000) /* Low edge */
+#define PLD_ICUCR_ISMOD01 (0x0010) /* Low level */
+#define PLD_ICUCR_ISMOD02 (0x0020) /* High edge */
+#define PLD_ICUCR_ISMOD03 (0x0030) /* High level */
+#define PLD_ICUCR_ILEVEL0 (0x0000)
+#define PLD_ICUCR_ILEVEL1 (0x0001)
+#define PLD_ICUCR_ILEVEL2 (0x0002)
+#define PLD_ICUCR_ILEVEL3 (0x0003)
+#define PLD_ICUCR_ILEVEL4 (0x0004)
+#define PLD_ICUCR_ILEVEL5 (0x0005)
+#define PLD_ICUCR_ILEVEL6 (0x0006)
+#define PLD_ICUCR_ILEVEL7 (0x0007)
+
+/* Power Control of MMC and CF */
+#define PLD_CPCR __reg16(PLD_BASE + 0x14000)
+#define PLD_CPCR_CDP 0x0001
+
+/* LED Control
+ *
+ *  1: DIP swich side
+ *  2: Reset switch side
+ */
+#define PLD_IOLEDCR __reg16(PLD_BASE + 0x14002)
+#define PLD_IOLED_1_ON 0x001
+#define PLD_IOLED_1_OFF 0x000
+#define PLD_IOLED_2_ON 0x002
+#define PLD_IOLED_2_OFF 0x000
+
+/* DIP Switch
+ *  0: Write-protect of Flash Memory (0:protected, 1:non-protected)
+ *  1: -
+ *  2: -
+ *  3: -
+ */
+#define PLD_IOSWSTS __reg16(PLD_BASE + 0x14004)
+#define PLD_IOSWSTS_IOSW2 0x0200
+#define PLD_IOSWSTS_IOSW1 0x0100
+#define PLD_IOSWSTS_IOWP0 0x0001
+
+/* CRC */
+#define PLD_CRC7DATA __reg16(PLD_BASE + 0x18000)
+#define PLD_CRC7INDATA __reg16(PLD_BASE + 0x18002)
+#define PLD_CRC16DATA __reg16(PLD_BASE + 0x18004)
+#define PLD_CRC16INDATA __reg16(PLD_BASE + 0x18006)
+#define PLD_CRC16ADATA __reg16(PLD_BASE + 0x18008)
+#define PLD_CRC16AINDATA __reg16(PLD_BASE + 0x1800a)
+
+/* RTC */
+#define PLD_RTCCR __reg16(PLD_BASE + 0x1c000)
+#define PLD_RTCBAUR __reg16(PLD_BASE + 0x1c002)
+#define PLD_RTCWRDATA __reg16(PLD_BASE + 0x1c004)
+#define PLD_RTCRDDATA __reg16(PLD_BASE + 0x1c006)
+#define PLD_RTCRSTODT __reg16(PLD_BASE + 0x1c008)
+
+/* SIM Card */
+#define PLD_SCCR __reg16(PLD_BASE + 0x38000)
+#define PLD_SCMOD __reg16(PLD_BASE + 0x38004)
+#define PLD_SCSTS __reg16(PLD_BASE + 0x38006)
+#define PLD_SCINTCR __reg16(PLD_BASE + 0x38008)
+#define PLD_SCBAUR __reg16(PLD_BASE + 0x3800a)
+#define PLD_SCTXB __reg16(PLD_BASE + 0x3800c)
+#define PLD_SCRXB __reg16(PLD_BASE + 0x3800e)
+
+#endif /* _M32104UT_M32104UT_PLD_H */
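
Everything in this new header funnels through the __reg16() cast, which turns a fixed bus address into a volatile pointer so each access really reaches the PLD rather than a cached or optimized-away copy. The same pattern, modelled self-containedly with an ordinary buffer standing in for the mapped window (the register offset and mask are the real ones from the header above):

#include <stdio.h>
#include <stdint.h>

static uint16_t fake_pld[0x10000];	/* stand-in for the mapped PLD window */

#define PLD_BASE	((uintptr_t)fake_pld)
#define __reg16		(volatile uint16_t *)
#define PLD_ICUISTS	__reg16(PLD_BASE + 0x8002)
#define PLD_ICUISTS_ISN_MASK	(0x07c0)

int main(void)
{
	*PLD_ICUISTS = 0x1234;		/* the "device" latches a value */
	uint16_t isn = *PLD_ICUISTS & PLD_ICUISTS_ISN_MASK;
	printf("interrupt source number bits: %#06x\n", isn);	/* 0x0200 */
	return 0;
}
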
diff --git a/include/asm-m32r/m32r.h b/include/asm-m32r/m32r.h
index ec142be00862..b133ca61acf1 100644
--- a/include/asm-m32r/m32r.h
+++ b/include/asm-m32r/m32r.h
@@ -14,7 +14,7 @@
 #include <asm/m32r_mp_fpga.h>
 #elif defined(CONFIG_CHIP_VDEC2) || defined(CONFIG_CHIP_XNUX2) \
 	|| defined(CONFIG_CHIP_M32700) || defined(CONFIG_CHIP_M32102) \
-	|| defined(CONFIG_CHIP_OPSP)
+	|| defined(CONFIG_CHIP_OPSP) || defined(CONFIG_CHIP_M32104)
 #include <asm/m32102.h>
 #endif
 
@@ -43,6 +43,10 @@
 #include <asm/m32700ut/m32700ut_pld.h>
 #endif
 
+#if defined(CONFIG_PLAT_M32104UT)
+#include <asm/m32104ut/m32104ut_pld.h>
+#endif /* CONFIG_PLAT_M32104 */
+
 /*
  * M32R Register
  */
@@ -122,7 +126,7 @@
 
 #include <asm/page.h>
 #ifdef CONFIG_MMU
-#define NONCACHE_OFFSET __PAGE_OFFSET+0x20000000
+#define NONCACHE_OFFSET (__PAGE_OFFSET + 0x20000000)
 #else
 #define NONCACHE_OFFSET __PAGE_OFFSET
 #endif /* CONFIG_MMU */
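
The parentheses added around NONCACHE_OFFSET are the classic macro-hygiene fix: an unparenthesized A+B body splices into whatever operators surround the macro's use. A compilable illustration with invented values:

#include <stdio.h>

#define PAGE_OFFSET	0x80000000u
#define NONCACHE_BAD	PAGE_OFFSET+0x20000000
#define NONCACHE_GOOD	(PAGE_OFFSET + 0x20000000)

int main(void)
{
	/* "2 * NONCACHE_BAD" expands to 2 * 0x80000000u + 0x20000000 */
	printf("bad:  %#x\n", 2 * NONCACHE_BAD);	/* 0x20000000 */
	printf("good: %#x\n", 2 * NONCACHE_GOOD);	/* 0x40000000 */
	return 0;
}
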
diff --git a/include/asm-m32r/mman.h b/include/asm-m32r/mman.h
index 011f6d9ec5cc..12e29747bc84 100644
--- a/include/asm-m32r/mman.h
+++ b/include/asm-m32r/mman.h
@@ -37,6 +37,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 5eee832b73a0..dcf619a0a0b0 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -69,12 +69,12 @@
 } while(0)
 
 /* Interrupt Control */
-#if !defined(CONFIG_CHIP_M32102)
+#if !defined(CONFIG_CHIP_M32102) && !defined(CONFIG_CHIP_M32104)
 #define local_irq_enable() \
 	__asm__ __volatile__ ("setpsw #0x40 -> nop": : :"memory")
 #define local_irq_disable() \
 	__asm__ __volatile__ ("clrpsw #0x40 -> nop": : :"memory")
-#else /* CONFIG_CHIP_M32102 */
+#else /* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 static inline void local_irq_enable(void)
 {
 	unsigned long tmpreg;
@@ -96,7 +96,7 @@ static inline void local_irq_disable(void)
96 "mvtc %0, psw \n\t" 96 "mvtc %0, psw \n\t"
97 : "=&r" (tmpreg0), "=&r" (tmpreg1) : : "cbit", "memory"); 97 : "=&r" (tmpreg0), "=&r" (tmpreg1) : : "cbit", "memory");
98} 98}
99#endif /* CONFIG_CHIP_M32102 */ 99#endif /* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
100 100
101#define local_save_flags(x) \ 101#define local_save_flags(x) \
102 __asm__ __volatile__("mvfc %0,psw" : "=r"(x) : /* no input */) 102 __asm__ __volatile__("mvfc %0,psw" : "=r"(x) : /* no input */)
@@ -105,13 +105,13 @@ static inline void local_irq_disable(void)
 	__asm__ __volatile__("mvtc %0,psw" : /* no outputs */ \
 		: "r" (x) : "cbit", "memory")
 
-#if !defined(CONFIG_CHIP_M32102)
+#if !(defined(CONFIG_CHIP_M32102) || defined(CONFIG_CHIP_M32104))
 #define local_irq_save(x) \
 	__asm__ __volatile__( \
 		"mvfc	%0, psw;	\n\t" \
 		"clrpsw	#0x40 -> nop;	\n\t" \
 		: "=r" (x) : /* no input */ : "memory")
-#else /* CONFIG_CHIP_M32102 */
+#else /* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 #define local_irq_save(x) \
 	({ \
 		unsigned long tmpreg; \
@@ -124,7 +124,7 @@ static inline void local_irq_disable(void)
 		: "=r" (x), "=&r" (tmpreg) \
 		: : "cbit", "memory"); \
 	})
-#endif /* CONFIG_CHIP_M32102 */
+#endif /* CONFIG_CHIP_M32102 || CONFIG_CHIP_M32104 */
 
 #define irqs_disabled() \
 	({ \
diff --git a/include/asm-m32r/unistd.h b/include/asm-m32r/unistd.h
index ac399e1f7bc0..39be87ca2a5a 100644
--- a/include/asm-m32r/unistd.h
+++ b/include/asm-m32r/unistd.h
@@ -319,7 +319,7 @@ type name(void) \
 register long __scno __asm__ ("r7") = __NR_##name; \
 register long __res __asm__("r0"); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno) \
 	: "memory"); \
@@ -332,7 +332,7 @@ type name(type1 arg1) \
 register long __scno __asm__ ("r7") = __NR_##name; \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res) \
 	: "memory"); \
@@ -346,7 +346,7 @@ register long __scno __asm__ ("r7") = __NR_##name; \
 register long __arg2 __asm__ ("r1") = (long)(arg2); \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res), "r" (__arg2) \
 	: "memory"); \
@@ -361,7 +361,7 @@ register long __arg3 __asm__ ("r2") = (long)(arg3); \
 register long __arg2 __asm__ ("r1") = (long)(arg2); \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res), "r" (__arg2), \
 		"r" (__arg3) \
@@ -378,7 +378,7 @@ register long __arg3 __asm__ ("r2") = (long)(arg3); \
 register long __arg2 __asm__ ("r1") = (long)(arg2); \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res), "r" (__arg2), \
 		"r" (__arg3), "r" (__arg4) \
@@ -397,7 +397,7 @@ register long __arg3 __asm__ ("r2") = (long)(arg3); \
 register long __arg2 __asm__ ("r1") = (long)(arg2); \
 register long __res __asm__ ("r0") = (long)(arg1); \
 __asm__ __volatile__ (\
-	"trap #" SYSCALL_VECTOR \
+	"trap #" SYSCALL_VECTOR "|| nop"\
 	: "=r" (__res) \
 	: "r" (__scno), "0" (__res), "r" (__arg2), \
 		"r" (__arg3), "r" (__arg4), "r" (__arg5) \
diff --git a/include/asm-m68k/atomic.h b/include/asm-m68k/atomic.h
index e3c962eeabf3..b8a4e75d679d 100644
--- a/include/asm-m68k/atomic.h
+++ b/include/asm-m68k/atomic.h
@@ -157,4 +157,5 @@ static inline void atomic_set_mask(unsigned long mask, unsigned long *v)
157#define smp_mb__before_atomic_inc() barrier() 157#define smp_mb__before_atomic_inc() barrier()
158#define smp_mb__after_atomic_inc() barrier() 158#define smp_mb__after_atomic_inc() barrier()
159 159
160#include <asm-generic/atomic.h>
160#endif /* __ARCH_M68K_ATOMIC __ */ 161#endif /* __ARCH_M68K_ATOMIC __ */
diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h
index 1f569905cb74..127ad190cf2d 100644
--- a/include/asm-m68k/irq.h
+++ b/include/asm-m68k/irq.h
@@ -70,8 +70,6 @@ static __inline__ int irq_canonicalize(int irq)
70 70
71extern void (*enable_irq)(unsigned int); 71extern void (*enable_irq)(unsigned int);
72extern void (*disable_irq)(unsigned int); 72extern void (*disable_irq)(unsigned int);
73
74#define disable_irq_nosync disable_irq
75#define enable_irq_nosync enable_irq 73#define enable_irq_nosync enable_irq
76 74
77struct pt_regs; 75struct pt_regs;
diff --git a/include/asm-m68k/mman.h b/include/asm-m68k/mman.h
index f831c4eeae6e..ea262ab88b3b 100644
--- a/include/asm-m68k/mman.h
+++ b/include/asm-m68k/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-m68knommu/atomic.h b/include/asm-m68knommu/atomic.h
index 3c1cc153c415..1702dbe9318c 100644
--- a/include/asm-m68knommu/atomic.h
+++ b/include/asm-m68knommu/atomic.h
@@ -143,4 +143,5 @@ static inline int atomic_sub_return(int i, atomic_t * v)
 #define atomic_dec_return(v) atomic_sub_return(1,(v))
 #define atomic_inc_return(v) atomic_add_return(1,(v))
 
+#include <asm-generic/atomic.h>
 #endif /* __ARCH_M68KNOMMU_ATOMIC __ */
diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h
index a08fa9b958da..20c48ec858a4 100644
--- a/include/asm-m68knommu/irq.h
+++ b/include/asm-m68knommu/irq.h
@@ -84,10 +84,8 @@ extern void (*mach_disable_irq)(unsigned int);
 /*
  * Some drivers want these entry points
  */
-#define enable_irq(x)	(mach_enable_irq ? (*mach_enable_irq)(x) : 0)
-#define disable_irq(x)	(mach_disable_irq ? (*mach_disable_irq)(x) : 0)
-
-#define enable_irq_nosync(x)	enable_irq(x)
+#define enable_irq(x)	0
+#define disable_irq(x)	do { } while (0)
 #define disable_irq_nosync(x)	disable_irq(x)
 
 struct irqaction;
diff --git a/include/asm-mips/atomic.h b/include/asm-mips/atomic.h
index 55c37c106ef0..92256e43a938 100644
--- a/include/asm-mips/atomic.h
+++ b/include/asm-mips/atomic.h
@@ -713,4 +713,5 @@ static __inline__ long atomic64_sub_if_positive(long i, atomic64_t * v)
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* _ASM_ATOMIC_H */
diff --git a/include/asm-mips/mman.h b/include/asm-mips/mman.h
index 62060957ba93..dd17c8bd62a1 100644
--- a/include/asm-mips/mman.h
+++ b/include/asm-mips/mman.h
@@ -65,6 +65,7 @@
 #define MADV_SEQUENTIAL	0x2		/* read-ahead aggressively */
 #define MADV_WILLNEED	0x3		/* pre-fault pages */
 #define MADV_DONTNEED	0x4		/* discard these pages */
+#define MADV_REMOVE	0x5		/* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON	MAP_ANONYMOUS
diff --git a/include/asm-mips/riscos-syscall.h b/include/asm-mips/riscos-syscall.h
deleted file mode 100644
index 4d8eb15461eb..000000000000
--- a/include/asm-mips/riscos-syscall.h
+++ /dev/null
@@ -1,979 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 1995, 96, 97, 98, 99, 2000 by Ralf Baechle
- */
-#ifndef _ASM_RISCOS_SYSCALL_H
-#define _ASM_RISCOS_SYSCALL_H
-
-/*
- * The syscalls 0 - 3999 are reserved for a down to the root syscall
- * compatibility with RISC/os and IRIX. We'll see how to deal with the
- * various "real" BSD variants like Ultrix, NetBSD ...
- */
-
-/*
- * SVR4 syscalls are in the range from 1 to 999
- */
-#define __NR_SVR4 0
-#define __NR_SVR4_syscall (__NR_SVR4 + 0)
-#define __NR_SVR4_exit (__NR_SVR4 + 1)
-#define __NR_SVR4_fork (__NR_SVR4 + 2)
-#define __NR_SVR4_read (__NR_SVR4 + 3)
-#define __NR_SVR4_write (__NR_SVR4 + 4)
-#define __NR_SVR4_open (__NR_SVR4 + 5)
-#define __NR_SVR4_close (__NR_SVR4 + 6)
-#define __NR_SVR4_wait (__NR_SVR4 + 7)
-#define __NR_SVR4_creat (__NR_SVR4 + 8)
-#define __NR_SVR4_link (__NR_SVR4 + 9)
-#define __NR_SVR4_unlink (__NR_SVR4 + 10)
-#define __NR_SVR4_exec (__NR_SVR4 + 11)
-#define __NR_SVR4_chdir (__NR_SVR4 + 12)
-#define __NR_SVR4_gtime (__NR_SVR4 + 13)
-#define __NR_SVR4_mknod (__NR_SVR4 + 14)
-#define __NR_SVR4_chmod (__NR_SVR4 + 15)
-#define __NR_SVR4_chown (__NR_SVR4 + 16)
-#define __NR_SVR4_sbreak (__NR_SVR4 + 17)
-#define __NR_SVR4_stat (__NR_SVR4 + 18)
-#define __NR_SVR4_lseek (__NR_SVR4 + 19)
-#define __NR_SVR4_getpid (__NR_SVR4 + 20)
-#define __NR_SVR4_mount (__NR_SVR4 + 21)
-#define __NR_SVR4_umount (__NR_SVR4 + 22)
-#define __NR_SVR4_setuid (__NR_SVR4 + 23)
-#define __NR_SVR4_getuid (__NR_SVR4 + 24)
-#define __NR_SVR4_stime (__NR_SVR4 + 25)
-#define __NR_SVR4_ptrace (__NR_SVR4 + 26)
-#define __NR_SVR4_alarm (__NR_SVR4 + 27)
-#define __NR_SVR4_fstat (__NR_SVR4 + 28)
-#define __NR_SVR4_pause (__NR_SVR4 + 29)
-#define __NR_SVR4_utime (__NR_SVR4 + 30)
-#define __NR_SVR4_stty (__NR_SVR4 + 31)
-#define __NR_SVR4_gtty (__NR_SVR4 + 32)
-#define __NR_SVR4_access (__NR_SVR4 + 33)
-#define __NR_SVR4_nice (__NR_SVR4 + 34)
-#define __NR_SVR4_statfs (__NR_SVR4 + 35)
-#define __NR_SVR4_sync (__NR_SVR4 + 36)
-#define __NR_SVR4_kill (__NR_SVR4 + 37)
-#define __NR_SVR4_fstatfs (__NR_SVR4 + 38)
-#define __NR_SVR4_setpgrp (__NR_SVR4 + 39)
-#define __NR_SVR4_cxenix (__NR_SVR4 + 40)
-#define __NR_SVR4_dup (__NR_SVR4 + 41)
-#define __NR_SVR4_pipe (__NR_SVR4 + 42)
-#define __NR_SVR4_times (__NR_SVR4 + 43)
-#define __NR_SVR4_profil (__NR_SVR4 + 44)
-#define __NR_SVR4_plock (__NR_SVR4 + 45)
-#define __NR_SVR4_setgid (__NR_SVR4 + 46)
-#define __NR_SVR4_getgid (__NR_SVR4 + 47)
-#define __NR_SVR4_sig (__NR_SVR4 + 48)
-#define __NR_SVR4_msgsys (__NR_SVR4 + 49)
-#define __NR_SVR4_sysmips (__NR_SVR4 + 50)
-#define __NR_SVR4_sysacct (__NR_SVR4 + 51)
-#define __NR_SVR4_shmsys (__NR_SVR4 + 52)
-#define __NR_SVR4_semsys (__NR_SVR4 + 53)
-#define __NR_SVR4_ioctl (__NR_SVR4 + 54)
-#define __NR_SVR4_uadmin (__NR_SVR4 + 55)
-#define __NR_SVR4_exch (__NR_SVR4 + 56)
-#define __NR_SVR4_utssys (__NR_SVR4 + 57)
-#define __NR_SVR4_fsync (__NR_SVR4 + 58)
-#define __NR_SVR4_exece (__NR_SVR4 + 59)
-#define __NR_SVR4_umask (__NR_SVR4 + 60)
-#define __NR_SVR4_chroot (__NR_SVR4 + 61)
-#define __NR_SVR4_fcntl (__NR_SVR4 + 62)
-#define __NR_SVR4_ulimit (__NR_SVR4 + 63)
-#define __NR_SVR4_reserved1 (__NR_SVR4 + 64)
-#define __NR_SVR4_reserved2 (__NR_SVR4 + 65)
-#define __NR_SVR4_reserved3 (__NR_SVR4 + 66)
-#define __NR_SVR4_reserved4 (__NR_SVR4 + 67)
-#define __NR_SVR4_reserved5 (__NR_SVR4 + 68)
-#define __NR_SVR4_reserved6 (__NR_SVR4 + 69)
-#define __NR_SVR4_advfs (__NR_SVR4 + 70)
-#define __NR_SVR4_unadvfs (__NR_SVR4 + 71)
-#define __NR_SVR4_unused1 (__NR_SVR4 + 72)
-#define __NR_SVR4_unused2 (__NR_SVR4 + 73)
-#define __NR_SVR4_rfstart (__NR_SVR4 + 74)
-#define __NR_SVR4_unused3 (__NR_SVR4 + 75)
-#define __NR_SVR4_rdebug (__NR_SVR4 + 76)
-#define __NR_SVR4_rfstop (__NR_SVR4 + 77)
-#define __NR_SVR4_rfsys (__NR_SVR4 + 78)
-#define __NR_SVR4_rmdir (__NR_SVR4 + 79)
-#define __NR_SVR4_mkdir (__NR_SVR4 + 80)
-#define __NR_SVR4_getdents (__NR_SVR4 + 81)
-#define __NR_SVR4_libattach (__NR_SVR4 + 82)
-#define __NR_SVR4_libdetach (__NR_SVR4 + 83)
-#define __NR_SVR4_sysfs (__NR_SVR4 + 84)
-#define __NR_SVR4_getmsg (__NR_SVR4 + 85)
-#define __NR_SVR4_putmsg (__NR_SVR4 + 86)
-#define __NR_SVR4_poll (__NR_SVR4 + 87)
-#define __NR_SVR4_lstat (__NR_SVR4 + 88)
-#define __NR_SVR4_symlink (__NR_SVR4 + 89)
-#define __NR_SVR4_readlink (__NR_SVR4 + 90)
-#define __NR_SVR4_setgroups (__NR_SVR4 + 91)
-#define __NR_SVR4_getgroups (__NR_SVR4 + 92)
-#define __NR_SVR4_fchmod (__NR_SVR4 + 93)
-#define __NR_SVR4_fchown (__NR_SVR4 + 94)
-#define __NR_SVR4_sigprocmask (__NR_SVR4 + 95)
-#define __NR_SVR4_sigsuspend (__NR_SVR4 + 96)
-#define __NR_SVR4_sigaltstack (__NR_SVR4 + 97)
-#define __NR_SVR4_sigaction (__NR_SVR4 + 98)
-#define __NR_SVR4_sigpending (__NR_SVR4 + 99)
-#define __NR_SVR4_setcontext (__NR_SVR4 + 100)
-#define __NR_SVR4_evsys (__NR_SVR4 + 101)
-#define __NR_SVR4_evtrapret (__NR_SVR4 + 102)
-#define __NR_SVR4_statvfs (__NR_SVR4 + 103)
-#define __NR_SVR4_fstatvfs (__NR_SVR4 + 104)
-#define __NR_SVR4_reserved7 (__NR_SVR4 + 105)
-#define __NR_SVR4_nfssys (__NR_SVR4 + 106)
-#define __NR_SVR4_waitid (__NR_SVR4 + 107)
-#define __NR_SVR4_sigsendset (__NR_SVR4 + 108)
-#define __NR_SVR4_hrtsys (__NR_SVR4 + 109)
-#define __NR_SVR4_acancel (__NR_SVR4 + 110)
-#define __NR_SVR4_async (__NR_SVR4 + 111)
-#define __NR_SVR4_priocntlset (__NR_SVR4 + 112)
-#define __NR_SVR4_pathconf (__NR_SVR4 + 113)
-#define __NR_SVR4_mincore (__NR_SVR4 + 114)
-#define __NR_SVR4_mmap (__NR_SVR4 + 115)
-#define __NR_SVR4_mprotect (__NR_SVR4 + 116)
-#define __NR_SVR4_munmap (__NR_SVR4 + 117)
-#define __NR_SVR4_fpathconf (__NR_SVR4 + 118)
-#define __NR_SVR4_vfork (__NR_SVR4 + 119)
-#define __NR_SVR4_fchdir (__NR_SVR4 + 120)
-#define __NR_SVR4_readv (__NR_SVR4 + 121)
-#define __NR_SVR4_writev (__NR_SVR4 + 122)
-#define __NR_SVR4_xstat (__NR_SVR4 + 123)
-#define __NR_SVR4_lxstat (__NR_SVR4 + 124)
-#define __NR_SVR4_fxstat (__NR_SVR4 + 125)
-#define __NR_SVR4_xmknod (__NR_SVR4 + 126)
-#define __NR_SVR4_clocal (__NR_SVR4 + 127)
-#define __NR_SVR4_setrlimit (__NR_SVR4 + 128)
-#define __NR_SVR4_getrlimit (__NR_SVR4 + 129)
-#define __NR_SVR4_lchown (__NR_SVR4 + 130)
-#define __NR_SVR4_memcntl (__NR_SVR4 + 131)
-#define __NR_SVR4_getpmsg (__NR_SVR4 + 132)
-#define __NR_SVR4_putpmsg (__NR_SVR4 + 133)
-#define __NR_SVR4_rename (__NR_SVR4 + 134)
-#define __NR_SVR4_nuname (__NR_SVR4 + 135)
-#define __NR_SVR4_setegid (__NR_SVR4 + 136)
-#define __NR_SVR4_sysconf (__NR_SVR4 + 137)
-#define __NR_SVR4_adjtime (__NR_SVR4 + 138)
-#define __NR_SVR4_sysinfo (__NR_SVR4 + 139)
-#define __NR_SVR4_reserved8 (__NR_SVR4 + 140)
-#define __NR_SVR4_seteuid (__NR_SVR4 + 141)
-#define __NR_SVR4_PYRAMID_statis (__NR_SVR4 + 142)
-#define __NR_SVR4_PYRAMID_tuning (__NR_SVR4 + 143)
-#define __NR_SVR4_PYRAMID_forcerr (__NR_SVR4 + 144)
-#define __NR_SVR4_PYRAMID_mpcntl (__NR_SVR4 + 145)
-#define __NR_SVR4_reserved9 (__NR_SVR4 + 146)
-#define __NR_SVR4_reserved10 (__NR_SVR4 + 147)
-#define __NR_SVR4_reserved11 (__NR_SVR4 + 148)
-#define __NR_SVR4_reserved12 (__NR_SVR4 + 149)
-#define __NR_SVR4_reserved13 (__NR_SVR4 + 150)
-#define __NR_SVR4_reserved14 (__NR_SVR4 + 151)
-#define __NR_SVR4_reserved15 (__NR_SVR4 + 152)
-#define __NR_SVR4_reserved16 (__NR_SVR4 + 153)
-#define __NR_SVR4_reserved17 (__NR_SVR4 + 154)
-#define __NR_SVR4_reserved18 (__NR_SVR4 + 155)
-#define __NR_SVR4_reserved19 (__NR_SVR4 + 156)
-#define __NR_SVR4_reserved20 (__NR_SVR4 + 157)
-#define __NR_SVR4_reserved21 (__NR_SVR4 + 158)
-#define __NR_SVR4_reserved22 (__NR_SVR4 + 159)
-#define __NR_SVR4_reserved23 (__NR_SVR4 + 160)
-#define __NR_SVR4_reserved24 (__NR_SVR4 + 161)
-#define __NR_SVR4_reserved25 (__NR_SVR4 + 162)
-#define __NR_SVR4_reserved26 (__NR_SVR4 + 163)
-#define __NR_SVR4_reserved27 (__NR_SVR4 + 164)
-#define __NR_SVR4_reserved28 (__NR_SVR4 + 165)
-#define __NR_SVR4_reserved29 (__NR_SVR4 + 166)
-#define __NR_SVR4_reserved30 (__NR_SVR4 + 167)
-#define __NR_SVR4_reserved31 (__NR_SVR4 + 168)
-#define __NR_SVR4_reserved32 (__NR_SVR4 + 169)
-#define __NR_SVR4_reserved33 (__NR_SVR4 + 170)
-#define __NR_SVR4_reserved34 (__NR_SVR4 + 171)
-#define __NR_SVR4_reserved35 (__NR_SVR4 + 172)
-#define __NR_SVR4_reserved36 (__NR_SVR4 + 173)
-#define __NR_SVR4_reserved37 (__NR_SVR4 + 174)
-#define __NR_SVR4_reserved38 (__NR_SVR4 + 175)
-#define __NR_SVR4_reserved39 (__NR_SVR4 + 176)
-#define __NR_SVR4_reserved40 (__NR_SVR4 + 177)
-#define __NR_SVR4_reserved41 (__NR_SVR4 + 178)
-#define __NR_SVR4_reserved42 (__NR_SVR4 + 179)
-#define __NR_SVR4_reserved43 (__NR_SVR4 + 180)
-#define __NR_SVR4_reserved44 (__NR_SVR4 + 181)
-#define __NR_SVR4_reserved45 (__NR_SVR4 + 182)
-#define __NR_SVR4_reserved46 (__NR_SVR4 + 183)
-#define __NR_SVR4_reserved47 (__NR_SVR4 + 184)
-#define __NR_SVR4_reserved48 (__NR_SVR4 + 185)
-#define __NR_SVR4_reserved49 (__NR_SVR4 + 186)
-#define __NR_SVR4_reserved50 (__NR_SVR4 + 187)
-#define __NR_SVR4_reserved51 (__NR_SVR4 + 188)
-#define __NR_SVR4_reserved52 (__NR_SVR4 + 189)
-#define __NR_SVR4_reserved53 (__NR_SVR4 + 190)
-#define __NR_SVR4_reserved54 (__NR_SVR4 + 191)
-#define __NR_SVR4_reserved55 (__NR_SVR4 + 192)
-#define __NR_SVR4_reserved56 (__NR_SVR4 + 193)
-#define __NR_SVR4_reserved57 (__NR_SVR4 + 194)
-#define __NR_SVR4_reserved58 (__NR_SVR4 + 195)
-#define __NR_SVR4_reserved59 (__NR_SVR4 + 196)
-#define __NR_SVR4_reserved60 (__NR_SVR4 + 197)
-#define __NR_SVR4_reserved61 (__NR_SVR4 + 198)
-#define __NR_SVR4_reserved62 (__NR_SVR4 + 199)
-#define __NR_SVR4_reserved63 (__NR_SVR4 + 200)
-#define __NR_SVR4_aread (__NR_SVR4 + 201)
-#define __NR_SVR4_awrite (__NR_SVR4 + 202)
-#define __NR_SVR4_listio (__NR_SVR4 + 203)
-#define __NR_SVR4_mips_acancel (__NR_SVR4 + 204)
-#define __NR_SVR4_astatus (__NR_SVR4 + 205)
-#define __NR_SVR4_await (__NR_SVR4 + 206)
-#define __NR_SVR4_areadv (__NR_SVR4 + 207)
-#define __NR_SVR4_awritev (__NR_SVR4 + 208)
-#define __NR_SVR4_MIPS_reserved1 (__NR_SVR4 + 209)
-#define __NR_SVR4_MIPS_reserved2 (__NR_SVR4 + 210)
-#define __NR_SVR4_MIPS_reserved3 (__NR_SVR4 + 211)
-#define __NR_SVR4_MIPS_reserved4 (__NR_SVR4 + 212)
-#define __NR_SVR4_MIPS_reserved5 (__NR_SVR4 + 213)
-#define __NR_SVR4_MIPS_reserved6 (__NR_SVR4 + 214)
-#define __NR_SVR4_MIPS_reserved7 (__NR_SVR4 + 215)
-#define __NR_SVR4_MIPS_reserved8 (__NR_SVR4 + 216)
-#define __NR_SVR4_MIPS_reserved9 (__NR_SVR4 + 217)
-#define __NR_SVR4_MIPS_reserved10 (__NR_SVR4 + 218)
-#define __NR_SVR4_MIPS_reserved11 (__NR_SVR4 + 219)
-#define __NR_SVR4_MIPS_reserved12 (__NR_SVR4 + 220)
-#define __NR_SVR4_CDC_reserved1 (__NR_SVR4 + 221)
-#define __NR_SVR4_CDC_reserved2 (__NR_SVR4 + 222)
-#define __NR_SVR4_CDC_reserved3 (__NR_SVR4 + 223)
-#define __NR_SVR4_CDC_reserved4 (__NR_SVR4 + 224)
-#define __NR_SVR4_CDC_reserved5 (__NR_SVR4 + 225)
-#define __NR_SVR4_CDC_reserved6 (__NR_SVR4 + 226)
-#define __NR_SVR4_CDC_reserved7 (__NR_SVR4 + 227)
-#define __NR_SVR4_CDC_reserved8 (__NR_SVR4 + 228)
-#define __NR_SVR4_CDC_reserved9 (__NR_SVR4 + 229)
-#define __NR_SVR4_CDC_reserved10 (__NR_SVR4 + 230)
-#define __NR_SVR4_CDC_reserved11 (__NR_SVR4 + 231)
-#define __NR_SVR4_CDC_reserved12 (__NR_SVR4 + 232)
-#define __NR_SVR4_CDC_reserved13 (__NR_SVR4 + 233)
-#define __NR_SVR4_CDC_reserved14 (__NR_SVR4 + 234)
-#define __NR_SVR4_CDC_reserved15 (__NR_SVR4 + 235)
-#define __NR_SVR4_CDC_reserved16 (__NR_SVR4 + 236)
-#define __NR_SVR4_CDC_reserved17 (__NR_SVR4 + 237)
-#define __NR_SVR4_CDC_reserved18 (__NR_SVR4 + 238)
-#define __NR_SVR4_CDC_reserved19 (__NR_SVR4 + 239)
-#define __NR_SVR4_CDC_reserved20 (__NR_SVR4 + 240)
-
-/*
- * SYS V syscalls are in the range from 1000 to 1999
- */
-#define __NR_SYSV 1000
-#define __NR_SYSV_syscall (__NR_SYSV + 0)
-#define __NR_SYSV_exit (__NR_SYSV + 1)
-#define __NR_SYSV_fork (__NR_SYSV + 2)
-#define __NR_SYSV_read (__NR_SYSV + 3)
-#define __NR_SYSV_write (__NR_SYSV + 4)
-#define __NR_SYSV_open (__NR_SYSV + 5)
-#define __NR_SYSV_close (__NR_SYSV + 6)
-#define __NR_SYSV_wait (__NR_SYSV + 7)
-#define __NR_SYSV_creat (__NR_SYSV + 8)
-#define __NR_SYSV_link (__NR_SYSV + 9)
-#define __NR_SYSV_unlink (__NR_SYSV + 10)
-#define __NR_SYSV_execv (__NR_SYSV + 11)
-#define __NR_SYSV_chdir (__NR_SYSV + 12)
-#define __NR_SYSV_time (__NR_SYSV + 13)
-#define __NR_SYSV_mknod (__NR_SYSV + 14)
-#define __NR_SYSV_chmod (__NR_SYSV + 15)
-#define __NR_SYSV_chown (__NR_SYSV + 16)
-#define __NR_SYSV_brk (__NR_SYSV + 17)
-#define __NR_SYSV_stat (__NR_SYSV + 18)
-#define __NR_SYSV_lseek (__NR_SYSV + 19)
-#define __NR_SYSV_getpid (__NR_SYSV + 20)
-#define __NR_SYSV_mount (__NR_SYSV + 21)
-#define __NR_SYSV_umount (__NR_SYSV + 22)
-#define __NR_SYSV_setuid (__NR_SYSV + 23)
-#define __NR_SYSV_getuid (__NR_SYSV + 24)
-#define __NR_SYSV_stime (__NR_SYSV + 25)
-#define __NR_SYSV_ptrace (__NR_SYSV + 26)
-#define __NR_SYSV_alarm (__NR_SYSV + 27)
-#define __NR_SYSV_fstat (__NR_SYSV + 28)
-#define __NR_SYSV_pause (__NR_SYSV + 29)
-#define __NR_SYSV_utime (__NR_SYSV + 30)
-#define __NR_SYSV_stty (__NR_SYSV + 31)
-#define __NR_SYSV_gtty (__NR_SYSV + 32)
-#define __NR_SYSV_access (__NR_SYSV + 33)
-#define __NR_SYSV_nice (__NR_SYSV + 34)
-#define __NR_SYSV_statfs (__NR_SYSV + 35)
-#define __NR_SYSV_sync (__NR_SYSV + 36)
-#define __NR_SYSV_kill (__NR_SYSV + 37)
-#define __NR_SYSV_fstatfs (__NR_SYSV + 38)
-#define __NR_SYSV_setpgrp (__NR_SYSV + 39)
-#define __NR_SYSV_syssgi (__NR_SYSV + 40)
-#define __NR_SYSV_dup (__NR_SYSV + 41)
-#define __NR_SYSV_pipe (__NR_SYSV + 42)
-#define __NR_SYSV_times (__NR_SYSV + 43)
-#define __NR_SYSV_profil (__NR_SYSV + 44)
-#define __NR_SYSV_plock (__NR_SYSV + 45)
-#define __NR_SYSV_setgid (__NR_SYSV + 46)
-#define __NR_SYSV_getgid (__NR_SYSV + 47)
-#define __NR_SYSV_sig (__NR_SYSV + 48)
-#define __NR_SYSV_msgsys (__NR_SYSV + 49)
-#define __NR_SYSV_sysmips (__NR_SYSV + 50)
-#define __NR_SYSV_acct (__NR_SYSV + 51)
-#define __NR_SYSV_shmsys (__NR_SYSV + 52)
-#define __NR_SYSV_semsys (__NR_SYSV + 53)
-#define __NR_SYSV_ioctl (__NR_SYSV + 54)
-#define __NR_SYSV_uadmin (__NR_SYSV + 55)
-#define __NR_SYSV_sysmp (__NR_SYSV + 56)
-#define __NR_SYSV_utssys (__NR_SYSV + 57)
-#define __NR_SYSV_USG_reserved1 (__NR_SYSV + 58)
-#define __NR_SYSV_execve (__NR_SYSV + 59)
-#define __NR_SYSV_umask (__NR_SYSV + 60)
-#define __NR_SYSV_chroot (__NR_SYSV + 61)
-#define __NR_SYSV_fcntl (__NR_SYSV + 62)
-#define __NR_SYSV_ulimit (__NR_SYSV + 63)
-#define __NR_SYSV_SAFARI4_reserved1 (__NR_SYSV + 64)
-#define __NR_SYSV_SAFARI4_reserved2 (__NR_SYSV + 65)
-#define __NR_SYSV_SAFARI4_reserved3 (__NR_SYSV + 66)
-#define __NR_SYSV_SAFARI4_reserved4 (__NR_SYSV + 67)
-#define __NR_SYSV_SAFARI4_reserved5 (__NR_SYSV + 68)
-#define __NR_SYSV_SAFARI4_reserved6 (__NR_SYSV + 69)
-#define __NR_SYSV_advfs (__NR_SYSV + 70)
-#define __NR_SYSV_unadvfs (__NR_SYSV + 71)
-#define __NR_SYSV_rmount (__NR_SYSV + 72)
-#define __NR_SYSV_rumount (__NR_SYSV + 73)
-#define __NR_SYSV_rfstart (__NR_SYSV + 74)
-#define __NR_SYSV_getrlimit64 (__NR_SYSV + 75)
-#define __NR_SYSV_setrlimit64 (__NR_SYSV + 76)
-#define __NR_SYSV_nanosleep (__NR_SYSV + 77)
-#define __NR_SYSV_lseek64 (__NR_SYSV + 78)
-#define __NR_SYSV_rmdir (__NR_SYSV + 79)
-#define __NR_SYSV_mkdir (__NR_SYSV + 80)
-#define __NR_SYSV_getdents (__NR_SYSV + 81)
-#define __NR_SYSV_sginap (__NR_SYSV + 82)
-#define __NR_SYSV_sgikopt (__NR_SYSV + 83)
-#define __NR_SYSV_sysfs (__NR_SYSV + 84)
-#define __NR_SYSV_getmsg (__NR_SYSV + 85)
-#define __NR_SYSV_putmsg (__NR_SYSV + 86)
-#define __NR_SYSV_poll (__NR_SYSV + 87)
-#define __NR_SYSV_sigreturn (__NR_SYSV + 88)
-#define __NR_SYSV_accept (__NR_SYSV + 89)
-#define __NR_SYSV_bind (__NR_SYSV + 90)
-#define __NR_SYSV_connect (__NR_SYSV + 91)
-#define __NR_SYSV_gethostid (__NR_SYSV + 92)
-#define __NR_SYSV_getpeername (__NR_SYSV + 93)
-#define __NR_SYSV_getsockname (__NR_SYSV + 94)
-#define __NR_SYSV_getsockopt (__NR_SYSV + 95)
-#define __NR_SYSV_listen (__NR_SYSV + 96)
-#define __NR_SYSV_recv (__NR_SYSV + 97)
-#define __NR_SYSV_recvfrom (__NR_SYSV + 98)
-#define __NR_SYSV_recvmsg (__NR_SYSV + 99)
-#define __NR_SYSV_select (__NR_SYSV + 100)
-#define __NR_SYSV_send (__NR_SYSV + 101)
-#define __NR_SYSV_sendmsg (__NR_SYSV + 102)
-#define __NR_SYSV_sendto (__NR_SYSV + 103)
-#define __NR_SYSV_sethostid (__NR_SYSV + 104)
-#define __NR_SYSV_setsockopt (__NR_SYSV + 105)
-#define __NR_SYSV_shutdown (__NR_SYSV + 106)
-#define __NR_SYSV_socket (__NR_SYSV + 107)
-#define __NR_SYSV_gethostname (__NR_SYSV + 108)
-#define __NR_SYSV_sethostname (__NR_SYSV + 109)
-#define __NR_SYSV_getdomainname (__NR_SYSV + 110)
-#define __NR_SYSV_setdomainname (__NR_SYSV + 111)
-#define __NR_SYSV_truncate (__NR_SYSV + 112)
-#define __NR_SYSV_ftruncate (__NR_SYSV + 113)
-#define __NR_SYSV_rename (__NR_SYSV + 114)
-#define __NR_SYSV_symlink (__NR_SYSV + 115)
-#define __NR_SYSV_readlink (__NR_SYSV + 116)
-#define __NR_SYSV_lstat (__NR_SYSV + 117)
-#define __NR_SYSV_nfsmount (__NR_SYSV + 118)
-#define __NR_SYSV_nfssvc (__NR_SYSV + 119)
-#define __NR_SYSV_getfh (__NR_SYSV + 120)
-#define __NR_SYSV_async_daemon (__NR_SYSV + 121)
-#define __NR_SYSV_exportfs (__NR_SYSV + 122)
-#define __NR_SYSV_setregid (__NR_SYSV + 123)
-#define __NR_SYSV_setreuid (__NR_SYSV + 124)
-#define __NR_SYSV_getitimer (__NR_SYSV + 125)
-#define __NR_SYSV_setitimer (__NR_SYSV + 126)
-#define __NR_SYSV_adjtime (__NR_SYSV + 127)
-#define __NR_SYSV_BSD_getime (__NR_SYSV + 128)
-#define __NR_SYSV_sproc (__NR_SYSV + 129)
-#define __NR_SYSV_prctl (__NR_SYSV + 130)
-#define __NR_SYSV_procblk (__NR_SYSV + 131)
-#define __NR_SYSV_sprocsp (__NR_SYSV + 132)
-#define __NR_SYSV_sgigsc (__NR_SYSV + 133)
-#define __NR_SYSV_mmap (__NR_SYSV + 134)
-#define __NR_SYSV_munmap (__NR_SYSV + 135)
-#define __NR_SYSV_mprotect (__NR_SYSV + 136)
-#define __NR_SYSV_msync (__NR_SYSV + 137)
-#define __NR_SYSV_madvise (__NR_SYSV + 138)
-#define __NR_SYSV_pagelock (__NR_SYSV + 139)
-#define __NR_SYSV_getpagesize (__NR_SYSV + 140)
-#define __NR_SYSV_quotactl (__NR_SYSV + 141)
-#define __NR_SYSV_libdetach (__NR_SYSV + 142)
-#define __NR_SYSV_BSDgetpgrp (__NR_SYSV + 143)
-#define __NR_SYSV_BSDsetpgrp (__NR_SYSV + 144)
-#define __NR_SYSV_vhangup (__NR_SYSV + 145)
-#define __NR_SYSV_fsync (__NR_SYSV + 146)
-#define __NR_SYSV_fchdir (__NR_SYSV + 147)
-#define __NR_SYSV_getrlimit (__NR_SYSV + 148)
-#define __NR_SYSV_setrlimit (__NR_SYSV + 149)
-#define __NR_SYSV_cacheflush (__NR_SYSV + 150)
-#define __NR_SYSV_cachectl (__NR_SYSV + 151)
-#define __NR_SYSV_fchown (__NR_SYSV + 152)
420#define __NR_SYSV_fchmod (__NR_SYSV + 153)
421#define __NR_SYSV_wait3 (__NR_SYSV + 154)
422#define __NR_SYSV_socketpair (__NR_SYSV + 155)
423#define __NR_SYSV_sysinfo (__NR_SYSV + 156)
424#define __NR_SYSV_nuname (__NR_SYSV + 157)
425#define __NR_SYSV_xstat (__NR_SYSV + 158)
426#define __NR_SYSV_lxstat (__NR_SYSV + 159)
427#define __NR_SYSV_fxstat (__NR_SYSV + 160)
428#define __NR_SYSV_xmknod (__NR_SYSV + 161)
429#define __NR_SYSV_ksigaction (__NR_SYSV + 162)
430#define __NR_SYSV_sigpending (__NR_SYSV + 163)
431#define __NR_SYSV_sigprocmask (__NR_SYSV + 164)
432#define __NR_SYSV_sigsuspend (__NR_SYSV + 165)
433#define __NR_SYSV_sigpoll (__NR_SYSV + 166)
434#define __NR_SYSV_swapctl (__NR_SYSV + 167)
435#define __NR_SYSV_getcontext (__NR_SYSV + 168)
436#define __NR_SYSV_setcontext (__NR_SYSV + 169)
437#define __NR_SYSV_waitsys (__NR_SYSV + 170)
438#define __NR_SYSV_sigstack (__NR_SYSV + 171)
439#define __NR_SYSV_sigaltstack (__NR_SYSV + 172)
440#define __NR_SYSV_sigsendset (__NR_SYSV + 173)
441#define __NR_SYSV_statvfs (__NR_SYSV + 174)
442#define __NR_SYSV_fstatvfs (__NR_SYSV + 175)
443#define __NR_SYSV_getpmsg (__NR_SYSV + 176)
444#define __NR_SYSV_putpmsg (__NR_SYSV + 177)
445#define __NR_SYSV_lchown (__NR_SYSV + 178)
446#define __NR_SYSV_priocntl (__NR_SYSV + 179)
447#define __NR_SYSV_ksigqueue (__NR_SYSV + 180)
448#define __NR_SYSV_readv (__NR_SYSV + 181)
449#define __NR_SYSV_writev (__NR_SYSV + 182)
450#define __NR_SYSV_truncate64 (__NR_SYSV + 183)
451#define __NR_SYSV_ftruncate64 (__NR_SYSV + 184)
452#define __NR_SYSV_mmap64 (__NR_SYSV + 185)
453#define __NR_SYSV_dmi (__NR_SYSV + 186)
454#define __NR_SYSV_pread (__NR_SYSV + 187)
455#define __NR_SYSV_pwrite (__NR_SYSV + 188)
456
/*
 * BSD 4.3 syscalls are in the range from 2000 to 2999
 */
#define __NR_BSD43 2000
#define __NR_BSD43_syscall (__NR_BSD43 + 0)
#define __NR_BSD43_exit (__NR_BSD43 + 1)
#define __NR_BSD43_fork (__NR_BSD43 + 2)
#define __NR_BSD43_read (__NR_BSD43 + 3)
#define __NR_BSD43_write (__NR_BSD43 + 4)
#define __NR_BSD43_open (__NR_BSD43 + 5)
#define __NR_BSD43_close (__NR_BSD43 + 6)
#define __NR_BSD43_wait (__NR_BSD43 + 7)
#define __NR_BSD43_creat (__NR_BSD43 + 8)
#define __NR_BSD43_link (__NR_BSD43 + 9)
#define __NR_BSD43_unlink (__NR_BSD43 + 10)
#define __NR_BSD43_exec (__NR_BSD43 + 11)
#define __NR_BSD43_chdir (__NR_BSD43 + 12)
#define __NR_BSD43_time (__NR_BSD43 + 13)
#define __NR_BSD43_mknod (__NR_BSD43 + 14)
#define __NR_BSD43_chmod (__NR_BSD43 + 15)
#define __NR_BSD43_chown (__NR_BSD43 + 16)
#define __NR_BSD43_sbreak (__NR_BSD43 + 17)
#define __NR_BSD43_oldstat (__NR_BSD43 + 18)
#define __NR_BSD43_lseek (__NR_BSD43 + 19)
#define __NR_BSD43_getpid (__NR_BSD43 + 20)
#define __NR_BSD43_oldmount (__NR_BSD43 + 21)
#define __NR_BSD43_umount (__NR_BSD43 + 22)
#define __NR_BSD43_setuid (__NR_BSD43 + 23)
#define __NR_BSD43_getuid (__NR_BSD43 + 24)
#define __NR_BSD43_stime (__NR_BSD43 + 25)
#define __NR_BSD43_ptrace (__NR_BSD43 + 26)
#define __NR_BSD43_alarm (__NR_BSD43 + 27)
#define __NR_BSD43_oldfstat (__NR_BSD43 + 28)
#define __NR_BSD43_pause (__NR_BSD43 + 29)
#define __NR_BSD43_utime (__NR_BSD43 + 30)
#define __NR_BSD43_stty (__NR_BSD43 + 31)
#define __NR_BSD43_gtty (__NR_BSD43 + 32)
#define __NR_BSD43_access (__NR_BSD43 + 33)
#define __NR_BSD43_nice (__NR_BSD43 + 34)
#define __NR_BSD43_ftime (__NR_BSD43 + 35)
#define __NR_BSD43_sync (__NR_BSD43 + 36)
#define __NR_BSD43_kill (__NR_BSD43 + 37)
#define __NR_BSD43_stat (__NR_BSD43 + 38)
#define __NR_BSD43_oldsetpgrp (__NR_BSD43 + 39)
#define __NR_BSD43_lstat (__NR_BSD43 + 40)
#define __NR_BSD43_dup (__NR_BSD43 + 41)
#define __NR_BSD43_pipe (__NR_BSD43 + 42)
#define __NR_BSD43_times (__NR_BSD43 + 43)
#define __NR_BSD43_profil (__NR_BSD43 + 44)
#define __NR_BSD43_msgsys (__NR_BSD43 + 45)
#define __NR_BSD43_setgid (__NR_BSD43 + 46)
#define __NR_BSD43_getgid (__NR_BSD43 + 47)
#define __NR_BSD43_ssig (__NR_BSD43 + 48)
#define __NR_BSD43_reserved1 (__NR_BSD43 + 49)
#define __NR_BSD43_reserved2 (__NR_BSD43 + 50)
#define __NR_BSD43_sysacct (__NR_BSD43 + 51)
#define __NR_BSD43_phys (__NR_BSD43 + 52)
#define __NR_BSD43_lock (__NR_BSD43 + 53)
#define __NR_BSD43_ioctl (__NR_BSD43 + 54)
#define __NR_BSD43_reboot (__NR_BSD43 + 55)
#define __NR_BSD43_mpxchan (__NR_BSD43 + 56)
#define __NR_BSD43_symlink (__NR_BSD43 + 57)
#define __NR_BSD43_readlink (__NR_BSD43 + 58)
#define __NR_BSD43_execve (__NR_BSD43 + 59)
#define __NR_BSD43_umask (__NR_BSD43 + 60)
#define __NR_BSD43_chroot (__NR_BSD43 + 61)
#define __NR_BSD43_fstat (__NR_BSD43 + 62)
#define __NR_BSD43_reserved3 (__NR_BSD43 + 63)
#define __NR_BSD43_getpagesize (__NR_BSD43 + 64)
#define __NR_BSD43_mremap (__NR_BSD43 + 65)
#define __NR_BSD43_vfork (__NR_BSD43 + 66)
#define __NR_BSD43_vread (__NR_BSD43 + 67)
#define __NR_BSD43_vwrite (__NR_BSD43 + 68)
#define __NR_BSD43_sbrk (__NR_BSD43 + 69)
#define __NR_BSD43_sstk (__NR_BSD43 + 70)
#define __NR_BSD43_mmap (__NR_BSD43 + 71)
#define __NR_BSD43_vadvise (__NR_BSD43 + 72)
#define __NR_BSD43_munmap (__NR_BSD43 + 73)
#define __NR_BSD43_mprotect (__NR_BSD43 + 74)
#define __NR_BSD43_madvise (__NR_BSD43 + 75)
#define __NR_BSD43_vhangup (__NR_BSD43 + 76)
#define __NR_BSD43_vlimit (__NR_BSD43 + 77)
#define __NR_BSD43_mincore (__NR_BSD43 + 78)
#define __NR_BSD43_getgroups (__NR_BSD43 + 79)
#define __NR_BSD43_setgroups (__NR_BSD43 + 80)
#define __NR_BSD43_getpgrp (__NR_BSD43 + 81)
#define __NR_BSD43_setpgrp (__NR_BSD43 + 82)
#define __NR_BSD43_setitimer (__NR_BSD43 + 83)
#define __NR_BSD43_wait3 (__NR_BSD43 + 84)
#define __NR_BSD43_swapon (__NR_BSD43 + 85)
#define __NR_BSD43_getitimer (__NR_BSD43 + 86)
#define __NR_BSD43_gethostname (__NR_BSD43 + 87)
#define __NR_BSD43_sethostname (__NR_BSD43 + 88)
#define __NR_BSD43_getdtablesize (__NR_BSD43 + 89)
#define __NR_BSD43_dup2 (__NR_BSD43 + 90)
#define __NR_BSD43_getdopt (__NR_BSD43 + 91)
#define __NR_BSD43_fcntl (__NR_BSD43 + 92)
#define __NR_BSD43_select (__NR_BSD43 + 93)
#define __NR_BSD43_setdopt (__NR_BSD43 + 94)
#define __NR_BSD43_fsync (__NR_BSD43 + 95)
#define __NR_BSD43_setpriority (__NR_BSD43 + 96)
#define __NR_BSD43_socket (__NR_BSD43 + 97)
#define __NR_BSD43_connect (__NR_BSD43 + 98)
#define __NR_BSD43_oldaccept (__NR_BSD43 + 99)
#define __NR_BSD43_getpriority (__NR_BSD43 + 100)
#define __NR_BSD43_send (__NR_BSD43 + 101)
#define __NR_BSD43_recv (__NR_BSD43 + 102)
#define __NR_BSD43_sigreturn (__NR_BSD43 + 103)
#define __NR_BSD43_bind (__NR_BSD43 + 104)
#define __NR_BSD43_setsockopt (__NR_BSD43 + 105)
#define __NR_BSD43_listen (__NR_BSD43 + 106)
#define __NR_BSD43_vtimes (__NR_BSD43 + 107)
#define __NR_BSD43_sigvec (__NR_BSD43 + 108)
#define __NR_BSD43_sigblock (__NR_BSD43 + 109)
#define __NR_BSD43_sigsetmask (__NR_BSD43 + 110)
#define __NR_BSD43_sigpause (__NR_BSD43 + 111)
#define __NR_BSD43_sigstack (__NR_BSD43 + 112)
#define __NR_BSD43_oldrecvmsg (__NR_BSD43 + 113)
#define __NR_BSD43_oldsendmsg (__NR_BSD43 + 114)
#define __NR_BSD43_vtrace (__NR_BSD43 + 115)
#define __NR_BSD43_gettimeofday (__NR_BSD43 + 116)
#define __NR_BSD43_getrusage (__NR_BSD43 + 117)
#define __NR_BSD43_getsockopt (__NR_BSD43 + 118)
#define __NR_BSD43_reserved4 (__NR_BSD43 + 119)
#define __NR_BSD43_readv (__NR_BSD43 + 120)
#define __NR_BSD43_writev (__NR_BSD43 + 121)
#define __NR_BSD43_settimeofday (__NR_BSD43 + 122)
#define __NR_BSD43_fchown (__NR_BSD43 + 123)
#define __NR_BSD43_fchmod (__NR_BSD43 + 124)
#define __NR_BSD43_oldrecvfrom (__NR_BSD43 + 125)
#define __NR_BSD43_setreuid (__NR_BSD43 + 126)
#define __NR_BSD43_setregid (__NR_BSD43 + 127)
#define __NR_BSD43_rename (__NR_BSD43 + 128)
#define __NR_BSD43_truncate (__NR_BSD43 + 129)
#define __NR_BSD43_ftruncate (__NR_BSD43 + 130)
#define __NR_BSD43_flock (__NR_BSD43 + 131)
#define __NR_BSD43_semsys (__NR_BSD43 + 132)
#define __NR_BSD43_sendto (__NR_BSD43 + 133)
#define __NR_BSD43_shutdown (__NR_BSD43 + 134)
#define __NR_BSD43_socketpair (__NR_BSD43 + 135)
#define __NR_BSD43_mkdir (__NR_BSD43 + 136)
#define __NR_BSD43_rmdir (__NR_BSD43 + 137)
#define __NR_BSD43_utimes (__NR_BSD43 + 138)
#define __NR_BSD43_sigcleanup (__NR_BSD43 + 139)
#define __NR_BSD43_adjtime (__NR_BSD43 + 140)
#define __NR_BSD43_oldgetpeername (__NR_BSD43 + 141)
#define __NR_BSD43_gethostid (__NR_BSD43 + 142)
#define __NR_BSD43_sethostid (__NR_BSD43 + 143)
#define __NR_BSD43_getrlimit (__NR_BSD43 + 144)
#define __NR_BSD43_setrlimit (__NR_BSD43 + 145)
#define __NR_BSD43_killpg (__NR_BSD43 + 146)
#define __NR_BSD43_shmsys (__NR_BSD43 + 147)
#define __NR_BSD43_quota (__NR_BSD43 + 148)
#define __NR_BSD43_qquota (__NR_BSD43 + 149)
#define __NR_BSD43_oldgetsockname (__NR_BSD43 + 150)
#define __NR_BSD43_sysmips (__NR_BSD43 + 151)
#define __NR_BSD43_cacheflush (__NR_BSD43 + 152)
#define __NR_BSD43_cachectl (__NR_BSD43 + 153)
#define __NR_BSD43_debug (__NR_BSD43 + 154)
#define __NR_BSD43_reserved5 (__NR_BSD43 + 155)
#define __NR_BSD43_reserved6 (__NR_BSD43 + 156)
#define __NR_BSD43_nfs_mount (__NR_BSD43 + 157)
#define __NR_BSD43_nfs_svc (__NR_BSD43 + 158)
#define __NR_BSD43_getdirentries (__NR_BSD43 + 159)
#define __NR_BSD43_statfs (__NR_BSD43 + 160)
#define __NR_BSD43_fstatfs (__NR_BSD43 + 161)
#define __NR_BSD43_unmount (__NR_BSD43 + 162)
#define __NR_BSD43_async_daemon (__NR_BSD43 + 163)
#define __NR_BSD43_nfs_getfh (__NR_BSD43 + 164)
#define __NR_BSD43_getdomainname (__NR_BSD43 + 165)
#define __NR_BSD43_setdomainname (__NR_BSD43 + 166)
#define __NR_BSD43_pcfs_mount (__NR_BSD43 + 167)
#define __NR_BSD43_quotactl (__NR_BSD43 + 168)
#define __NR_BSD43_oldexportfs (__NR_BSD43 + 169)
#define __NR_BSD43_smount (__NR_BSD43 + 170)
#define __NR_BSD43_mipshwconf (__NR_BSD43 + 171)
#define __NR_BSD43_exportfs (__NR_BSD43 + 172)
#define __NR_BSD43_nfsfh_open (__NR_BSD43 + 173)
#define __NR_BSD43_libattach (__NR_BSD43 + 174)
#define __NR_BSD43_libdetach (__NR_BSD43 + 175)
#define __NR_BSD43_accept (__NR_BSD43 + 176)
#define __NR_BSD43_reserved7 (__NR_BSD43 + 177)
#define __NR_BSD43_reserved8 (__NR_BSD43 + 178)
#define __NR_BSD43_recvmsg (__NR_BSD43 + 179)
#define __NR_BSD43_recvfrom (__NR_BSD43 + 180)
#define __NR_BSD43_sendmsg (__NR_BSD43 + 181)
#define __NR_BSD43_getpeername (__NR_BSD43 + 182)
#define __NR_BSD43_getsockname (__NR_BSD43 + 183)
#define __NR_BSD43_aread (__NR_BSD43 + 184)
#define __NR_BSD43_awrite (__NR_BSD43 + 185)
#define __NR_BSD43_listio (__NR_BSD43 + 186)
#define __NR_BSD43_acancel (__NR_BSD43 + 187)
#define __NR_BSD43_astatus (__NR_BSD43 + 188)
#define __NR_BSD43_await (__NR_BSD43 + 189)
#define __NR_BSD43_areadv (__NR_BSD43 + 190)
#define __NR_BSD43_awritev (__NR_BSD43 + 191)

/*
 * POSIX syscalls are in the range from 3000 to 3999
 */
#define __NR_POSIX 3000
#define __NR_POSIX_syscall (__NR_POSIX + 0)
#define __NR_POSIX_exit (__NR_POSIX + 1)
#define __NR_POSIX_fork (__NR_POSIX + 2)
#define __NR_POSIX_read (__NR_POSIX + 3)
#define __NR_POSIX_write (__NR_POSIX + 4)
#define __NR_POSIX_open (__NR_POSIX + 5)
#define __NR_POSIX_close (__NR_POSIX + 6)
#define __NR_POSIX_wait (__NR_POSIX + 7)
#define __NR_POSIX_creat (__NR_POSIX + 8)
#define __NR_POSIX_link (__NR_POSIX + 9)
#define __NR_POSIX_unlink (__NR_POSIX + 10)
#define __NR_POSIX_exec (__NR_POSIX + 11)
#define __NR_POSIX_chdir (__NR_POSIX + 12)
#define __NR_POSIX_gtime (__NR_POSIX + 13)
#define __NR_POSIX_mknod (__NR_POSIX + 14)
#define __NR_POSIX_chmod (__NR_POSIX + 15)
#define __NR_POSIX_chown (__NR_POSIX + 16)
#define __NR_POSIX_sbreak (__NR_POSIX + 17)
#define __NR_POSIX_stat (__NR_POSIX + 18)
#define __NR_POSIX_lseek (__NR_POSIX + 19)
#define __NR_POSIX_getpid (__NR_POSIX + 20)
#define __NR_POSIX_mount (__NR_POSIX + 21)
#define __NR_POSIX_umount (__NR_POSIX + 22)
#define __NR_POSIX_setuid (__NR_POSIX + 23)
#define __NR_POSIX_getuid (__NR_POSIX + 24)
#define __NR_POSIX_stime (__NR_POSIX + 25)
#define __NR_POSIX_ptrace (__NR_POSIX + 26)
#define __NR_POSIX_alarm (__NR_POSIX + 27)
#define __NR_POSIX_fstat (__NR_POSIX + 28)
#define __NR_POSIX_pause (__NR_POSIX + 29)
#define __NR_POSIX_utime (__NR_POSIX + 30)
#define __NR_POSIX_stty (__NR_POSIX + 31)
#define __NR_POSIX_gtty (__NR_POSIX + 32)
#define __NR_POSIX_access (__NR_POSIX + 33)
#define __NR_POSIX_nice (__NR_POSIX + 34)
#define __NR_POSIX_statfs (__NR_POSIX + 35)
#define __NR_POSIX_sync (__NR_POSIX + 36)
#define __NR_POSIX_kill (__NR_POSIX + 37)
#define __NR_POSIX_fstatfs (__NR_POSIX + 38)
#define __NR_POSIX_getpgrp (__NR_POSIX + 39)
#define __NR_POSIX_syssgi (__NR_POSIX + 40)
#define __NR_POSIX_dup (__NR_POSIX + 41)
#define __NR_POSIX_pipe (__NR_POSIX + 42)
#define __NR_POSIX_times (__NR_POSIX + 43)
#define __NR_POSIX_profil (__NR_POSIX + 44)
#define __NR_POSIX_lock (__NR_POSIX + 45)
#define __NR_POSIX_setgid (__NR_POSIX + 46)
#define __NR_POSIX_getgid (__NR_POSIX + 47)
#define __NR_POSIX_sig (__NR_POSIX + 48)
#define __NR_POSIX_msgsys (__NR_POSIX + 49)
#define __NR_POSIX_sysmips (__NR_POSIX + 50)
#define __NR_POSIX_sysacct (__NR_POSIX + 51)
#define __NR_POSIX_shmsys (__NR_POSIX + 52)
#define __NR_POSIX_semsys (__NR_POSIX + 53)
#define __NR_POSIX_ioctl (__NR_POSIX + 54)
#define __NR_POSIX_uadmin (__NR_POSIX + 55)
#define __NR_POSIX_exch (__NR_POSIX + 56)
#define __NR_POSIX_utssys (__NR_POSIX + 57)
#define __NR_POSIX_USG_reserved1 (__NR_POSIX + 58)
#define __NR_POSIX_exece (__NR_POSIX + 59)
#define __NR_POSIX_umask (__NR_POSIX + 60)
#define __NR_POSIX_chroot (__NR_POSIX + 61)
#define __NR_POSIX_fcntl (__NR_POSIX + 62)
#define __NR_POSIX_ulimit (__NR_POSIX + 63)
#define __NR_POSIX_SAFARI4_reserved1 (__NR_POSIX + 64)
#define __NR_POSIX_SAFARI4_reserved2 (__NR_POSIX + 65)
#define __NR_POSIX_SAFARI4_reserved3 (__NR_POSIX + 66)
#define __NR_POSIX_SAFARI4_reserved4 (__NR_POSIX + 67)
#define __NR_POSIX_SAFARI4_reserved5 (__NR_POSIX + 68)
#define __NR_POSIX_SAFARI4_reserved6 (__NR_POSIX + 69)
#define __NR_POSIX_advfs (__NR_POSIX + 70)
#define __NR_POSIX_unadvfs (__NR_POSIX + 71)
#define __NR_POSIX_rmount (__NR_POSIX + 72)
#define __NR_POSIX_rumount (__NR_POSIX + 73)
#define __NR_POSIX_rfstart (__NR_POSIX + 74)
#define __NR_POSIX_reserved1 (__NR_POSIX + 75)
#define __NR_POSIX_rdebug (__NR_POSIX + 76)
#define __NR_POSIX_rfstop (__NR_POSIX + 77)
#define __NR_POSIX_rfsys (__NR_POSIX + 78)
#define __NR_POSIX_rmdir (__NR_POSIX + 79)
#define __NR_POSIX_mkdir (__NR_POSIX + 80)
#define __NR_POSIX_getdents (__NR_POSIX + 81)
#define __NR_POSIX_sginap (__NR_POSIX + 82)
#define __NR_POSIX_sgikopt (__NR_POSIX + 83)
#define __NR_POSIX_sysfs (__NR_POSIX + 84)
#define __NR_POSIX_getmsg (__NR_POSIX + 85)
#define __NR_POSIX_putmsg (__NR_POSIX + 86)
#define __NR_POSIX_poll (__NR_POSIX + 87)
#define __NR_POSIX_sigreturn (__NR_POSIX + 88)
#define __NR_POSIX_accept (__NR_POSIX + 89)
#define __NR_POSIX_bind (__NR_POSIX + 90)
#define __NR_POSIX_connect (__NR_POSIX + 91)
#define __NR_POSIX_gethostid (__NR_POSIX + 92)
#define __NR_POSIX_getpeername (__NR_POSIX + 93)
#define __NR_POSIX_getsockname (__NR_POSIX + 94)
#define __NR_POSIX_getsockopt (__NR_POSIX + 95)
#define __NR_POSIX_listen (__NR_POSIX + 96)
#define __NR_POSIX_recv (__NR_POSIX + 97)
#define __NR_POSIX_recvfrom (__NR_POSIX + 98)
#define __NR_POSIX_recvmsg (__NR_POSIX + 99)
#define __NR_POSIX_select (__NR_POSIX + 100)
#define __NR_POSIX_send (__NR_POSIX + 101)
#define __NR_POSIX_sendmsg (__NR_POSIX + 102)
#define __NR_POSIX_sendto (__NR_POSIX + 103)
#define __NR_POSIX_sethostid (__NR_POSIX + 104)
#define __NR_POSIX_setsockopt (__NR_POSIX + 105)
#define __NR_POSIX_shutdown (__NR_POSIX + 106)
#define __NR_POSIX_socket (__NR_POSIX + 107)
#define __NR_POSIX_gethostname (__NR_POSIX + 108)
#define __NR_POSIX_sethostname (__NR_POSIX + 109)
#define __NR_POSIX_getdomainname (__NR_POSIX + 110)
#define __NR_POSIX_setdomainname (__NR_POSIX + 111)
#define __NR_POSIX_truncate (__NR_POSIX + 112)
#define __NR_POSIX_ftruncate (__NR_POSIX + 113)
#define __NR_POSIX_rename (__NR_POSIX + 114)
#define __NR_POSIX_symlink (__NR_POSIX + 115)
#define __NR_POSIX_readlink (__NR_POSIX + 116)
#define __NR_POSIX_lstat (__NR_POSIX + 117)
#define __NR_POSIX_nfs_mount (__NR_POSIX + 118)
#define __NR_POSIX_nfs_svc (__NR_POSIX + 119)
#define __NR_POSIX_nfs_getfh (__NR_POSIX + 120)
#define __NR_POSIX_async_daemon (__NR_POSIX + 121)
#define __NR_POSIX_exportfs (__NR_POSIX + 122)
#define __NR_POSIX_SGI_setregid (__NR_POSIX + 123)
#define __NR_POSIX_SGI_setreuid (__NR_POSIX + 124)
#define __NR_POSIX_getitimer (__NR_POSIX + 125)
#define __NR_POSIX_setitimer (__NR_POSIX + 126)
#define __NR_POSIX_adjtime (__NR_POSIX + 127)
#define __NR_POSIX_SGI_bsdgettime (__NR_POSIX + 128)
#define __NR_POSIX_SGI_sproc (__NR_POSIX + 129)
#define __NR_POSIX_SGI_prctl (__NR_POSIX + 130)
#define __NR_POSIX_SGI_blkproc (__NR_POSIX + 131)
#define __NR_POSIX_SGI_reserved1 (__NR_POSIX + 132)
#define __NR_POSIX_SGI_sgigsc (__NR_POSIX + 133)
#define __NR_POSIX_SGI_mmap (__NR_POSIX + 134)
#define __NR_POSIX_SGI_munmap (__NR_POSIX + 135)
#define __NR_POSIX_SGI_mprotect (__NR_POSIX + 136)
#define __NR_POSIX_SGI_msync (__NR_POSIX + 137)
#define __NR_POSIX_SGI_madvise (__NR_POSIX + 138)
#define __NR_POSIX_SGI_mpin (__NR_POSIX + 139)
#define __NR_POSIX_SGI_getpagesize (__NR_POSIX + 140)
#define __NR_POSIX_SGI_libattach (__NR_POSIX + 141)
#define __NR_POSIX_SGI_libdetach (__NR_POSIX + 142)
#define __NR_POSIX_SGI_getpgrp (__NR_POSIX + 143)
#define __NR_POSIX_SGI_setpgrp (__NR_POSIX + 144)
#define __NR_POSIX_SGI_reserved2 (__NR_POSIX + 145)
#define __NR_POSIX_SGI_reserved3 (__NR_POSIX + 146)
#define __NR_POSIX_SGI_reserved4 (__NR_POSIX + 147)
#define __NR_POSIX_SGI_reserved5 (__NR_POSIX + 148)
#define __NR_POSIX_SGI_reserved6 (__NR_POSIX + 149)
#define __NR_POSIX_cacheflush (__NR_POSIX + 150)
#define __NR_POSIX_cachectl (__NR_POSIX + 151)
#define __NR_POSIX_fchown (__NR_POSIX + 152)
#define __NR_POSIX_fchmod (__NR_POSIX + 153)
#define __NR_POSIX_wait3 (__NR_POSIX + 154)
#define __NR_POSIX_mmap (__NR_POSIX + 155)
#define __NR_POSIX_munmap (__NR_POSIX + 156)
#define __NR_POSIX_madvise (__NR_POSIX + 157)
#define __NR_POSIX_BSD_getpagesize (__NR_POSIX + 158)
#define __NR_POSIX_setreuid (__NR_POSIX + 159)
#define __NR_POSIX_setregid (__NR_POSIX + 160)
#define __NR_POSIX_setpgid (__NR_POSIX + 161)
#define __NR_POSIX_getgroups (__NR_POSIX + 162)
#define __NR_POSIX_setgroups (__NR_POSIX + 163)
#define __NR_POSIX_gettimeofday (__NR_POSIX + 164)
#define __NR_POSIX_getrusage (__NR_POSIX + 165)
#define __NR_POSIX_getrlimit (__NR_POSIX + 166)
#define __NR_POSIX_setrlimit (__NR_POSIX + 167)
#define __NR_POSIX_waitpid (__NR_POSIX + 168)
#define __NR_POSIX_dup2 (__NR_POSIX + 169)
#define __NR_POSIX_reserved2 (__NR_POSIX + 170)
#define __NR_POSIX_reserved3 (__NR_POSIX + 171)
#define __NR_POSIX_reserved4 (__NR_POSIX + 172)
#define __NR_POSIX_reserved5 (__NR_POSIX + 173)
#define __NR_POSIX_reserved6 (__NR_POSIX + 174)
#define __NR_POSIX_reserved7 (__NR_POSIX + 175)
#define __NR_POSIX_reserved8 (__NR_POSIX + 176)
#define __NR_POSIX_reserved9 (__NR_POSIX + 177)
#define __NR_POSIX_reserved10 (__NR_POSIX + 178)
#define __NR_POSIX_reserved11 (__NR_POSIX + 179)
#define __NR_POSIX_reserved12 (__NR_POSIX + 180)
#define __NR_POSIX_reserved13 (__NR_POSIX + 181)
#define __NR_POSIX_reserved14 (__NR_POSIX + 182)
#define __NR_POSIX_reserved15 (__NR_POSIX + 183)
#define __NR_POSIX_reserved16 (__NR_POSIX + 184)
#define __NR_POSIX_reserved17 (__NR_POSIX + 185)
#define __NR_POSIX_reserved18 (__NR_POSIX + 186)
#define __NR_POSIX_reserved19 (__NR_POSIX + 187)
#define __NR_POSIX_reserved20 (__NR_POSIX + 188)
#define __NR_POSIX_reserved21 (__NR_POSIX + 189)
#define __NR_POSIX_reserved22 (__NR_POSIX + 190)
#define __NR_POSIX_reserved23 (__NR_POSIX + 191)
#define __NR_POSIX_reserved24 (__NR_POSIX + 192)
#define __NR_POSIX_reserved25 (__NR_POSIX + 193)
#define __NR_POSIX_reserved26 (__NR_POSIX + 194)
#define __NR_POSIX_reserved27 (__NR_POSIX + 195)
#define __NR_POSIX_reserved28 (__NR_POSIX + 196)
#define __NR_POSIX_reserved29 (__NR_POSIX + 197)
#define __NR_POSIX_reserved30 (__NR_POSIX + 198)
#define __NR_POSIX_reserved31 (__NR_POSIX + 199)
#define __NR_POSIX_reserved32 (__NR_POSIX + 200)
#define __NR_POSIX_reserved33 (__NR_POSIX + 201)
#define __NR_POSIX_reserved34 (__NR_POSIX + 202)
#define __NR_POSIX_reserved35 (__NR_POSIX + 203)
#define __NR_POSIX_reserved36 (__NR_POSIX + 204)
#define __NR_POSIX_reserved37 (__NR_POSIX + 205)
#define __NR_POSIX_reserved38 (__NR_POSIX + 206)
#define __NR_POSIX_reserved39 (__NR_POSIX + 207)
#define __NR_POSIX_reserved40 (__NR_POSIX + 208)
#define __NR_POSIX_reserved41 (__NR_POSIX + 209)
#define __NR_POSIX_reserved42 (__NR_POSIX + 210)
#define __NR_POSIX_reserved43 (__NR_POSIX + 211)
#define __NR_POSIX_reserved44 (__NR_POSIX + 212)
#define __NR_POSIX_reserved45 (__NR_POSIX + 213)
#define __NR_POSIX_reserved46 (__NR_POSIX + 214)
#define __NR_POSIX_reserved47 (__NR_POSIX + 215)
#define __NR_POSIX_reserved48 (__NR_POSIX + 216)
#define __NR_POSIX_reserved49 (__NR_POSIX + 217)
#define __NR_POSIX_reserved50 (__NR_POSIX + 218)
#define __NR_POSIX_reserved51 (__NR_POSIX + 219)
#define __NR_POSIX_reserved52 (__NR_POSIX + 220)
#define __NR_POSIX_reserved53 (__NR_POSIX + 221)
#define __NR_POSIX_reserved54 (__NR_POSIX + 222)
#define __NR_POSIX_reserved55 (__NR_POSIX + 223)
#define __NR_POSIX_reserved56 (__NR_POSIX + 224)
#define __NR_POSIX_reserved57 (__NR_POSIX + 225)
#define __NR_POSIX_reserved58 (__NR_POSIX + 226)
#define __NR_POSIX_reserved59 (__NR_POSIX + 227)
#define __NR_POSIX_reserved60 (__NR_POSIX + 228)
#define __NR_POSIX_reserved61 (__NR_POSIX + 229)
#define __NR_POSIX_reserved62 (__NR_POSIX + 230)
#define __NR_POSIX_reserved63 (__NR_POSIX + 231)
#define __NR_POSIX_reserved64 (__NR_POSIX + 232)
#define __NR_POSIX_reserved65 (__NR_POSIX + 233)
#define __NR_POSIX_reserved66 (__NR_POSIX + 234)
#define __NR_POSIX_reserved67 (__NR_POSIX + 235)
#define __NR_POSIX_reserved68 (__NR_POSIX + 236)
#define __NR_POSIX_reserved69 (__NR_POSIX + 237)
#define __NR_POSIX_reserved70 (__NR_POSIX + 238)
#define __NR_POSIX_reserved71 (__NR_POSIX + 239)
#define __NR_POSIX_reserved72 (__NR_POSIX + 240)
#define __NR_POSIX_reserved73 (__NR_POSIX + 241)
#define __NR_POSIX_reserved74 (__NR_POSIX + 242)
#define __NR_POSIX_reserved75 (__NR_POSIX + 243)
#define __NR_POSIX_reserved76 (__NR_POSIX + 244)
#define __NR_POSIX_reserved77 (__NR_POSIX + 245)
#define __NR_POSIX_reserved78 (__NR_POSIX + 246)
#define __NR_POSIX_reserved79 (__NR_POSIX + 247)
#define __NR_POSIX_reserved80 (__NR_POSIX + 248)
#define __NR_POSIX_reserved81 (__NR_POSIX + 249)
#define __NR_POSIX_reserved82 (__NR_POSIX + 250)
#define __NR_POSIX_reserved83 (__NR_POSIX + 251)
#define __NR_POSIX_reserved84 (__NR_POSIX + 252)
#define __NR_POSIX_reserved85 (__NR_POSIX + 253)
#define __NR_POSIX_reserved86 (__NR_POSIX + 254)
#define __NR_POSIX_reserved87 (__NR_POSIX + 255)
#define __NR_POSIX_reserved88 (__NR_POSIX + 256)
#define __NR_POSIX_reserved89 (__NR_POSIX + 257)
#define __NR_POSIX_reserved90 (__NR_POSIX + 258)
#define __NR_POSIX_reserved91 (__NR_POSIX + 259)
#define __NR_POSIX_netboot (__NR_POSIX + 260)
#define __NR_POSIX_netunboot (__NR_POSIX + 261)
#define __NR_POSIX_rdump (__NR_POSIX + 262)
#define __NR_POSIX_setsid (__NR_POSIX + 263)
#define __NR_POSIX_getmaxsig (__NR_POSIX + 264)
#define __NR_POSIX_sigpending (__NR_POSIX + 265)
#define __NR_POSIX_sigprocmask (__NR_POSIX + 266)
#define __NR_POSIX_sigsuspend (__NR_POSIX + 267)
#define __NR_POSIX_sigaction (__NR_POSIX + 268)
#define __NR_POSIX_MIPS_reserved1 (__NR_POSIX + 269)
#define __NR_POSIX_MIPS_reserved2 (__NR_POSIX + 270)
#define __NR_POSIX_MIPS_reserved3 (__NR_POSIX + 271)
#define __NR_POSIX_MIPS_reserved4 (__NR_POSIX + 272)
#define __NR_POSIX_MIPS_reserved5 (__NR_POSIX + 273)
#define __NR_POSIX_MIPS_reserved6 (__NR_POSIX + 274)
#define __NR_POSIX_MIPS_reserved7 (__NR_POSIX + 275)
#define __NR_POSIX_MIPS_reserved8 (__NR_POSIX + 276)
#define __NR_POSIX_MIPS_reserved9 (__NR_POSIX + 277)
#define __NR_POSIX_MIPS_reserved10 (__NR_POSIX + 278)
#define __NR_POSIX_MIPS_reserved11 (__NR_POSIX + 279)
#define __NR_POSIX_TANDEM_reserved1 (__NR_POSIX + 280)
#define __NR_POSIX_TANDEM_reserved2 (__NR_POSIX + 281)
#define __NR_POSIX_TANDEM_reserved3 (__NR_POSIX + 282)
#define __NR_POSIX_TANDEM_reserved4 (__NR_POSIX + 283)
#define __NR_POSIX_TANDEM_reserved5 (__NR_POSIX + 284)
#define __NR_POSIX_TANDEM_reserved6 (__NR_POSIX + 285)
#define __NR_POSIX_TANDEM_reserved7 (__NR_POSIX + 286)
#define __NR_POSIX_TANDEM_reserved8 (__NR_POSIX + 287)
#define __NR_POSIX_TANDEM_reserved9 (__NR_POSIX + 288)
#define __NR_POSIX_TANDEM_reserved10 (__NR_POSIX + 289)
#define __NR_POSIX_TANDEM_reserved11 (__NR_POSIX + 290)
#define __NR_POSIX_TANDEM_reserved12 (__NR_POSIX + 291)
#define __NR_POSIX_TANDEM_reserved13 (__NR_POSIX + 292)
#define __NR_POSIX_TANDEM_reserved14 (__NR_POSIX + 293)
#define __NR_POSIX_TANDEM_reserved15 (__NR_POSIX + 294)
#define __NR_POSIX_TANDEM_reserved16 (__NR_POSIX + 295)
#define __NR_POSIX_TANDEM_reserved17 (__NR_POSIX + 296)
#define __NR_POSIX_TANDEM_reserved18 (__NR_POSIX + 297)
#define __NR_POSIX_TANDEM_reserved19 (__NR_POSIX + 298)
#define __NR_POSIX_TANDEM_reserved20 (__NR_POSIX + 299)
#define __NR_POSIX_SGI_reserved7 (__NR_POSIX + 300)
#define __NR_POSIX_SGI_reserved8 (__NR_POSIX + 301)
#define __NR_POSIX_SGI_reserved9 (__NR_POSIX + 302)
#define __NR_POSIX_SGI_reserved10 (__NR_POSIX + 303)
#define __NR_POSIX_SGI_reserved11 (__NR_POSIX + 304)
#define __NR_POSIX_SGI_reserved12 (__NR_POSIX + 305)
#define __NR_POSIX_SGI_reserved13 (__NR_POSIX + 306)
#define __NR_POSIX_SGI_reserved14 (__NR_POSIX + 307)
#define __NR_POSIX_SGI_reserved15 (__NR_POSIX + 308)
#define __NR_POSIX_SGI_reserved16 (__NR_POSIX + 309)
#define __NR_POSIX_SGI_reserved17 (__NR_POSIX + 310)
#define __NR_POSIX_SGI_reserved18 (__NR_POSIX + 311)
#define __NR_POSIX_SGI_reserved19 (__NR_POSIX + 312)
#define __NR_POSIX_SGI_reserved20 (__NR_POSIX + 313)
#define __NR_POSIX_SGI_reserved21 (__NR_POSIX + 314)
#define __NR_POSIX_SGI_reserved22 (__NR_POSIX + 315)
#define __NR_POSIX_SGI_reserved23 (__NR_POSIX + 316)
#define __NR_POSIX_SGI_reserved24 (__NR_POSIX + 317)
#define __NR_POSIX_SGI_reserved25 (__NR_POSIX + 318)
#define __NR_POSIX_SGI_reserved26 (__NR_POSIX + 319)

#endif /* _ASM_RISCOS_SYSCALL_H */
diff --git a/include/asm-parisc/atomic.h b/include/asm-parisc/atomic.h
index 983e9a2b6042..64ebd086c40d 100644
--- a/include/asm-parisc/atomic.h
+++ b/include/asm-parisc/atomic.h
@@ -216,4 +216,5 @@ static __inline__ int atomic_read(const atomic_t *v)
 #define smp_mb__before_atomic_inc() smp_mb()
 #define smp_mb__after_atomic_inc() smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif
diff --git a/include/asm-parisc/mman.h b/include/asm-parisc/mman.h
index e829607eb8bc..736b0abcac05 100644
--- a/include/asm-parisc/mman.h
+++ b/include/asm-parisc/mman.h
@@ -38,6 +38,7 @@
 #define MADV_SPACEAVAIL 5 /* insure that resources are reserved */
 #define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */
 #define MADV_VPS_INHERIT 7 /* Inherit parents page size */
+#define MADV_REMOVE 8 /* remove these pages & resources */
 
 /* The range 12-64 is reserved for page size specification. */
 #define MADV_4K_PAGES 12 /* Use 4K pages */
diff --git a/include/asm-powerpc/atomic.h b/include/asm-powerpc/atomic.h
index ec4b14468959..ae395a0632a6 100644
--- a/include/asm-powerpc/atomic.h
+++ b/include/asm-powerpc/atomic.h
@@ -402,5 +402,6 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 
 #endif /* __powerpc64__ */
 
+#include <asm-generic/atomic.h>
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_ATOMIC_H_ */
diff --git a/include/asm-powerpc/mman.h b/include/asm-powerpc/mman.h
index f5e5342fcac5..a2e34c21b44f 100644
--- a/include/asm-powerpc/mman.h
+++ b/include/asm-powerpc/mman.h
@@ -44,6 +44,7 @@
 #define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
 #define MADV_WILLNEED 0x3 /* pre-fault pages */
 #define MADV_DONTNEED 0x4 /* discard these pages */
+#define MADV_REMOVE 0x5 /* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON MAP_ANONYMOUS
diff --git a/include/asm-ppc/ibm_ocp.h b/include/asm-ppc/ibm_ocp.h
index 9c21de1ff4ed..ddce616f765a 100644
--- a/include/asm-ppc/ibm_ocp.h
+++ b/include/asm-ppc/ibm_ocp.h
@@ -63,7 +63,6 @@ struct ocp_func_emac_data {
 	int wol_irq; /* WOL interrupt */
 	int mdio_idx; /* EMAC idx of MDIO master or -1 */
 	int tah_idx; /* TAH device index or -1 */
-	int jumbo; /* Jumbo frames capable flag */
 	int phy_mode; /* PHY type or configurable mode */
 	u8 mac_addr[6]; /* EMAC mac address */
 	u32 phy_map; /* EMAC phy map */
diff --git a/include/asm-ppc/io.h b/include/asm-ppc/io.h
index 84ac6e258eef..df9cf6ed189d 100644
--- a/include/asm-ppc/io.h
+++ b/include/asm-ppc/io.h
@@ -27,6 +27,8 @@
 
 #if defined(CONFIG_4xx)
 #include <asm/ibm4xx.h>
+#elif defined(CONFIG_PPC_MPC52xx)
+#include <asm/mpc52xx.h>
 #elif defined(CONFIG_8xx)
 #include <asm/mpc8xx.h>
 #elif defined(CONFIG_8260)
diff --git a/include/asm-ppc/mpc52xx.h b/include/asm-ppc/mpc52xx.h
index e5f80c22fbfc..a055e0756b9d 100644
--- a/include/asm-ppc/mpc52xx.h
+++ b/include/asm-ppc/mpc52xx.h
@@ -29,6 +29,17 @@ struct pt_regs;
 #endif /* __ASSEMBLY__ */
 
 
+#ifdef CONFIG_PCI
+#define _IO_BASE	isa_io_base
+#define _ISA_MEM_BASE	isa_mem_base
+#define PCI_DRAM_OFFSET	pci_dram_offset
+#else
+#define _IO_BASE	0
+#define _ISA_MEM_BASE	0
+#define PCI_DRAM_OFFSET	0
+#endif
+
+
 /* ======================================================================== */
 /* PPC Sys devices definition                                               */
 /* ======================================================================== */
@@ -107,7 +118,7 @@ enum ppc_sys_devices {
 #define MPC52xx_SDMA_IRQ_NUM	17
 #define MPC52xx_PERP_IRQ_NUM	23
 
-#define MPC52xx_CRIT_IRQ_BASE	0
+#define MPC52xx_CRIT_IRQ_BASE	1
 #define MPC52xx_MAIN_IRQ_BASE	(MPC52xx_CRIT_IRQ_BASE + MPC52xx_CRIT_IRQ_NUM)
 #define MPC52xx_SDMA_IRQ_BASE	(MPC52xx_MAIN_IRQ_BASE + MPC52xx_MAIN_IRQ_NUM)
 #define MPC52xx_PERP_IRQ_BASE	(MPC52xx_SDMA_IRQ_BASE + MPC52xx_SDMA_IRQ_NUM)
diff --git a/include/asm-s390/atomic.h b/include/asm-s390/atomic.h
index b3bd4f679f72..d82aedf616fe 100644
--- a/include/asm-s390/atomic.h
+++ b/include/asm-s390/atomic.h
@@ -5,7 +5,7 @@
  * include/asm-s390/atomic.h
  *
  * S390 version
- *   Copyright (C) 1999-2003 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *   Copyright (C) 1999-2005 IBM Deutschland Entwicklung GmbH, IBM Corporation
  *   Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com),
  *              Denis Joseph Barrow,
  *              Arnd Bergmann (arndb@de.ibm.com)
@@ -45,59 +45,57 @@ typedef struct {
 #define atomic_read(v) ((v)->counter)
 #define atomic_set(v,i) (((v)->counter) = (i))
 
-static __inline__ void atomic_add(int i, atomic_t * v)
-{
-	__CS_LOOP(v, i, "ar");
-}
 static __inline__ int atomic_add_return(int i, atomic_t * v)
 {
 	return __CS_LOOP(v, i, "ar");
 }
-static __inline__ int atomic_add_negative(int i, atomic_t * v)
-{
-	return __CS_LOOP(v, i, "ar") < 0;
-}
-static __inline__ void atomic_sub(int i, atomic_t * v)
-{
-	__CS_LOOP(v, i, "sr");
-}
+#define atomic_add(_i, _v)		atomic_add_return(_i, _v)
+#define atomic_add_negative(_i, _v)	(atomic_add_return(_i, _v) < 0)
+#define atomic_inc(_v)			atomic_add_return(1, _v)
+#define atomic_inc_return(_v)		atomic_add_return(1, _v)
+#define atomic_inc_and_test(_v)		(atomic_add_return(1, _v) == 0)
+
 static __inline__ int atomic_sub_return(int i, atomic_t * v)
 {
 	return __CS_LOOP(v, i, "sr");
 }
-static __inline__ void atomic_inc(volatile atomic_t * v)
-{
-	__CS_LOOP(v, 1, "ar");
-}
-static __inline__ int atomic_inc_return(volatile atomic_t * v)
-{
-	return __CS_LOOP(v, 1, "ar");
-}
+#define atomic_sub(_i, _v)		atomic_sub_return(_i, _v)
+#define atomic_sub_and_test(_i, _v)	(atomic_sub_return(_i, _v) == 0)
+#define atomic_dec(_v)			atomic_sub_return(1, _v)
+#define atomic_dec_return(_v)		atomic_sub_return(1, _v)
+#define atomic_dec_and_test(_v)		(atomic_sub_return(1, _v) == 0)
 
-static __inline__ int atomic_inc_and_test(volatile atomic_t * v)
-{
-	return __CS_LOOP(v, 1, "ar") == 0;
-}
-static __inline__ void atomic_dec(volatile atomic_t * v)
-{
-	__CS_LOOP(v, 1, "sr");
-}
-static __inline__ int atomic_dec_return(volatile atomic_t * v)
-{
-	return __CS_LOOP(v, 1, "sr");
-}
-static __inline__ int atomic_dec_and_test(volatile atomic_t * v)
-{
-	return __CS_LOOP(v, 1, "sr") == 0;
-}
 static __inline__ void atomic_clear_mask(unsigned long mask, atomic_t * v)
 {
 	__CS_LOOP(v, ~mask, "nr");
 }
+
 static __inline__ void atomic_set_mask(unsigned long mask, atomic_t * v)
 {
 	__CS_LOOP(v, mask, "or");
 }
+
+static __inline__ int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+	__asm__ __volatile__("  cs   %0,%3,0(%2)\n"
+			     : "+d" (old), "=m" (v->counter)
+			     : "a" (v), "d" (new), "m" (v->counter)
+			     : "cc", "memory" );
+	return old;
+}
+
+static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int c, old;
+
+	c = atomic_read(v);
+	while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
+		c = old;
+	return c != u;
+}
+
+#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+
 #undef __CS_LOOP
 
 #ifdef __s390x__
@@ -123,97 +121,67 @@ typedef struct {
 #define atomic64_read(v) ((v)->counter)
 #define atomic64_set(v,i) (((v)->counter) = (i))
 
-static __inline__ void atomic64_add(long long i, atomic64_t * v)
-{
-	__CSG_LOOP(v, i, "agr");
-}
 static __inline__ long long atomic64_add_return(long long i, atomic64_t * v)
 {
 	return __CSG_LOOP(v, i, "agr");
 }
-static __inline__ long long atomic64_add_negative(long long i, atomic64_t * v)
-{
-	return __CSG_LOOP(v, i, "agr") < 0;
-}
-static __inline__ void atomic64_sub(long long i, atomic64_t * v)
-{
-	__CSG_LOOP(v, i, "sgr");
-}
-static __inline__ void atomic64_inc(volatile atomic64_t * v)
-{
-	__CSG_LOOP(v, 1, "agr");
-}
-static __inline__ long long atomic64_inc_return(volatile atomic64_t * v)
-{
-	return __CSG_LOOP(v, 1, "agr");
-}
-static __inline__ long long atomic64_inc_and_test(volatile atomic64_t * v)
-{
-	return __CSG_LOOP(v, 1, "agr") == 0;
-}
-static __inline__ void atomic64_dec(volatile atomic64_t * v)
-{
-	__CSG_LOOP(v, 1, "sgr");
-}
-static __inline__ long long atomic64_dec_return(volatile atomic64_t * v)
-{
-	return __CSG_LOOP(v, 1, "sgr");
-}
-static __inline__ long long atomic64_dec_and_test(volatile atomic64_t * v)
+#define atomic64_add(_i, _v)		atomic64_add_return(_i, _v)
+#define atomic64_add_negative(_i, _v)	(atomic64_add_return(_i, _v) < 0)
+#define atomic64_inc(_v)		atomic64_add_return(1, _v)
+#define atomic64_inc_return(_v)		atomic64_add_return(1, _v)
+#define atomic64_inc_and_test(_v)	(atomic64_add_return(1, _v) == 0)
+
+static __inline__ long long atomic64_sub_return(long long i, atomic64_t * v)
 {
-	return __CSG_LOOP(v, 1, "sgr") == 0;
+	return __CSG_LOOP(v, i, "sgr");
 }
+#define atomic64_sub(_i, _v)		atomic64_sub_return(_i, _v)
+#define atomic64_sub_and_test(_i, _v)	(atomic64_sub_return(_i, _v) == 0)
+#define atomic64_dec(_v)		atomic64_sub_return(1, _v)
+#define atomic64_dec_return(_v)		atomic64_sub_return(1, _v)
+#define atomic64_dec_and_test(_v)	(atomic64_sub_return(1, _v) == 0)
+
 static __inline__ void atomic64_clear_mask(unsigned long mask, atomic64_t * v)
 {
 	__CSG_LOOP(v, ~mask, "ngr");
 }
+
 static __inline__ void atomic64_set_mask(unsigned long mask, atomic64_t * v)
 {
 	__CSG_LOOP(v, mask, "ogr");
 }
 
-#undef __CSG_LOOP
-#endif
-
-/*
-  returns 0  if expected_oldval==value in *v ( swap was successful )
-  returns 1  if unsuccessful.
+static __inline__ long long atomic64_cmpxchg(atomic64_t *v,
+					     long long old, long long new)
+{
+	__asm__ __volatile__("  csg  %0,%3,0(%2)\n"
+			     : "+d" (old), "=m" (v->counter)
+			     : "a" (v), "d" (new), "m" (v->counter)
+			     : "cc", "memory" );
+	return old;
+}
 
-  This is non-portable, use bitops or spinlocks instead!
-*/
-static __inline__ int
-atomic_compare_and_swap(int expected_oldval,int new_val,atomic_t *v)
+static __inline__ int atomic64_add_unless(atomic64_t *v,
+					  long long a, long long u)
 {
-	int retval;
+	long long c, old;
 
-	__asm__ __volatile__(
-		"  lr   %0,%3\n"
-		"  cs   %0,%4,0(%2)\n"
-		"  ipm  %0\n"
-		"  srl  %0,28\n"
-		"0:"
-		: "=&d" (retval), "=m" (v->counter)
-		: "a" (v), "d" (expected_oldval) , "d" (new_val),
-		  "m" (v->counter) : "cc", "memory" );
-	return retval;
+	c = atomic64_read(v);
+	while (c != u && (old = atomic64_cmpxchg(v, c, c + a)) != c)
+		c = old;
+	return c != u;
 }
 
-#define atomic_cmpxchg(v, o, n) (atomic_compare_and_swap((o), (n), &((v)->counter)))
+#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
 
-#define atomic_add_unless(v, a, u) \
-({ \
-	int c, old; \
-	c = atomic_read(v); \
-	while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
-		c = old; \
-	c != (u); \
-})
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
+#undef __CSG_LOOP
+#endif
 
 #define smp_mb__before_atomic_dec()	smp_mb()
 #define smp_mb__after_atomic_dec()	smp_mb()
 #define smp_mb__before_atomic_inc()	smp_mb()
 #define smp_mb__after_atomic_inc()	smp_mb()
 
+#include <asm-generic/atomic.h>
 #endif /* __KERNEL__ */
 #endif /* __ARCH_S390_ATOMIC__ */
diff --git a/include/asm-s390/ccwdev.h b/include/asm-s390/ccwdev.h
index 3eb231af5d51..12456cb2f882 100644
--- a/include/asm-s390/ccwdev.h
+++ b/include/asm-s390/ccwdev.h
@@ -185,8 +185,5 @@ extern struct ccw_device *ccw_device_probe_console(void);
 extern int _ccw_device_get_device_number(struct ccw_device *);
 extern int _ccw_device_get_subchannel_number(struct ccw_device *);
 
-extern struct device *s390_root_dev_register(const char *);
-extern void s390_root_dev_unregister(struct device *);
-
 extern void *ccw_device_get_chp_desc(struct ccw_device *, int);
 #endif /* _S390_CCWDEV_H_ */
diff --git a/include/asm-s390/mman.h b/include/asm-s390/mman.h
index ea86bd12204f..c8d5409b5d56 100644
--- a/include/asm-s390/mman.h
+++ b/include/asm-s390/mman.h
@@ -43,6 +43,7 @@
 #define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
 #define MADV_WILLNEED 0x3 /* pre-fault pages */
 #define MADV_DONTNEED 0x4 /* discard these pages */
+#define MADV_REMOVE 0x5 /* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON MAP_ANONYMOUS
diff --git a/include/asm-s390/qdio.h b/include/asm-s390/qdio.h
index 0ddf0a8ef8de..7bc15f0231db 100644
--- a/include/asm-s390/qdio.h
+++ b/include/asm-s390/qdio.h
@@ -195,12 +195,14 @@ struct qdr {
 /*
  * queue information block (QIB)
  */
 #define QIB_AC_INBOUND_PCI_SUPPORTED 0x80
 #define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40
+#define QIB_RFLAGS_ENABLE_QEBSM 0x80
+
 struct qib {
 	unsigned int qfmt : 8; /* queue format */
 	unsigned int pfmt : 8; /* impl. dep. parameter format */
-	unsigned int res1 : 8; /* reserved */
+	unsigned int rflags : 8; /* QEBSM */
 	unsigned int ac : 8; /* adapter characteristics */
 	unsigned int res2; /* reserved */
 #ifdef QDIO_32_BIT
diff --git a/include/asm-s390/s390_rdev.h b/include/asm-s390/s390_rdev.h
new file mode 100644
index 000000000000..3ad78f2b9c48
--- /dev/null
+++ b/include/asm-s390/s390_rdev.h
@@ -0,0 +1,15 @@
+/*
+ *  include/asm-s390/ccwdev.h
+ *
+ *  Copyright (C) 2002,2005 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *  Author(s): Cornelia Huck <cohuck@de.ibm.com>
+ *             Carsten Otte <cotte@de.ibm.com>
+ *
+ * Interface for s390 root device
+ */
+
+#ifndef _S390_RDEV_H_
+#define _S390_RDEV_H_
+extern struct device *s390_root_dev_register(const char *);
+extern void s390_root_dev_unregister(struct device *);
+#endif /* _S390_RDEV_H_ */
diff --git a/include/asm-s390/uaccess.h b/include/asm-s390/uaccess.h
index 10a619da4761..be104f21c70a 100644
--- a/include/asm-s390/uaccess.h
+++ b/include/asm-s390/uaccess.h
@@ -61,8 +61,10 @@
 #define segment_eq(a,b) ((a).ar4 == (b).ar4)
 
 
-#define __access_ok(addr,size) (1)
-
+static inline int __access_ok(const void *addr, unsigned long size)
+{
+	return 1;
+}
 #define access_ok(type,addr,size) __access_ok(addr,size)
 
 /*
@@ -206,25 +208,25 @@ extern int __put_user_bad(void) __attribute__((noreturn));
 	case 1: {					\
 		unsigned char __x;			\
 		__get_user_asm(__x, ptr, __gu_err);	\
-		(x) = (__typeof__(*(ptr))) __x;		\
+		(x) = *(__typeof__(*(ptr)) *) &__x;	\
 		break;					\
 	};						\
 	case 2: {					\
 		unsigned short __x;			\
 		__get_user_asm(__x, ptr, __gu_err);	\
-		(x) = (__typeof__(*(ptr))) __x;		\
+		(x) = *(__typeof__(*(ptr)) *) &__x;	\
 		break;					\
 	};						\
 	case 4: {					\
 		unsigned int __x;			\
 		__get_user_asm(__x, ptr, __gu_err);	\
-		(x) = (__typeof__(*(ptr))) __x;		\
+		(x) = *(__typeof__(*(ptr)) *) &__x;	\
 		break;					\
 	};						\
 	case 8: {					\
 		unsigned long long __x;			\
 		__get_user_asm(__x, ptr, __gu_err);	\
-		(x) = (__typeof__(*(ptr))) __x;		\
+		(x) = *(__typeof__(*(ptr)) *) &__x;	\
 		break;					\
 	};						\
 	default:					\
diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h
index f97d92691f17..2861cdc243ad 100644
--- a/include/asm-s390/unistd.h
+++ b/include/asm-s390/unistd.h
@@ -539,7 +539,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_SYS_RT_SIGACTION
-# ifdef CONFIG_ARCH_S390_31
+# ifndef CONFIG_64BIT
 #   define __ARCH_WANT_STAT64
 #   define __ARCH_WANT_SYS_TIME
 # endif
diff --git a/include/asm-s390/vtoc.h b/include/asm-s390/vtoc.h
index 41d369f38b0e..d1de5b7ebb0b 100644
--- a/include/asm-s390/vtoc.h
+++ b/include/asm-s390/vtoc.h
@@ -176,4 +176,28 @@ struct vtoc_format7_label
 	struct vtoc_cchhb DS7PTRDS; /* pointer to next FMT7 DSCB */
 } __attribute__ ((packed));
 
+struct vtoc_cms_label {
+	u8 label_id[4];		/* Label identifier */
+	u8 vol_id[6];		/* Volid */
+	u16 version_id;		/* Version identifier */
+	u32 block_size;		/* Disk block size */
+	u32 origin_ptr;		/* Disk origin pointer */
+	u32 usable_count;	/* Number of usable cylinders/blocks */
+	u32 formatted_count;	/* Maximum number of formatted cylinders/
+				 * blocks */
+	u32 block_count;	/* Disk size in CMS blocks */
+	u32 used_count;		/* Number of CMS blocks in use */
+	u32 fst_size;		/* File Status Table (FST) size */
+	u32 fst_count;		/* Number of FSTs per CMS block */
+	u8 format_date[6];	/* Disk FORMAT date */
+	u8 reserved1[2];
+	u32 disk_offset;	/* Disk offset when reserved*/
+	u32 map_block;		/* Allocation Map Block with next hole */
+	u32 hblk_disp;		/* Displacement into HBLK data of next hole */
+	u32 user_disp;		/* Displacement into user part of Allocation
+				 * map */
+	u8 reserved2[4];
+	u8 segment_name[8];	/* Name of shared segment */
+} __attribute__ ((packed));
+
 #endif /* _ASM_S390_VTOC_H */
diff --git a/include/asm-sh/atomic.h b/include/asm-sh/atomic.h
index aabfd334462c..618d8e0de348 100644
--- a/include/asm-sh/atomic.h
+++ b/include/asm-sh/atomic.h
@@ -140,4 +140,5 @@ static __inline__ void atomic_set_mask(unsigned int mask, atomic_t *v)
 #define smp_mb__before_atomic_inc() barrier()
 #define smp_mb__after_atomic_inc() barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ASM_SH_ATOMIC_H */
diff --git a/include/asm-sh/mman.h b/include/asm-sh/mman.h
index 3ebab5f79db7..693bd55a3710 100644
--- a/include/asm-sh/mman.h
+++ b/include/asm-sh/mman.h
@@ -35,6 +35,7 @@
 #define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
 #define MADV_WILLNEED 0x3 /* pre-fault pages */
 #define MADV_DONTNEED 0x4 /* discard these pages */
+#define MADV_REMOVE 0x5 /* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON MAP_ANONYMOUS
diff --git a/include/asm-sh64/atomic.h b/include/asm-sh64/atomic.h
index 927a2bc27b30..f3ce5c0df13a 100644
--- a/include/asm-sh64/atomic.h
+++ b/include/asm-sh64/atomic.h
@@ -152,4 +152,5 @@ static __inline__ void atomic_set_mask(unsigned int mask, atomic_t *v)
 #define smp_mb__before_atomic_inc() barrier()
 #define smp_mb__after_atomic_inc() barrier()
 
+#include <asm-generic/atomic.h>
 #endif /* __ASM_SH64_ATOMIC_H */
diff --git a/include/asm-sparc/atomic.h b/include/asm-sparc/atomic.h
index 62bec7ad271c..accb4967e9d2 100644
--- a/include/asm-sparc/atomic.h
+++ b/include/asm-sparc/atomic.h
@@ -159,4 +159,5 @@ static inline int __atomic24_sub(int i, atomic24_t *v)
 
 #endif /* !(__KERNEL__) */
 
+#include <asm-generic/atomic.h>
 #endif /* !(__ARCH_SPARC_ATOMIC__) */
diff --git a/include/asm-sparc/mman.h b/include/asm-sparc/mman.h
index 138eb81dd70d..98435ad8619e 100644
--- a/include/asm-sparc/mman.h
+++ b/include/asm-sparc/mman.h
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED 0x3 /* pre-fault pages */
 #define MADV_DONTNEED 0x4 /* discard these pages */
 #define MADV_FREE 0x5 /* (Solaris) contents can be freed */
+#define MADV_REMOVE 0x6 /* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON MAP_ANONYMOUS
diff --git a/include/asm-sparc64/atomic.h b/include/asm-sparc64/atomic.h
index 3789fe315992..11f5aa5d108c 100644
--- a/include/asm-sparc64/atomic.h
+++ b/include/asm-sparc64/atomic.h
@@ -96,4 +96,5 @@ extern int atomic64_sub_ret(int, atomic64_t *);
 #define smp_mb__after_atomic_inc() barrier()
 #endif
 
+#include <asm-generic/atomic.h>
 #endif /* !(__ARCH_SPARC64_ATOMIC__) */
diff --git a/include/asm-sparc64/mman.h b/include/asm-sparc64/mman.h
index 01cecf54357b..cb4b6156194d 100644
--- a/include/asm-sparc64/mman.h
+++ b/include/asm-sparc64/mman.h
@@ -54,6 +54,7 @@
 #define MADV_WILLNEED 0x3 /* pre-fault pages */
 #define MADV_DONTNEED 0x4 /* discard these pages */
 #define MADV_FREE 0x5 /* (Solaris) contents can be freed */
+#define MADV_REMOVE 0x6 /* remove these pages & resources */
 
 /* compatibility flags */
 #define MAP_ANON MAP_ANONYMOUS
diff --git a/include/asm-v850/atomic.h b/include/asm-v850/atomic.h
index bede3172ce7f..f5b9ab6f4e70 100644
--- a/include/asm-v850/atomic.h
+++ b/include/asm-v850/atomic.h
@@ -126,4 +126,5 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
126#define smp_mb__before_atomic_inc() barrier() 126#define smp_mb__before_atomic_inc() barrier()
127#define smp_mb__after_atomic_inc() barrier() 127#define smp_mb__after_atomic_inc() barrier()
128 128
129#include <asm-generic/atomic.h>
129#endif /* __V850_ATOMIC_H__ */ 130#endif /* __V850_ATOMIC_H__ */
diff --git a/include/asm-v850/mman.h b/include/asm-v850/mman.h
index e2b90081b56f..edc79965193a 100644
--- a/include/asm-v850/mman.h
+++ b/include/asm-v850/mman.h
@@ -32,6 +32,7 @@
32#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */ 32#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
33#define MADV_WILLNEED 0x3 /* pre-fault pages */ 33#define MADV_WILLNEED 0x3 /* pre-fault pages */
34#define MADV_DONTNEED 0x4 /* discard these pages */ 34#define MADV_DONTNEED 0x4 /* discard these pages */
35#define MADV_REMOVE 0x5 /* remove these pages & resources */
35 36
36/* compatibility flags */ 37/* compatibility flags */
37#define MAP_ANON MAP_ANONYMOUS 38#define MAP_ANON MAP_ANONYMOUS
diff --git a/include/asm-x86_64/atomic.h b/include/asm-x86_64/atomic.h
index 50db9f39274f..72eb071488c7 100644
--- a/include/asm-x86_64/atomic.h
+++ b/include/asm-x86_64/atomic.h
@@ -424,4 +424,5 @@ __asm__ __volatile__(LOCK "orl %0,%1" \
424#define smp_mb__before_atomic_inc() barrier() 424#define smp_mb__before_atomic_inc() barrier()
425#define smp_mb__after_atomic_inc() barrier() 425#define smp_mb__after_atomic_inc() barrier()
426 426
427#include <asm-generic/atomic.h>
427#endif 428#endif
diff --git a/include/asm-x86_64/cacheflush.h b/include/asm-x86_64/cacheflush.h
index b3189fb229d1..d32f7f58752a 100644
--- a/include/asm-x86_64/cacheflush.h
+++ b/include/asm-x86_64/cacheflush.h
@@ -27,4 +27,8 @@ void global_flush_tlb(void);
27int change_page_attr(struct page *page, int numpages, pgprot_t prot); 27int change_page_attr(struct page *page, int numpages, pgprot_t prot);
28int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot); 28int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot);
29 29
30#ifdef CONFIG_DEBUG_RODATA
31void mark_rodata_ro(void);
32#endif
33
30#endif /* _X8664_CACHEFLUSH_H */ 34#endif /* _X8664_CACHEFLUSH_H */
diff --git a/include/asm-x86_64/mman.h b/include/asm-x86_64/mman.h
index 78e60a4fd4ee..d0e97b74f735 100644
--- a/include/asm-x86_64/mman.h
+++ b/include/asm-x86_64/mman.h
@@ -36,6 +36,7 @@
36#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */ 36#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
37#define MADV_WILLNEED 0x3 /* pre-fault pages */ 37#define MADV_WILLNEED 0x3 /* pre-fault pages */
38#define MADV_DONTNEED 0x4 /* discard these pages */ 38#define MADV_DONTNEED 0x4 /* discard these pages */
39#define MADV_REMOVE 0x5 /* remove these pages & resources */
39 40
40/* compatibility flags */ 41/* compatibility flags */
41#define MAP_ANON MAP_ANONYMOUS 42#define MAP_ANON MAP_ANONYMOUS
diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h
index 6f8a17d105ab..10248a9a0582 100644
--- a/include/asm-x86_64/mpspec.h
+++ b/include/asm-x86_64/mpspec.h
@@ -76,7 +76,7 @@ struct mpc_config_bus
76{ 76{
77 unsigned char mpc_type; 77 unsigned char mpc_type;
78 unsigned char mpc_busid; 78 unsigned char mpc_busid;
79 unsigned char mpc_bustype[6] __attribute((packed)); 79 unsigned char mpc_bustype[6];
80}; 80};
81 81
82/* List of Bus Type string values, Intel MP Spec. */ 82/* List of Bus Type string values, Intel MP Spec. */
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index ecf58c7c1650..02888d7a496f 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -122,6 +122,8 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long
122 122
123#define pte_same(a, b) ((a).pte == (b).pte) 123#define pte_same(a, b) ((a).pte == (b).pte)
124 124
125#define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
126
125#define PMD_SIZE (1UL << PMD_SHIFT) 127#define PMD_SIZE (1UL << PMD_SHIFT)
126#define PMD_MASK (~(PMD_SIZE-1)) 128#define PMD_MASK (~(PMD_SIZE-1))
127#define PUD_SIZE (1UL << PUD_SHIFT) 129#define PUD_SIZE (1UL << PUD_SHIFT)
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index 85348e02ad2e..b34cc2ee222b 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -315,6 +315,8 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
315#define local_irq_enable() __asm__ __volatile__("sti": : :"memory") 315#define local_irq_enable() __asm__ __volatile__("sti": : :"memory")
316/* used in the idle loop; sti takes one instruction cycle to complete */ 316/* used in the idle loop; sti takes one instruction cycle to complete */
317#define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory") 317#define safe_halt() __asm__ __volatile__("sti; hlt": : :"memory")
318/* used when interrupts are already enabled or to shut down the processor */
319#define halt() __asm__ __volatile__("hlt": : :"memory")
318 320
319#define irqs_disabled() \ 321#define irqs_disabled() \
320({ \ 322({ \
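The bare halt() complements safe_halt(): it executes hlt without re-enabling interrupts first, which is exactly what a shutdown path wants. A hedged sketch of the loop it enables (the function name is hypothetical):

    /* Hedged sketch: stop the CPU for good with the new halt()
     * macro; with interrupts disabled, hlt never returns. */
    static void example_stop_this_cpu(void)
    {
            local_irq_disable();
            for (;;)
                    halt();
    }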
diff --git a/include/asm-xtensa/atomic.h b/include/asm-xtensa/atomic.h
index 3670cc7695da..e2ce06b101ad 100644
--- a/include/asm-xtensa/atomic.h
+++ b/include/asm-xtensa/atomic.h
@@ -286,6 +286,7 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v)
286#define smp_mb__before_atomic_inc() barrier() 286#define smp_mb__before_atomic_inc() barrier()
287#define smp_mb__after_atomic_inc() barrier() 287#define smp_mb__after_atomic_inc() barrier()
288 288
289#include <asm-generic/atomic.h>
289#endif /* __KERNEL__ */ 290#endif /* __KERNEL__ */
290 291
291#endif /* _XTENSA_ATOMIC_H */ 292#endif /* _XTENSA_ATOMIC_H */
diff --git a/include/asm-xtensa/mman.h b/include/asm-xtensa/mman.h
index 9a95a45df996..082a7504925e 100644
--- a/include/asm-xtensa/mman.h
+++ b/include/asm-xtensa/mman.h
@@ -72,6 +72,7 @@
72#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */ 72#define MADV_SEQUENTIAL 0x2 /* read-ahead aggressively */
73#define MADV_WILLNEED 0x3 /* pre-fault pages */ 73#define MADV_WILLNEED 0x3 /* pre-fault pages */
74#define MADV_DONTNEED 0x4 /* discard these pages */ 74#define MADV_DONTNEED 0x4 /* discard these pages */
75#define MADV_REMOVE 0x5 /* remove these pages & resources */
75 76
76/* compatibility flags */ 77/* compatibility flags */
77#define MAP_ANON MAP_ANONYMOUS 78#define MAP_ANON MAP_ANONYMOUS
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index 26f6ec38577a..a3dae1803f45 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -35,7 +35,6 @@ struct user_key_payload {
35extern struct key_type key_type_user; 35extern struct key_type key_type_user;
36 36
37extern int user_instantiate(struct key *key, const void *data, size_t datalen); 37extern int user_instantiate(struct key *key, const void *data, size_t datalen);
38extern int user_duplicate(struct key *key, const struct key *source);
39extern int user_update(struct key *key, const void *data, size_t datalen); 38extern int user_update(struct key *key, const void *data, size_t datalen);
40extern int user_match(const struct key *key, const void *criterion); 39extern int user_match(const struct key *key, const void *criterion);
41extern void user_destroy(struct key *key); 40extern void user_destroy(struct key *key);
diff --git a/include/linux/ata.h b/include/linux/ata.h
index d2873b732bb1..94f77cce27fa 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -129,6 +129,7 @@ enum {
129 ATA_CMD_READ_EXT = 0x25, 129 ATA_CMD_READ_EXT = 0x25,
130 ATA_CMD_WRITE = 0xCA, 130 ATA_CMD_WRITE = 0xCA,
131 ATA_CMD_WRITE_EXT = 0x35, 131 ATA_CMD_WRITE_EXT = 0x35,
132 ATA_CMD_WRITE_FUA_EXT = 0x3D,
132 ATA_CMD_PIO_READ = 0x20, 133 ATA_CMD_PIO_READ = 0x20,
133 ATA_CMD_PIO_READ_EXT = 0x24, 134 ATA_CMD_PIO_READ_EXT = 0x24,
134 ATA_CMD_PIO_WRITE = 0x30, 135 ATA_CMD_PIO_WRITE = 0x30,
@@ -137,10 +138,13 @@ enum {
137 ATA_CMD_READ_MULTI_EXT = 0x29, 138 ATA_CMD_READ_MULTI_EXT = 0x29,
138 ATA_CMD_WRITE_MULTI = 0xC5, 139 ATA_CMD_WRITE_MULTI = 0xC5,
139 ATA_CMD_WRITE_MULTI_EXT = 0x39, 140 ATA_CMD_WRITE_MULTI_EXT = 0x39,
141 ATA_CMD_WRITE_MULTI_FUA_EXT = 0xCE,
140 ATA_CMD_SET_FEATURES = 0xEF, 142 ATA_CMD_SET_FEATURES = 0xEF,
141 ATA_CMD_PACKET = 0xA0, 143 ATA_CMD_PACKET = 0xA0,
142 ATA_CMD_VERIFY = 0x40, 144 ATA_CMD_VERIFY = 0x40,
143 ATA_CMD_VERIFY_EXT = 0x42, 145 ATA_CMD_VERIFY_EXT = 0x42,
146 ATA_CMD_STANDBYNOW1 = 0xE0,
147 ATA_CMD_IDLEIMMEDIATE = 0xE1,
144 ATA_CMD_INIT_DEV_PARAMS = 0x91, 148 ATA_CMD_INIT_DEV_PARAMS = 0x91,
145 149
146 /* SETFEATURES stuff */ 150 /* SETFEATURES stuff */
@@ -192,6 +196,7 @@ enum {
192 ATA_TFLAG_DEVICE = (1 << 2), /* enable r/w to device reg */ 196 ATA_TFLAG_DEVICE = (1 << 2), /* enable r/w to device reg */
193 ATA_TFLAG_WRITE = (1 << 3), /* data dir: host->dev==1 (write) */ 197 ATA_TFLAG_WRITE = (1 << 3), /* data dir: host->dev==1 (write) */
194 ATA_TFLAG_LBA = (1 << 4), /* enable LBA */ 198 ATA_TFLAG_LBA = (1 << 4), /* enable LBA */
199 ATA_TFLAG_FUA = (1 << 5), /* enable FUA */
195}; 200};
196 201
197enum ata_tf_protocols { 202enum ata_tf_protocols {
@@ -245,7 +250,8 @@ struct ata_taskfile {
245#define ata_id_is_sata(id) ((id)[93] == 0) 250#define ata_id_is_sata(id) ((id)[93] == 0)
246#define ata_id_rahead_enabled(id) ((id)[85] & (1 << 6)) 251#define ata_id_rahead_enabled(id) ((id)[85] & (1 << 6))
247#define ata_id_wcache_enabled(id) ((id)[85] & (1 << 5)) 252#define ata_id_wcache_enabled(id) ((id)[85] & (1 << 5))
248#define ata_id_has_flush(id) ((id)[83] & (1 << 12)) 253#define ata_id_has_fua(id) ((id)[84] & (1 << 6))
254#define ata_id_has_flush(id) ((id)[83] & (1 << 12))
249#define ata_id_has_flush_ext(id) ((id)[83] & (1 << 13)) 255#define ata_id_has_flush_ext(id) ((id)[83] & (1 << 13))
250#define ata_id_has_lba48(id) ((id)[83] & (1 << 10)) 256#define ata_id_has_lba48(id) ((id)[83] & (1 << 10))
251#define ata_id_has_wcache(id) ((id)[82] & (1 << 5)) 257#define ata_id_has_wcache(id) ((id)[82] & (1 << 5))
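Taken together, the new ATA definitions give a driver everything needed for FUA writes: ata_id_has_fua() probes IDENTIFY word 84 bit 6, ATA_TFLAG_FUA marks the taskfile, and the two *_FUA_EXT opcodes carry the data. A hedged sketch of opcode selection (the helper and its arguments are hypothetical, not libata API):

    /* Hedged sketch: choose a write opcode given the new FUA
     * definitions; the FUA write commands are LBA48-only. */
    static u8 example_write_opcode(const u16 *id, int lba48, int multi,
                                   int want_fua)
    {
            if (want_fua && ata_id_has_fua(id))
                    return multi ? ATA_CMD_WRITE_MULTI_FUA_EXT
                                 : ATA_CMD_WRITE_FUA_EXT;
            if (lba48)
                    return multi ? ATA_CMD_WRITE_MULTI_EXT
                                 : ATA_CMD_WRITE_EXT;
            return multi ? ATA_CMD_WRITE_MULTI : ATA_CMD_WRITE;
    }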
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a18500d196e1..fb0985377421 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -102,7 +102,7 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc);
102void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); 102void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
103 103
104struct request; 104struct request;
105typedef void (rq_end_io_fn)(struct request *); 105typedef void (rq_end_io_fn)(struct request *, int);
106 106
107struct request_list { 107struct request_list {
108 int count[2]; 108 int count[2];
@@ -207,6 +207,7 @@ enum rq_flag_bits {
207 __REQ_SORTED, /* elevator knows about this request */ 207 __REQ_SORTED, /* elevator knows about this request */
208 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ 208 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
209 __REQ_HARDBARRIER, /* may not be passed by drive either */ 209 __REQ_HARDBARRIER, /* may not be passed by drive either */
210 __REQ_FUA, /* forced unit access */
210 __REQ_CMD, /* is a regular fs rw request */ 211 __REQ_CMD, /* is a regular fs rw request */
211 __REQ_NOMERGE, /* don't touch this for merging */ 212 __REQ_NOMERGE, /* don't touch this for merging */
212 __REQ_STARTED, /* drive already may have started this one */ 213 __REQ_STARTED, /* drive already may have started this one */
@@ -230,9 +231,7 @@ enum rq_flag_bits {
230 __REQ_PM_SUSPEND, /* suspend request */ 231 __REQ_PM_SUSPEND, /* suspend request */
231 __REQ_PM_RESUME, /* resume request */ 232 __REQ_PM_RESUME, /* resume request */
232 __REQ_PM_SHUTDOWN, /* shutdown request */ 233 __REQ_PM_SHUTDOWN, /* shutdown request */
233 __REQ_BAR_PREFLUSH, /* barrier pre-flush done */ 234 __REQ_ORDERED_COLOR, /* is before or after barrier */
234 __REQ_BAR_POSTFLUSH, /* barrier post-flush */
235 __REQ_BAR_FLUSH, /* rq is the flush request */
236 __REQ_NR_BITS, /* stops here */ 235 __REQ_NR_BITS, /* stops here */
237}; 236};
238 237
@@ -241,6 +240,7 @@ enum rq_flag_bits {
241#define REQ_SORTED (1 << __REQ_SORTED) 240#define REQ_SORTED (1 << __REQ_SORTED)
242#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) 241#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
243#define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER) 242#define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER)
243#define REQ_FUA (1 << __REQ_FUA)
244#define REQ_CMD (1 << __REQ_CMD) 244#define REQ_CMD (1 << __REQ_CMD)
245#define REQ_NOMERGE (1 << __REQ_NOMERGE) 245#define REQ_NOMERGE (1 << __REQ_NOMERGE)
246#define REQ_STARTED (1 << __REQ_STARTED) 246#define REQ_STARTED (1 << __REQ_STARTED)
@@ -260,9 +260,7 @@ enum rq_flag_bits {
260#define REQ_PM_SUSPEND (1 << __REQ_PM_SUSPEND) 260#define REQ_PM_SUSPEND (1 << __REQ_PM_SUSPEND)
261#define REQ_PM_RESUME (1 << __REQ_PM_RESUME) 261#define REQ_PM_RESUME (1 << __REQ_PM_RESUME)
262#define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN) 262#define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN)
263#define REQ_BAR_PREFLUSH (1 << __REQ_BAR_PREFLUSH) 263#define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
264#define REQ_BAR_POSTFLUSH (1 << __REQ_BAR_POSTFLUSH)
265#define REQ_BAR_FLUSH (1 << __REQ_BAR_FLUSH)
266 264
267/* 265/*
268 * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME 266 * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME
@@ -292,8 +290,7 @@ struct bio_vec;
292typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *); 290typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
293typedef void (activity_fn) (void *data, int rw); 291typedef void (activity_fn) (void *data, int rw);
294typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *); 292typedef int (issue_flush_fn) (request_queue_t *, struct gendisk *, sector_t *);
295typedef int (prepare_flush_fn) (request_queue_t *, struct request *); 293typedef void (prepare_flush_fn) (request_queue_t *, struct request *);
296typedef void (end_flush_fn) (request_queue_t *, struct request *);
297 294
298enum blk_queue_state { 295enum blk_queue_state {
299 Queue_down, 296 Queue_down,
@@ -335,7 +332,6 @@ struct request_queue
335 activity_fn *activity_fn; 332 activity_fn *activity_fn;
336 issue_flush_fn *issue_flush_fn; 333 issue_flush_fn *issue_flush_fn;
337 prepare_flush_fn *prepare_flush_fn; 334 prepare_flush_fn *prepare_flush_fn;
338 end_flush_fn *end_flush_fn;
339 335
340 /* 336 /*
341 * Dispatch queue sorting 337 * Dispatch queue sorting
@@ -420,14 +416,11 @@ struct request_queue
420 /* 416 /*
421 * reserved for flush operations 417 * reserved for flush operations
422 */ 418 */
423 struct request *flush_rq; 419 unsigned int ordered, next_ordered, ordseq;
424 unsigned char ordered; 420 int orderr, ordcolor;
425}; 421 struct request pre_flush_rq, bar_rq, post_flush_rq;
426 422 struct request *orig_bar_rq;
427enum { 423 unsigned int bi_size;
428 QUEUE_ORDERED_NONE,
429 QUEUE_ORDERED_TAG,
430 QUEUE_ORDERED_FLUSH,
431}; 424};
432 425
433#define RQ_INACTIVE (-1) 426#define RQ_INACTIVE (-1)
@@ -445,12 +438,51 @@ enum {
445#define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ 438#define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */
446#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ 439#define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
447#define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ 440#define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
448#define QUEUE_FLAG_FLUSH 9 /* doing barrier flush sequence */ 441
442enum {
443 /*
444 * Hardbarrier is supported with one of the following methods.
445 *
446 * NONE : hardbarrier unsupported
447 * DRAIN : ordering by draining is enough
448 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
449 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
450 * TAG : ordering by tag is enough
451 * TAG_FLUSH : ordering by tag w/ pre and post flushes
452 * TAG_FUA : ordering by tag w/ pre flush and FUA write
453 */
454 QUEUE_ORDERED_NONE = 0x00,
455 QUEUE_ORDERED_DRAIN = 0x01,
456 QUEUE_ORDERED_TAG = 0x02,
457
458 QUEUE_ORDERED_PREFLUSH = 0x10,
459 QUEUE_ORDERED_POSTFLUSH = 0x20,
460 QUEUE_ORDERED_FUA = 0x40,
461
462 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
463 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
464 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
465 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
466 QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
467 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
468 QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
469 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
470
471 /*
472 * Ordered operation sequence
473 */
474 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
475 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
476 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
477 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
478 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
479 QUEUE_ORDSEQ_DONE = 0x20,
480};
449 481
450#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) 482#define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
451#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) 483#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
452#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 484#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
453#define blk_queue_flushing(q) test_bit(QUEUE_FLAG_FLUSH, &(q)->queue_flags) 485#define blk_queue_flushing(q) ((q)->ordseq)
454 486
455#define blk_fs_request(rq) ((rq)->flags & REQ_CMD) 487#define blk_fs_request(rq) ((rq)->flags & REQ_CMD)
456#define blk_pc_request(rq) ((rq)->flags & REQ_BLOCK_PC) 488#define blk_pc_request(rq) ((rq)->flags & REQ_BLOCK_PC)
@@ -466,8 +498,7 @@ enum {
466 498
467#define blk_sorted_rq(rq) ((rq)->flags & REQ_SORTED) 499#define blk_sorted_rq(rq) ((rq)->flags & REQ_SORTED)
468#define blk_barrier_rq(rq) ((rq)->flags & REQ_HARDBARRIER) 500#define blk_barrier_rq(rq) ((rq)->flags & REQ_HARDBARRIER)
469#define blk_barrier_preflush(rq) ((rq)->flags & REQ_BAR_PREFLUSH) 501#define blk_fua_rq(rq) ((rq)->flags & REQ_FUA)
470#define blk_barrier_postflush(rq) ((rq)->flags & REQ_BAR_POSTFLUSH)
471 502
472#define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) 503#define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
473 504
@@ -560,7 +591,7 @@ extern void register_disk(struct gendisk *dev);
560extern void generic_make_request(struct bio *bio); 591extern void generic_make_request(struct bio *bio);
561extern void blk_put_request(struct request *); 592extern void blk_put_request(struct request *);
562extern void __blk_put_request(request_queue_t *, struct request *); 593extern void __blk_put_request(request_queue_t *, struct request *);
563extern void blk_end_sync_rq(struct request *rq); 594extern void blk_end_sync_rq(struct request *rq, int error);
564extern void blk_attempt_remerge(request_queue_t *, struct request *); 595extern void blk_attempt_remerge(request_queue_t *, struct request *);
565extern struct request *blk_get_request(request_queue_t *, int, gfp_t); 596extern struct request *blk_get_request(request_queue_t *, int, gfp_t);
566extern void blk_insert_request(request_queue_t *, struct request *, int, void *); 597extern void blk_insert_request(request_queue_t *, struct request *, int, void *);
@@ -582,8 +613,7 @@ extern int blk_rq_map_user_iov(request_queue_t *, struct request *, struct sg_io
582extern int blk_execute_rq(request_queue_t *, struct gendisk *, 613extern int blk_execute_rq(request_queue_t *, struct gendisk *,
583 struct request *, int); 614 struct request *, int);
584extern void blk_execute_rq_nowait(request_queue_t *, struct gendisk *, 615extern void blk_execute_rq_nowait(request_queue_t *, struct gendisk *,
585 struct request *, int, 616 struct request *, int, rq_end_io_fn *);
586 void (*done)(struct request *));
587 617
588static inline request_queue_t *bdev_get_queue(struct block_device *bdev) 618static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
589{ 619{
@@ -614,7 +644,7 @@ static inline void blk_run_address_space(struct address_space *mapping)
614 */ 644 */
615extern int end_that_request_first(struct request *, int, int); 645extern int end_that_request_first(struct request *, int, int);
616extern int end_that_request_chunk(struct request *, int, int); 646extern int end_that_request_chunk(struct request *, int, int);
617extern void end_that_request_last(struct request *); 647extern void end_that_request_last(struct request *, int);
618extern void end_request(struct request *req, int uptodate); 648extern void end_request(struct request *req, int uptodate);
619 649
620/* 650/*
@@ -665,11 +695,12 @@ extern void blk_queue_prep_rq(request_queue_t *, prep_rq_fn *pfn);
665extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *); 695extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *);
666extern void blk_queue_dma_alignment(request_queue_t *, int); 696extern void blk_queue_dma_alignment(request_queue_t *, int);
667extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 697extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
668extern void blk_queue_ordered(request_queue_t *, int); 698extern int blk_queue_ordered(request_queue_t *, unsigned, prepare_flush_fn *);
669extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *); 699extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
670extern struct request *blk_start_pre_flush(request_queue_t *,struct request *); 700extern int blk_do_ordered(request_queue_t *, struct request **);
671extern int blk_complete_barrier_rq(request_queue_t *, struct request *, int); 701extern unsigned blk_ordered_cur_seq(request_queue_t *);
672extern int blk_complete_barrier_rq_locked(request_queue_t *, struct request *, int); 702extern unsigned blk_ordered_req_seq(struct request *);
703extern void blk_ordered_complete_seq(request_queue_t *, unsigned, int);
673 704
674extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); 705extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
675extern void blk_dump_rq_flags(struct request *, char *); 706extern void blk_dump_rq_flags(struct request *, char *);
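Under the reworked interface a driver no longer implements its own flush state machine: it declares one of the QUEUE_ORDERED_* modes up front and, if pre/post flushes are involved, supplies a prepare_flush_fn that turns a request into a cache-flush command; blk_do_ordered() then walks the QUEUE_ORDSEQ_* sequence. A hedged registration sketch (the mydev_* names are hypothetical):

    /* Hedged sketch: declare barrier support with the new
     * blk_queue_ordered() signature. prepare_flush_fn now returns
     * void, and blk_queue_ordered() returns an error code. */
    static void mydev_prepare_flush(request_queue_t *q, struct request *rq)
    {
            /* Initialize rq as a device cache-flush command here. */
    }

    static int mydev_init_queue(request_queue_t *q)
    {
            /* Order by draining, with explicit pre- and post-flushes. */
            return blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
                                     mydev_prepare_flush);
    }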
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3b03b0b868dd..993da8cc9706 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -43,50 +43,38 @@ typedef struct bootmem_data {
43extern unsigned long __init bootmem_bootmap_pages (unsigned long); 43extern unsigned long __init bootmem_bootmap_pages (unsigned long);
44extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend); 44extern unsigned long __init init_bootmem (unsigned long addr, unsigned long memend);
45extern void __init free_bootmem (unsigned long addr, unsigned long size); 45extern void __init free_bootmem (unsigned long addr, unsigned long size);
46extern void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, unsigned long limit); 46extern void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal);
47extern void * __init __alloc_bootmem_low(unsigned long size,
48 unsigned long align,
49 unsigned long goal);
50extern void * __init __alloc_bootmem_low_node(pg_data_t *pgdat,
51 unsigned long size,
52 unsigned long align,
53 unsigned long goal);
47#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 54#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
48extern void __init reserve_bootmem (unsigned long addr, unsigned long size); 55extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
49#define alloc_bootmem(x) \ 56#define alloc_bootmem(x) \
50 __alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 57 __alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
51#define alloc_bootmem_low(x) \ 58#define alloc_bootmem_low(x) \
52 __alloc_bootmem((x), SMP_CACHE_BYTES, 0) 59 __alloc_bootmem_low((x), SMP_CACHE_BYTES, 0)
53#define alloc_bootmem_pages(x) \ 60#define alloc_bootmem_pages(x) \
54 __alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 61 __alloc_bootmem((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
55#define alloc_bootmem_low_pages(x) \ 62#define alloc_bootmem_low_pages(x) \
56 __alloc_bootmem((x), PAGE_SIZE, 0) 63 __alloc_bootmem_low((x), PAGE_SIZE, 0)
57
58#define alloc_bootmem_limit(x, limit) \
59 __alloc_bootmem_limit((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit))
60#define alloc_bootmem_low_limit(x, limit) \
61 __alloc_bootmem_limit((x), SMP_CACHE_BYTES, 0, (limit))
62#define alloc_bootmem_pages_limit(x, limit) \
63 __alloc_bootmem_limit((x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit))
64#define alloc_bootmem_low_pages_limit(x, limit) \
65 __alloc_bootmem_limit((x), PAGE_SIZE, 0, (limit))
66
67#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 64#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
68extern unsigned long __init free_all_bootmem (void); 65extern unsigned long __init free_all_bootmem (void);
69 66extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal);
70extern unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn); 67extern unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned long freepfn, unsigned long startpfn, unsigned long endpfn);
71extern void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size); 68extern void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size);
72extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size); 69extern void __init free_bootmem_node (pg_data_t *pgdat, unsigned long addr, unsigned long size);
73extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat); 70extern unsigned long __init free_all_bootmem_node (pg_data_t *pgdat);
74extern void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit);
75#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 71#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
76#define alloc_bootmem_node(pgdat, x) \ 72#define alloc_bootmem_node(pgdat, x) \
77 __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) 73 __alloc_bootmem_node((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
78#define alloc_bootmem_pages_node(pgdat, x) \ 74#define alloc_bootmem_pages_node(pgdat, x) \
79 __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS)) 75 __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
80#define alloc_bootmem_low_pages_node(pgdat, x) \ 76#define alloc_bootmem_low_pages_node(pgdat, x) \
81 __alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0) 77 __alloc_bootmem_low_node((pgdat), (x), PAGE_SIZE, 0)
82
83#define alloc_bootmem_node_limit(pgdat, x, limit) \
84 __alloc_bootmem_node_limit((pgdat), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS), (limit))
85#define alloc_bootmem_pages_node_limit(pgdat, x, limit) \
86 __alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS), (limit))
87#define alloc_bootmem_low_pages_node_limit(pgdat, x, limit) \
88 __alloc_bootmem_node_limit((pgdat), (x), PAGE_SIZE, 0, (limit))
89
90#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 78#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
91 79
92#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP 80#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
@@ -123,15 +111,5 @@ extern void *__init alloc_large_system_hash(const char *tablename,
123#endif 111#endif
124extern int __initdata hashdist; /* Distribute hashes across NUMA nodes? */ 112extern int __initdata hashdist; /* Distribute hashes across NUMA nodes? */
125 113
126static inline void *__alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
127{
128 return __alloc_bootmem_limit(size, align, goal, 0);
129}
130
131static inline void *__alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align,
132 unsigned long goal)
133{
134 return __alloc_bootmem_node_limit(pgdat, size, align, goal, 0);
135}
136 114
137#endif /* _LINUX_BOOTMEM_H */ 115#endif /* _LINUX_BOOTMEM_H */
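The net effect for callers: the *_limit variants disappear, and "low" allocations get their own entry points instead of being spelled as a goal of 0. A hedged sketch of the two cases (the function name is hypothetical):

    /* Hedged sketch: boot-time allocations after the API cleanup.
     * alloc_bootmem_low() now expands to __alloc_bootmem_low()
     * rather than __alloc_bootmem() with a zero goal. */
    void __init example_bootmem_setup(void)
    {
            void *any = alloc_bootmem(4096);     /* goal above MAX_DMA_ADDRESS */
            void *low = alloc_bootmem_low(4096); /* goal 0, e.g. for ISA DMA */

            (void)any;
            (void)low;
    }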
diff --git a/include/linux/configfs.h b/include/linux/configfs.h
new file mode 100644
index 000000000000..acffb8c9073a
--- /dev/null
+++ b/include/linux/configfs.h
@@ -0,0 +1,205 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * configfs.h - definitions for the device driver filesystem
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public
17 * License along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Based on sysfs:
22 * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
23 *
24 * Based on kobject.h:
25 * Copyright (c) 2002-2003 Patrick Mochel
26 * Copyright (c) 2002-2003 Open Source Development Labs
27 *
28 * configfs Copyright (C) 2005 Oracle. All rights reserved.
29 *
30 * Please read Documentation/filesystems/configfs.txt before using the
31 * configfs interface, ESPECIALLY the parts about reference counts and
32 * item destructors.
33 */
34
35#ifndef _CONFIGFS_H_
36#define _CONFIGFS_H_
37
38#ifdef __KERNEL__
39
40#include <linux/types.h>
41#include <linux/list.h>
42#include <linux/kref.h>
43
44#include <asm/atomic.h>
45#include <asm/semaphore.h>
46
47#define CONFIGFS_ITEM_NAME_LEN 20
48
49struct module;
50
51struct configfs_item_operations;
52struct configfs_group_operations;
53struct configfs_attribute;
54struct configfs_subsystem;
55
56struct config_item {
57 char *ci_name;
58 char ci_namebuf[CONFIGFS_ITEM_NAME_LEN];
59 struct kref ci_kref;
60 struct list_head ci_entry;
61 struct config_item *ci_parent;
62 struct config_group *ci_group;
63 struct config_item_type *ci_type;
64 struct dentry *ci_dentry;
65};
66
67extern int config_item_set_name(struct config_item *, const char *, ...);
68
69static inline char *config_item_name(struct config_item * item)
70{
71 return item->ci_name;
72}
73
74extern void config_item_init(struct config_item *);
75extern void config_item_init_type_name(struct config_item *item,
76 const char *name,
77 struct config_item_type *type);
78extern void config_item_cleanup(struct config_item *);
79
80extern struct config_item * config_item_get(struct config_item *);
81extern void config_item_put(struct config_item *);
82
83struct config_item_type {
84 struct module *ct_owner;
85 struct configfs_item_operations *ct_item_ops;
86 struct configfs_group_operations *ct_group_ops;
87 struct configfs_attribute **ct_attrs;
88};
89
90
91/**
92 * group - a group of config_items of a specific type, belonging
93 * to a specific subsystem.
94 */
95
96struct config_group {
97 struct config_item cg_item;
98 struct list_head cg_children;
99 struct configfs_subsystem *cg_subsys;
100 struct config_group **default_groups;
101};
102
103
104extern void config_group_init(struct config_group *group);
105extern void config_group_init_type_name(struct config_group *group,
106 const char *name,
107 struct config_item_type *type);
108
109
110static inline struct config_group *to_config_group(struct config_item *item)
111{
112 return item ? container_of(item,struct config_group,cg_item) : NULL;
113}
114
115static inline struct config_group *config_group_get(struct config_group *group)
116{
117 return group ? to_config_group(config_item_get(&group->cg_item)) : NULL;
118}
119
120static inline void config_group_put(struct config_group *group)
121{
122 config_item_put(&group->cg_item);
123}
124
125extern struct config_item *config_group_find_obj(struct config_group *, const char *);
126
127
128struct configfs_attribute {
129 char *ca_name;
130 struct module *ca_owner;
131 mode_t ca_mode;
132};
133
134
135/*
136 * If allow_link() exists, the item can symlink(2) out to other
137 * items. If the item is a group, it may support mkdir(2).
138 * Groups supply one of make_group() and make_item(). If the
139 * group supports make_group(), one can create group children. If it
140 * supports make_item(), one can create config_item children. If it has
141 * default_groups on group->default_groups, it has automatically created
142 * group children. default_groups may coexist alongside make_group() or
143 * make_item(), but if the group wishes to have only default_groups
144 * children (disallowing mkdir(2)), it need not provide either function.
145 * If the group has commit(), it supports pending and committed (active)
146 * items.
147 */
148struct configfs_item_operations {
149 void (*release)(struct config_item *);
150 ssize_t (*show_attribute)(struct config_item *, struct configfs_attribute *,char *);
151 ssize_t (*store_attribute)(struct config_item *,struct configfs_attribute *,const char *, size_t);
152 int (*allow_link)(struct config_item *src, struct config_item *target);
153 int (*drop_link)(struct config_item *src, struct config_item *target);
154};
155
156struct configfs_group_operations {
157 struct config_item *(*make_item)(struct config_group *group, const char *name);
158 struct config_group *(*make_group)(struct config_group *group, const char *name);
159 int (*commit_item)(struct config_item *item);
160 void (*drop_item)(struct config_group *group, struct config_item *item);
161};
162
163
164
165/**
166 * Use these macros to make defining attributes easier. See include/linux/device.h
167 * for examples.
168 */
169
170#if 0
171#define __ATTR(_name,_mode,_show,_store) { \
172 .attr = {.ca_name = __stringify(_name), .ca_mode = _mode, .ca_owner = THIS_MODULE }, \
173 .show = _show, \
174 .store = _store, \
175}
176
177#define __ATTR_RO(_name) { \
178 .attr = { .ca_name = __stringify(_name), .ca_mode = 0444, .ca_owner = THIS_MODULE }, \
179 .show = _name##_show, \
180}
181
182#define __ATTR_NULL { .attr = { .name = NULL } }
183
184#define attr_name(_attr) (_attr).attr.name
185#endif
186
187
188struct configfs_subsystem {
189 struct config_group su_group;
190 struct semaphore su_sem;
191};
192
193static inline struct configfs_subsystem *to_configfs_subsystem(struct config_group *group)
194{
195 return group ?
196 container_of(group, struct configfs_subsystem, su_group) :
197 NULL;
198}
199
200int configfs_register_subsystem(struct configfs_subsystem *subsys);
201void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
202
203#endif /* __KERNEL__ */
204
205#endif /* _CONFIGFS_H_ */
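In practice a client embeds a config_group in a configfs_subsystem, initializes it, and registers; mkdir(2) and attribute I/O then flow through the ct_group_ops and ct_item_ops above. A hedged minimal sketch (all example_* names are hypothetical; the full version ships as Documentation/filesystems/configfs/configfs_example.c):

    /* Hedged sketch: the smallest registerable configfs subsystem,
     * with no children and no attributes. Assumes linux/module.h
     * and linux/configfs.h are included. */
    static struct config_item_type example_type = {
            .ct_owner = THIS_MODULE,
    };

    static struct configfs_subsystem example_subsys = {
            .su_group = {
                    .cg_item = {
                            .ci_namebuf = "example",
                            .ci_type    = &example_type,
                    },
            },
    };

    static int __init example_init(void)
    {
            config_group_init(&example_subsys.su_group);
            init_MUTEX(&example_subsys.su_sem);
            return configfs_register_subsystem(&example_subsys);
    }

    static void __exit example_exit(void)
    {
            configfs_unregister_subsystem(&example_subsys);
    }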
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index f5eb6b6cd109..fa75ba0d635e 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -272,9 +272,9 @@ typedef char ioctl_struct[308];
272#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl) 272#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl)
273 273
274#define DM_VERSION_MAJOR 4 274#define DM_VERSION_MAJOR 4
275#define DM_VERSION_MINOR 4 275#define DM_VERSION_MINOR 5
276#define DM_VERSION_PATCHLEVEL 0 276#define DM_VERSION_PATCHLEVEL 0
277#define DM_VERSION_EXTRA "-ioctl (2005-01-12)" 277#define DM_VERSION_EXTRA "-ioctl (2005-10-04)"
278 278
279/* Status bits */ 279/* Status bits */
280#define DM_READONLY_FLAG (1 << 0) /* In/Out */ 280#define DM_READONLY_FLAG (1 << 0) /* In/Out */
@@ -301,8 +301,13 @@ typedef char ioctl_struct[308];
301#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ 301#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */
302 302
303/* 303/*
304 * Set this to improve performance when you aren't going to use open_count 304 * Set this to improve performance when you aren't going to use open_count.
305 */ 305 */
306#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */ 306#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */
307 307
308/*
309 * Set this to avoid attempting to freeze any filesystem when suspending.
310 */
311#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */
312
308#endif /* _LINUX_DM_IOCTL_H */ 313#endif /* _LINUX_DM_IOCTL_H */
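From userspace the new flag is just another bit in dm_ioctl.flags on a suspend call. A hedged fragment (ctl_fd and the usual header setup are elided or hypothetical):

    /* Hedged sketch: suspend a mapped device without trying to
     * freeze the filesystem on top of it. Filling in the dm_ioctl
     * header (version, data_size, name) is elided. */
    struct dm_ioctl io;

    memset(&io, 0, sizeof(io));
    /* ... set io.version[], io.data_size, io.name ... */
    io.flags |= DM_SKIP_LOCKFS_FLAG;       /* new in ioctl version 4.5.0 */
    ioctl(ctl_fd, DM_DEV_SUSPEND, &io);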
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index a74c27e460ba..fb80fa44c4dd 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -130,6 +130,7 @@ extern int elv_try_last_merge(request_queue_t *, struct bio *);
130#define ELEVATOR_INSERT_FRONT 1 130#define ELEVATOR_INSERT_FRONT 1
131#define ELEVATOR_INSERT_BACK 2 131#define ELEVATOR_INSERT_BACK 2
132#define ELEVATOR_INSERT_SORT 3 132#define ELEVATOR_INSERT_SORT 3
133#define ELEVATOR_INSERT_REQUEUE 4
133 134
134/* 135/*
135 * return values from elevator_may_queue_fn 136 * return values from elevator_may_queue_fn
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cc35b6ac778d..115e72be25d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -302,6 +302,37 @@ struct iattr {
302 */ 302 */
303#include <linux/quota.h> 303#include <linux/quota.h>
304 304
305/**
306 * enum positive_aop_returns - aop return codes with specific semantics
307 *
308 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
309 * completed, that the page is still locked, and
310 * should be considered active. The VM uses this hint
311 * to return the page to the active list -- it won't
312 * be a candidate for writeback again in the near
313 * future. Other callers must be careful to unlock
314 * the page if they get this return. Returned by
315 * writepage();
316 *
317 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
318 * unlocked it and the page might have been truncated.
319 * The caller should back up to acquiring a new page and
320 * trying again. The aop will be taking reasonable
321 * precautions not to livelock. If the caller held a page
322 * reference, it should drop it before retrying. Returned
323 * by readpage(), prepare_write(), and commit_write().
324 *
325 * address_space_operation functions return these large constants to indicate
326 * special semantics to the caller. These are much larger than the bytes in a
327 * page to allow for functions that return the number of bytes operated on in a
328 * given page.
329 */
330
331enum positive_aop_returns {
332 AOP_WRITEPAGE_ACTIVATE = 0x80000,
333 AOP_TRUNCATED_PAGE = 0x80001,
334};
335
305/* 336/*
306 * oh the beauties of C type declarations. 337 * oh the beauties of C type declarations.
307 */ 338 */
@@ -1019,6 +1050,7 @@ struct inode_operations {
1019 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); 1050 ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
1020 ssize_t (*listxattr) (struct dentry *, char *, size_t); 1051 ssize_t (*listxattr) (struct dentry *, char *, size_t);
1021 int (*removexattr) (struct dentry *, const char *); 1052 int (*removexattr) (struct dentry *, const char *);
1053 void (*truncate_range)(struct inode *, loff_t, loff_t);
1022}; 1054};
1023 1055
1024struct seq_file; 1056struct seq_file;
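AOP_TRUNCATED_PAGE shifts a race onto the caller: the aop has given the page lock back, so the caller must drop its reference and start over rather than touch the page again. A hedged sketch of that retry loop for a caller invoking ->readpage() directly (the surrounding function is hypothetical):

    /* Hedged sketch of the retry protocol described above. */
    static int example_read_page(struct address_space *mapping,
                                 struct file *file, pgoff_t index)
    {
            struct page *page;
            int err;
    again:
            page = grab_cache_page(mapping, index); /* returns page locked */
            if (!page)
                    return -ENOMEM;
            err = mapping->a_ops->readpage(file, page);
            if (err == AOP_TRUNCATED_PAGE) {
                    /* The aop unlocked the page and it may have been
                     * truncated: drop our reference and retry. */
                    page_cache_release(page);
                    goto again;
            }
            page_cache_release(page);
            return err;
    }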
diff --git a/include/linux/fuse.h b/include/linux/fuse.h
index b76b558b03d4..528959c52f1b 100644
--- a/include/linux/fuse.h
+++ b/include/linux/fuse.h
@@ -14,7 +14,7 @@
14#define FUSE_KERNEL_VERSION 7 14#define FUSE_KERNEL_VERSION 7
15 15
16/** Minor version number of this interface */ 16/** Minor version number of this interface */
17#define FUSE_KERNEL_MINOR_VERSION 3 17#define FUSE_KERNEL_MINOR_VERSION 5
18 18
19/** The node ID of the root inode */ 19/** The node ID of the root inode */
20#define FUSE_ROOT_ID 1 20#define FUSE_ROOT_ID 1
@@ -53,6 +53,9 @@ struct fuse_kstatfs {
53 __u64 ffree; 53 __u64 ffree;
54 __u32 bsize; 54 __u32 bsize;
55 __u32 namelen; 55 __u32 namelen;
56 __u32 frsize;
57 __u32 padding;
58 __u32 spare[6];
56}; 59};
57 60
58#define FATTR_MODE (1 << 0) 61#define FATTR_MODE (1 << 0)
@@ -105,12 +108,8 @@ enum fuse_opcode {
105 FUSE_CREATE = 35 108 FUSE_CREATE = 35
106}; 109};
107 110
108/* Conservative buffer size for the client */ 111/* The read buffer is required to be at least 8k, but may be much larger */
109#define FUSE_MAX_IN 8192 112#define FUSE_MIN_READ_BUFFER 8192
110
111#define FUSE_NAME_MAX 1024
112#define FUSE_SYMLINK_MAX 4096
113#define FUSE_XATTR_SIZE_MAX 4096
114 113
115struct fuse_entry_out { 114struct fuse_entry_out {
116 __u64 nodeid; /* Inode ID */ 115 __u64 nodeid; /* Inode ID */
@@ -213,6 +212,8 @@ struct fuse_write_out {
213 __u32 padding; 212 __u32 padding;
214}; 213};
215 214
215#define FUSE_COMPAT_STATFS_SIZE 48
216
216struct fuse_statfs_out { 217struct fuse_statfs_out {
217 struct fuse_kstatfs st; 218 struct fuse_kstatfs st;
218}; 219};
@@ -243,9 +244,16 @@ struct fuse_access_in {
243 __u32 padding; 244 __u32 padding;
244}; 245};
245 246
246struct fuse_init_in_out { 247struct fuse_init_in {
248 __u32 major;
249 __u32 minor;
250};
251
252struct fuse_init_out {
247 __u32 major; 253 __u32 major;
248 __u32 minor; 254 __u32 minor;
255 __u32 unused[3];
256 __u32 max_write;
249}; 257};
250 258
251struct fuse_in_header { 259struct fuse_in_header {
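Splitting fuse_init_in_out lets the reply grow independently of the request: the daemon can now advertise max_write while the version fields keep their offsets. A hedged userspace sketch of answering FUSE_INIT (send_reply() is a hypothetical transport helper):

    /* Hedged sketch: a filesystem daemon answering FUSE_INIT with
     * the split request/reply structures. */
    static void example_handle_init(const struct fuse_init_in *in)
    {
            struct fuse_init_out out = {
                    .major     = FUSE_KERNEL_VERSION,
                    .minor     = FUSE_KERNEL_MINOR_VERSION,
                    .max_write = 65536,    /* largest WRITE we accept */
            };

            send_reply(&out, sizeof(out)); /* hypothetical transport */
    }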
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1056717ee501..68d82ad6b17c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *);
22int hugetlb_report_node_meminfo(int, char *); 22int hugetlb_report_node_meminfo(int, char *);
23int is_hugepage_mem_enough(size_t); 23int is_hugepage_mem_enough(size_t);
24unsigned long hugetlb_total_pages(void); 24unsigned long hugetlb_total_pages(void);
25struct page *alloc_huge_page(void); 25struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
26void free_huge_page(struct page *); 26void free_huge_page(struct page *);
27int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 27int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
28 unsigned long address, int write_access); 28 unsigned long address, int write_access);
@@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void)
97#define is_hugepage_only_range(mm, addr, len) 0 97#define is_hugepage_only_range(mm, addr, len) 0
98#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ 98#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
99 do { } while (0) 99 do { } while (0)
100#define alloc_huge_page() ({ NULL; }) 100#define alloc_huge_page(vma, addr) ({ NULL; })
101#define free_huge_page(p) ({ (void)(p); BUG(); }) 101#define free_huge_page(p) ({ (void)(p); BUG(); })
102#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) 102#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
103 103
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index d79c8a4bc4f8..9ba806796667 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -30,6 +30,7 @@
30#include <linux/string.h> 30#include <linux/string.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/workqueue.h> /* work_struct */ 32#include <linux/workqueue.h> /* work_struct */
33#include <linux/mempool.h>
33 34
34#include <asm/io.h> 35#include <asm/io.h>
35#include <asm/semaphore.h> /* Needed for MUTEX init macros */ 36#include <asm/semaphore.h> /* Needed for MUTEX init macros */
@@ -38,6 +39,355 @@
38#define I2O_QUEUE_EMPTY 0xffffffff 39#define I2O_QUEUE_EMPTY 0xffffffff
39 40
40/* 41/*
42 * Cache strategies
43 */
44
45/* The NULL strategy leaves everything up to the controller. This tends to be a
46 * pessimal but functional choice.
47 */
48#define CACHE_NULL 0
49/* Prefetch data when reading. We continually attempt to load the next 32 sectors
50 * into the controller cache.
51 */
52#define CACHE_PREFETCH 1
53/* Prefetch data when reading. We sometimes attempt to load the next 32 sectors
54 * into the controller cache. When an I/O is <= 8K we assume it's probably
55 * not sequential and don't prefetch (default)
56 */
57#define CACHE_SMARTFETCH 2
58/* Data is written to the cache and then out onto the disk. The I/O must be
59 * physically on the medium before the write is acknowledged (default without
60 * NVRAM).
61 */
62#define CACHE_WRITETHROUGH 17
63/* Data is written to the cache and then out onto the disk. The controller
64 * is permitted to write back the cache any way it wants (default if
65 * battery-backed NVRAM is present). It can be useful to set this for swap
66 * regardless of battery state.
67 */
68#define CACHE_WRITEBACK 18
69/* Optimise for underpowered controllers, especially on RAID1 and RAID0. We
70 * write large I/Os directly to disk, bypassing the cache, to avoid the extra
71 * memory copy hits. Small writes are writeback cached.
72 */
73#define CACHE_SMARTBACK 19
74/* Optimise for underpowered controllers, especially on RAID1 and RAID0. We
75 * write large I/Os directly to disk, bypassing the cache, to avoid the extra
76 * memory copy hits. Small writes are writethrough cached. Suitable for devices
77 * lacking battery backup.
78 */
79#define CACHE_SMARTTHROUGH 20
80
81/*
82 * Ioctl structures
83 */
84
85#define BLKI2OGRSTRAT _IOR('2', 1, int)
86#define BLKI2OGWSTRAT _IOR('2', 2, int)
87#define BLKI2OSRSTRAT _IOW('2', 3, int)
88#define BLKI2OSWSTRAT _IOW('2', 4, int)
89
90/*
91 * I2O Function codes
92 */
93
94/*
95 * Executive Class
96 */
97#define I2O_CMD_ADAPTER_ASSIGN 0xB3
98#define I2O_CMD_ADAPTER_READ 0xB2
99#define I2O_CMD_ADAPTER_RELEASE 0xB5
100#define I2O_CMD_BIOS_INFO_SET 0xA5
101#define I2O_CMD_BOOT_DEVICE_SET 0xA7
102#define I2O_CMD_CONFIG_VALIDATE 0xBB
103#define I2O_CMD_CONN_SETUP 0xCA
104#define I2O_CMD_DDM_DESTROY 0xB1
105#define I2O_CMD_DDM_ENABLE 0xD5
106#define I2O_CMD_DDM_QUIESCE 0xC7
107#define I2O_CMD_DDM_RESET 0xD9
108#define I2O_CMD_DDM_SUSPEND 0xAF
109#define I2O_CMD_DEVICE_ASSIGN 0xB7
110#define I2O_CMD_DEVICE_RELEASE 0xB9
111#define I2O_CMD_HRT_GET 0xA8
112#define I2O_CMD_ADAPTER_CLEAR 0xBE
113#define I2O_CMD_ADAPTER_CONNECT 0xC9
114#define I2O_CMD_ADAPTER_RESET 0xBD
115#define I2O_CMD_LCT_NOTIFY 0xA2
116#define I2O_CMD_OUTBOUND_INIT 0xA1
117#define I2O_CMD_PATH_ENABLE 0xD3
118#define I2O_CMD_PATH_QUIESCE 0xC5
119#define I2O_CMD_PATH_RESET 0xD7
120#define I2O_CMD_STATIC_MF_CREATE 0xDD
121#define I2O_CMD_STATIC_MF_RELEASE 0xDF
122#define I2O_CMD_STATUS_GET 0xA0
123#define I2O_CMD_SW_DOWNLOAD 0xA9
124#define I2O_CMD_SW_UPLOAD 0xAB
125#define I2O_CMD_SW_REMOVE 0xAD
126#define I2O_CMD_SYS_ENABLE 0xD1
127#define I2O_CMD_SYS_MODIFY 0xC1
128#define I2O_CMD_SYS_QUIESCE 0xC3
129#define I2O_CMD_SYS_TAB_SET 0xA3
130
131/*
132 * Utility Class
133 */
134#define I2O_CMD_UTIL_NOP 0x00
135#define I2O_CMD_UTIL_ABORT 0x01
136#define I2O_CMD_UTIL_CLAIM 0x09
137#define I2O_CMD_UTIL_RELEASE 0x0B
138#define I2O_CMD_UTIL_PARAMS_GET 0x06
139#define I2O_CMD_UTIL_PARAMS_SET 0x05
140#define I2O_CMD_UTIL_EVT_REGISTER 0x13
141#define I2O_CMD_UTIL_EVT_ACK 0x14
142#define I2O_CMD_UTIL_CONFIG_DIALOG 0x10
143#define I2O_CMD_UTIL_DEVICE_RESERVE 0x0D
144#define I2O_CMD_UTIL_DEVICE_RELEASE 0x0F
145#define I2O_CMD_UTIL_LOCK 0x17
146#define I2O_CMD_UTIL_LOCK_RELEASE 0x19
147#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY 0x15
148
149/*
150 * SCSI Host Bus Adapter Class
151 */
152#define I2O_CMD_SCSI_EXEC 0x81
153#define I2O_CMD_SCSI_ABORT 0x83
154#define I2O_CMD_SCSI_BUSRESET 0x27
155
156/*
157 * Bus Adapter Class
158 */
159#define I2O_CMD_BUS_ADAPTER_RESET 0x85
160#define I2O_CMD_BUS_RESET 0x87
161#define I2O_CMD_BUS_SCAN 0x89
162#define I2O_CMD_BUS_QUIESCE 0x8b
163
164/*
165 * Random Block Storage Class
166 */
167#define I2O_CMD_BLOCK_READ 0x30
168#define I2O_CMD_BLOCK_WRITE 0x31
169#define I2O_CMD_BLOCK_CFLUSH 0x37
170#define I2O_CMD_BLOCK_MLOCK 0x49
171#define I2O_CMD_BLOCK_MUNLOCK 0x4B
172#define I2O_CMD_BLOCK_MMOUNT 0x41
173#define I2O_CMD_BLOCK_MEJECT 0x43
174#define I2O_CMD_BLOCK_POWER 0x70
175
176#define I2O_CMD_PRIVATE 0xFF
177
178/* Command status values */
179
180#define I2O_CMD_IN_PROGRESS 0x01
181#define I2O_CMD_REJECTED 0x02
182#define I2O_CMD_FAILED 0x03
183#define I2O_CMD_COMPLETED 0x04
184
185/* I2O API function return values */
186
187#define I2O_RTN_NO_ERROR 0
188#define I2O_RTN_NOT_INIT 1
189#define I2O_RTN_FREE_Q_EMPTY 2
190#define I2O_RTN_TCB_ERROR 3
191#define I2O_RTN_TRANSACTION_ERROR 4
192#define I2O_RTN_ADAPTER_ALREADY_INIT 5
193#define I2O_RTN_MALLOC_ERROR 6
194#define I2O_RTN_ADPTR_NOT_REGISTERED 7
195#define I2O_RTN_MSG_REPLY_TIMEOUT 8
196#define I2O_RTN_NO_STATUS 9
197#define I2O_RTN_NO_FIRM_VER 10
198#define I2O_RTN_NO_LINK_SPEED 11
199
200/* Reply message status defines for all messages */
201
202#define I2O_REPLY_STATUS_SUCCESS 0x00
203#define I2O_REPLY_STATUS_ABORT_DIRTY 0x01
204#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER 0x02
205#define I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER 0x03
206#define I2O_REPLY_STATUS_ERROR_DIRTY 0x04
207#define I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER 0x05
208#define I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER 0x06
209#define I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY 0x08
210#define I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER 0x09
211#define I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER 0x0A
212#define I2O_REPLY_STATUS_TRANSACTION_ERROR 0x0B
213#define I2O_REPLY_STATUS_PROGRESS_REPORT 0x80
214
215/* Status codes and Error Information for Parameter functions */
216
217#define I2O_PARAMS_STATUS_SUCCESS 0x00
218#define I2O_PARAMS_STATUS_BAD_KEY_ABORT 0x01
219#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE 0x02
220#define I2O_PARAMS_STATUS_BUFFER_FULL 0x03
221#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL 0x04
222#define I2O_PARAMS_STATUS_FIELD_UNREADABLE 0x05
223#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE 0x06
224#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS 0x07
225#define I2O_PARAMS_STATUS_INVALID_GROUP_ID 0x08
226#define I2O_PARAMS_STATUS_INVALID_OPERATION 0x09
227#define I2O_PARAMS_STATUS_NO_KEY_FIELD 0x0A
228#define I2O_PARAMS_STATUS_NO_SUCH_FIELD 0x0B
229#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP 0x0C
230#define I2O_PARAMS_STATUS_OPERATION_ERROR 0x0D
231#define I2O_PARAMS_STATUS_SCALAR_ERROR 0x0E
232#define I2O_PARAMS_STATUS_TABLE_ERROR 0x0F
233#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE 0x10
234
235/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error
236 * messages: Table 3-2 Detailed Status Codes.*/
237
238#define I2O_DSC_SUCCESS 0x0000
239#define I2O_DSC_BAD_KEY 0x0002
240#define I2O_DSC_TCL_ERROR 0x0003
241#define I2O_DSC_REPLY_BUFFER_FULL 0x0004
242#define I2O_DSC_NO_SUCH_PAGE 0x0005
243#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT 0x0006
244#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD 0x0007
245#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE 0x0009
246#define I2O_DSC_UNSUPPORTED_FUNCTION 0x000A
247#define I2O_DSC_DEVICE_LOCKED 0x000B
248#define I2O_DSC_DEVICE_RESET 0x000C
249#define I2O_DSC_INAPPROPRIATE_FUNCTION 0x000D
250#define I2O_DSC_INVALID_INITIATOR_ADDRESS 0x000E
251#define I2O_DSC_INVALID_MESSAGE_FLAGS 0x000F
252#define I2O_DSC_INVALID_OFFSET 0x0010
253#define I2O_DSC_INVALID_PARAMETER 0x0011
254#define I2O_DSC_INVALID_REQUEST 0x0012
255#define I2O_DSC_INVALID_TARGET_ADDRESS 0x0013
256#define I2O_DSC_MESSAGE_TOO_LARGE 0x0014
257#define I2O_DSC_MESSAGE_TOO_SMALL 0x0015
258#define I2O_DSC_MISSING_PARAMETER 0x0016
259#define I2O_DSC_TIMEOUT 0x0017
260#define I2O_DSC_UNKNOWN_ERROR 0x0018
261#define I2O_DSC_UNKNOWN_FUNCTION 0x0019
262#define I2O_DSC_UNSUPPORTED_VERSION 0x001A
263#define I2O_DSC_DEVICE_BUSY 0x001B
264#define I2O_DSC_DEVICE_NOT_AVAILABLE 0x001C
265
266/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed
267 Status Codes.*/
268
269#define I2O_BSA_DSC_SUCCESS 0x0000
270#define I2O_BSA_DSC_MEDIA_ERROR 0x0001
271#define I2O_BSA_DSC_ACCESS_ERROR 0x0002
272#define I2O_BSA_DSC_DEVICE_FAILURE 0x0003
273#define I2O_BSA_DSC_DEVICE_NOT_READY 0x0004
274#define I2O_BSA_DSC_MEDIA_NOT_PRESENT 0x0005
275#define I2O_BSA_DSC_MEDIA_LOCKED 0x0006
276#define I2O_BSA_DSC_MEDIA_FAILURE 0x0007
277#define I2O_BSA_DSC_PROTOCOL_FAILURE 0x0008
278#define I2O_BSA_DSC_BUS_FAILURE 0x0009
279#define I2O_BSA_DSC_ACCESS_VIOLATION 0x000A
280#define I2O_BSA_DSC_WRITE_PROTECTED 0x000B
281#define I2O_BSA_DSC_DEVICE_RESET 0x000C
282#define I2O_BSA_DSC_VOLUME_CHANGED 0x000D
283#define I2O_BSA_DSC_TIMEOUT 0x000E
284
285/* FailureStatusCodes, Table 3-3 Message Failure Codes */
286
287#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED 0x81
288#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED 0x82
289#define I2O_FSC_TRANSPORT_CONGESTION 0x83
290#define I2O_FSC_TRANSPORT_FAILURE 0x84
291#define I2O_FSC_TRANSPORT_STATE_ERROR 0x85
292#define I2O_FSC_TRANSPORT_TIME_OUT 0x86
293#define I2O_FSC_TRANSPORT_ROUTING_FAILURE 0x87
294#define I2O_FSC_TRANSPORT_INVALID_VERSION 0x88
295#define I2O_FSC_TRANSPORT_INVALID_OFFSET 0x89
296#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS 0x8A
297#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL 0x8B
298#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE 0x8C
299#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID 0x8D
300#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID 0x8E
301#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT 0x8F
302#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE 0xFF
303
304/* Device Claim Types */
305#define I2O_CLAIM_PRIMARY 0x01000000
306#define I2O_CLAIM_MANAGEMENT 0x02000000
307#define I2O_CLAIM_AUTHORIZED 0x03000000
308#define I2O_CLAIM_SECONDARY 0x04000000
309
310/* Message header defines for VersionOffset */
311#define I2OVER15 0x0001
312#define I2OVER20 0x0002
313
314/* Default is 1.5 */
315#define I2OVERSION I2OVER15
316
317#define SGL_OFFSET_0 I2OVERSION
318#define SGL_OFFSET_4 (0x0040 | I2OVERSION)
319#define SGL_OFFSET_5 (0x0050 | I2OVERSION)
320#define SGL_OFFSET_6 (0x0060 | I2OVERSION)
321#define SGL_OFFSET_7 (0x0070 | I2OVERSION)
322#define SGL_OFFSET_8 (0x0080 | I2OVERSION)
323#define SGL_OFFSET_9 (0x0090 | I2OVERSION)
324#define SGL_OFFSET_10 (0x00A0 | I2OVERSION)
325#define SGL_OFFSET_11 (0x00B0 | I2OVERSION)
326#define SGL_OFFSET_12 (0x00C0 | I2OVERSION)
327#define SGL_OFFSET(x) (((x)<<4) | I2OVERSION)
328
329/* Transaction Reply Lists (TRL) Control Word structure */
330#define TRL_SINGLE_FIXED_LENGTH 0x00
331#define TRL_SINGLE_VARIABLE_LENGTH 0x40
332#define TRL_MULTIPLE_FIXED_LENGTH 0x80
333
334 /* msg header defines for MsgFlags */
335#define MSG_STATIC 0x0100
336#define MSG_64BIT_CNTXT 0x0200
337#define MSG_MULTI_TRANS 0x1000
338#define MSG_FAIL 0x2000
339#define MSG_FINAL 0x4000
340#define MSG_REPLY 0x8000
341
342 /* minimum size msg */
343#define THREE_WORD_MSG_SIZE 0x00030000
344#define FOUR_WORD_MSG_SIZE 0x00040000
345#define FIVE_WORD_MSG_SIZE 0x00050000
346#define SIX_WORD_MSG_SIZE 0x00060000
347#define SEVEN_WORD_MSG_SIZE 0x00070000
348#define EIGHT_WORD_MSG_SIZE 0x00080000
349#define NINE_WORD_MSG_SIZE 0x00090000
350#define TEN_WORD_MSG_SIZE 0x000A0000
351#define ELEVEN_WORD_MSG_SIZE 0x000B0000
352#define I2O_MESSAGE_SIZE(x) ((x)<<16)
353
354/* special TID assignments */
355#define ADAPTER_TID 0
356#define HOST_TID 1
357
358/* outbound queue defines */
359#define I2O_MAX_OUTBOUND_MSG_FRAMES 128
360#define I2O_OUTBOUND_MSG_FRAME_SIZE 128 /* in 32-bit words */
361
362/* inbound queue definitions */
363#define I2O_MSG_INPOOL_MIN 32
364#define I2O_INBOUND_MSG_FRAME_SIZE 128 /* in 32-bit words */
365
366#define I2O_POST_WAIT_OK 0
367#define I2O_POST_WAIT_TIMEOUT -ETIMEDOUT
368
369#define I2O_CONTEXT_LIST_MIN_LENGTH 15
370#define I2O_CONTEXT_LIST_USED 0x01
371#define I2O_CONTEXT_LIST_DELETED 0x02
372
373/* timeouts */
374#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE 15
375#define I2O_TIMEOUT_MESSAGE_GET 5
376#define I2O_TIMEOUT_RESET 30
377#define I2O_TIMEOUT_STATUS_GET 5
378#define I2O_TIMEOUT_LCT_GET 360
379#define I2O_TIMEOUT_SCSI_SCB_ABORT 240
380
381/* retries */
382#define I2O_HRT_GET_TRIES 3
383#define I2O_LCT_GET_TRIES 3
384
385/* defines for max_sectors and max_phys_segments */
386#define I2O_MAX_SECTORS 1024
387#define I2O_MAX_SECTORS_LIMITED 128
388#define I2O_MAX_PHYS_SEGMENTS MAX_PHYS_SEGMENTS
389
390/*
41 * Message structures 391 * Message structures
42 */ 392 */
43struct i2o_message { 393struct i2o_message {
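The cache-strategy ioctls make the CACHE_* policies above settable per block device. A hedged userspace fragment (fd would refer to an I2O block device node; error handling elided):

    /* Hedged sketch: read and set the write-cache strategy with the
     * new BLKI2O* ioctls. */
    int cur, next = CACHE_WRITETHROUGH;

    ioctl(fd, BLKI2OGWSTRAT, &cur);    /* current write strategy */
    ioctl(fd, BLKI2OSWSTRAT, &next);   /* force write-through */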
@@ -58,6 +408,12 @@ struct i2o_message {
58 u32 body[0]; 408 u32 body[0];
59}; 409};
60 410
411/* MFA and I2O message used by mempool */
412struct i2o_msg_mfa {
413 u32 mfa; /* MFA returned by the controller */
414 struct i2o_message msg; /* I2O message */
415};
416
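The wrapper keeps the controller's message frame address (MFA) next to the message itself, so code that holds only the inner pointer can get the MFA back. A minimal sketch of the container_of() round trip that i2o_msg_post() and i2o_msg_nop() below depend on:

static inline u32 example_msg_to_mfa(struct i2o_message *msg)
{
	/* msg is embedded in struct i2o_msg_mfa, so container_of()
	 * recovers the wrapper and with it the controller's MFA. */
	return container_of(msg, struct i2o_msg_mfa, msg)->mfa;
}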
61/* 417/*
62 * Each I2O device entity has one of these. There is one per device. 418 * Each I2O device entity has one of these. There is one per device.
63 */ 419 */
@@ -130,6 +486,15 @@ struct i2o_dma {
130}; 486};
131 487
132/* 488/*
489 * Contains slab cache and mempool information
490 */
491struct i2o_pool {
492 char *name;
493 kmem_cache_t *slab;
494 mempool_t *mempool;
495};
496
497/*
133 * Contains IO mapped address information 498 * Contains IO mapped address information
134 */ 499 */
135struct i2o_io { 500struct i2o_io {
@@ -174,8 +539,6 @@ struct i2o_controller {
174 void __iomem *irq_status; /* Interrupt status register address */ 539 void __iomem *irq_status; /* Interrupt status register address */
175 void __iomem *irq_mask; /* Interrupt mask register address */ 540 void __iomem *irq_mask; /* Interrupt mask register address */
176 541
177 /* Dynamic LCT related data */
178
179 struct i2o_dma status; /* IOP status block */ 542 struct i2o_dma status; /* IOP status block */
180 543
181 struct i2o_dma hrt; /* HW Resource Table */ 544 struct i2o_dma hrt; /* HW Resource Table */
@@ -188,6 +551,8 @@ struct i2o_controller {
188 struct i2o_io in_queue; /* inbound message queue Host->IOP */ 551 struct i2o_io in_queue; /* inbound message queue Host->IOP */
189 struct i2o_dma out_queue; /* outbound message queue IOP->Host */ 552 struct i2o_dma out_queue; /* outbound message queue IOP->Host */
190 553
554 struct i2o_pool in_msg; /* mempool for inbound messages */
555
191 unsigned int battery:1; /* Has a battery backup */ 556 unsigned int battery:1; /* Has a battery backup */
192 unsigned int io_alloc:1; /* An I/O resource was allocated */ 557 unsigned int io_alloc:1; /* An I/O resource was allocated */
193 unsigned int mem_alloc:1; /* A memory resource was allocated */ 558 unsigned int mem_alloc:1; /* A memory resource was allocated */
@@ -196,7 +561,6 @@ struct i2o_controller {
196 struct resource mem_resource; /* Mem resource allocated to the IOP */ 561 struct resource mem_resource; /* Mem resource allocated to the IOP */
197 562
198 struct device device; 563 struct device device;
199 struct class_device *classdev; /* I2O controller class device */
200 struct i2o_device *exec; /* Executive */ 564 struct i2o_device *exec; /* Executive */
201#if BITS_PER_LONG == 64 565#if BITS_PER_LONG == 64
202 spinlock_t context_list_lock; /* lock for context_list */ 566 spinlock_t context_list_lock; /* lock for context_list */
@@ -247,16 +611,13 @@ struct i2o_sys_tbl {
247extern struct list_head i2o_controllers; 611extern struct list_head i2o_controllers;
248 612
249/* Message functions */ 613/* Message functions */
250static inline u32 i2o_msg_get(struct i2o_controller *, 614static inline struct i2o_message *i2o_msg_get(struct i2o_controller *);
251 struct i2o_message __iomem **); 615extern struct i2o_message *i2o_msg_get_wait(struct i2o_controller *, int);
252extern u32 i2o_msg_get_wait(struct i2o_controller *, 616static inline void i2o_msg_post(struct i2o_controller *, struct i2o_message *);
253 struct i2o_message __iomem **, int); 617static inline int i2o_msg_post_wait(struct i2o_controller *,
254static inline void i2o_msg_post(struct i2o_controller *, u32); 618 struct i2o_message *, unsigned long);
255static inline int i2o_msg_post_wait(struct i2o_controller *, u32, 619extern int i2o_msg_post_wait_mem(struct i2o_controller *, struct i2o_message *,
256 unsigned long); 620 unsigned long, struct i2o_dma *);
257extern int i2o_msg_post_wait_mem(struct i2o_controller *, u32, unsigned long,
258 struct i2o_dma *);
259extern void i2o_msg_nop(struct i2o_controller *, u32);
260static inline void i2o_flush_reply(struct i2o_controller *, u32); 621static inline void i2o_flush_reply(struct i2o_controller *, u32);
261 622
262/* IOP functions */ 623/* IOP functions */
@@ -384,10 +745,10 @@ static inline u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
384static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr, 745static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
385 size_t size, 746 size_t size,
386 enum dma_data_direction direction, 747 enum dma_data_direction direction,
387 u32 __iomem ** sg_ptr) 748 u32 ** sg_ptr)
388{ 749{
389 u32 sg_flags; 750 u32 sg_flags;
390 u32 __iomem *mptr = *sg_ptr; 751 u32 *mptr = *sg_ptr;
391 dma_addr_t dma_addr; 752 dma_addr_t dma_addr;
392 753
393 switch (direction) { 754 switch (direction) {
@@ -405,16 +766,16 @@ static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
405 if (!dma_mapping_error(dma_addr)) { 766 if (!dma_mapping_error(dma_addr)) {
406#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 767#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
407 if ((sizeof(dma_addr_t) > 4) && c->pae_support) { 768 if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
408 writel(0x7C020002, mptr++); 769 *mptr++ = cpu_to_le32(0x7C020002);
409 writel(PAGE_SIZE, mptr++); 770 *mptr++ = cpu_to_le32(PAGE_SIZE);
410 } 771 }
411#endif 772#endif
412 773
413 writel(sg_flags | size, mptr++); 774 *mptr++ = cpu_to_le32(sg_flags | size);
414 writel(i2o_dma_low(dma_addr), mptr++); 775 *mptr++ = cpu_to_le32(i2o_dma_low(dma_addr));
415#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 776#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
416 if ((sizeof(dma_addr_t) > 4) && c->pae_support) 777 if ((sizeof(dma_addr_t) > 4) && c->pae_support)
417 writel(i2o_dma_high(dma_addr), mptr++); 778 *mptr++ = cpu_to_le32(i2o_dma_high(dma_addr));
418#endif 779#endif
419 *sg_ptr = mptr; 780 *sg_ptr = mptr;
420 } 781 }
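With the frame now built in ordinary memory instead of iomem, SGL entries become plain little-endian stores. A hedged sketch of a caller mapping one buffer into a message; the function and buffer names are illustrative:

static int example_map_buffer(struct i2o_controller *c,
			      struct i2o_message *msg, void *buf, size_t len)
{
	u32 *mptr = &msg->body[0];	/* SGL follows the message header */
	dma_addr_t dma;

	dma = i2o_dma_map_single(c, buf, len, DMA_TO_DEVICE, &mptr);
	if (dma_mapping_error(dma))
		return -ENOMEM;
	/* mptr now points just past the SGL entries written above */
	return 0;
}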
@@ -439,10 +800,10 @@ static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
439static inline int i2o_dma_map_sg(struct i2o_controller *c, 800static inline int i2o_dma_map_sg(struct i2o_controller *c,
440 struct scatterlist *sg, int sg_count, 801 struct scatterlist *sg, int sg_count,
441 enum dma_data_direction direction, 802 enum dma_data_direction direction,
442 u32 __iomem ** sg_ptr) 803 u32 ** sg_ptr)
443{ 804{
444 u32 sg_flags; 805 u32 sg_flags;
445 u32 __iomem *mptr = *sg_ptr; 806 u32 *mptr = *sg_ptr;
446 807
447 switch (direction) { 808 switch (direction) {
448 case DMA_TO_DEVICE: 809 case DMA_TO_DEVICE:
@@ -461,19 +822,19 @@ static inline int i2o_dma_map_sg(struct i2o_controller *c,
461 822
462#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 823#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
463 if ((sizeof(dma_addr_t) > 4) && c->pae_support) { 824 if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
464 writel(0x7C020002, mptr++); 825 *mptr++ = cpu_to_le32(0x7C020002);
465 writel(PAGE_SIZE, mptr++); 826 *mptr++ = cpu_to_le32(PAGE_SIZE);
466 } 827 }
467#endif 828#endif
468 829
469 while (sg_count-- > 0) { 830 while (sg_count-- > 0) {
470 if (!sg_count) 831 if (!sg_count)
471 sg_flags |= 0xC0000000; 832 sg_flags |= 0xC0000000;
472 writel(sg_flags | sg_dma_len(sg), mptr++); 833 *mptr++ = cpu_to_le32(sg_flags | sg_dma_len(sg));
473 writel(i2o_dma_low(sg_dma_address(sg)), mptr++); 834 *mptr++ = cpu_to_le32(i2o_dma_low(sg_dma_address(sg)));
474#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64 835#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
475 if ((sizeof(dma_addr_t) > 4) && c->pae_support) 836 if ((sizeof(dma_addr_t) > 4) && c->pae_support)
476 writel(i2o_dma_high(sg_dma_address(sg)), mptr++); 837 *mptr++ = cpu_to_le32(i2o_dma_high(sg_dma_address(sg)));
477#endif 838#endif
478 sg++; 839 sg++;
479 } 840 }
@@ -563,6 +924,64 @@ static inline int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
563 return 0; 924 return 0;
564}; 925};
565 926
927/*
928 * i2o_pool_alloc - Allocate a slab cache and mempool
929 * @pool: pointer to struct i2o_pool to write data into.
930 * @name: name which is used to identify the cache
931 * @size: size of each object
932 * @min_nr: minimum number of objects
933 *
934 * First allocates a slab cache with name and size. Then allocates a
935 * mempool which uses the slab cache for allocation and freeing.
936 *
937 * Returns 0 on success or negative error code on failure.
938 */
939static inline int i2o_pool_alloc(struct i2o_pool *pool, const char *name,
940 size_t size, int min_nr)
941{
942 pool->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
943 if (!pool->name)
944 goto exit;
945 strcpy(pool->name, name);
946
947 pool->slab =
948 kmem_cache_create(pool->name, size, 0, SLAB_HWCACHE_ALIGN, NULL,
949 NULL);
950 if (!pool->slab)
951 goto free_name;
952
953 pool->mempool =
954 mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
955 pool->slab);
956 if (!pool->mempool)
957 goto free_slab;
958
959 return 0;
960
961 free_slab:
962 kmem_cache_destroy(pool->slab);
963
964 free_name:
965 kfree(pool->name);
966
967 exit:
968 return -ENOMEM;
969};
970
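A hedged sketch of how a controller's inbound pool might be created and destroyed with these helpers; the pool name and sizing below are illustrative, not taken from this patch:

static int example_setup_in_msg(struct i2o_controller *c)
{
	/* illustrative sizing: the MFA word plus a full inbound frame */
	size_t size = sizeof(u32) + I2O_INBOUND_MSG_FRAME_SIZE * sizeof(u32);

	return i2o_pool_alloc(&c->in_msg, "i2o:in_msg", size,
			      I2O_MSG_INPOOL_MIN);
}

static void example_teardown_in_msg(struct i2o_controller *c)
{
	/* all messages must be back in the pool by this point */
	i2o_pool_free(&c->in_msg);
}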
971/*
972 * i2o_pool_free - Free the slab cache and mempool
973 * @pool: pointer to struct i2o_pool which should be freed
974 *
975 * Note that all objects must be returned to the mempool before
976 * calling i2o_pool_free().
977 */
978static inline void i2o_pool_free(struct i2o_pool *pool)
979{
980 mempool_destroy(pool->mempool);
981 kmem_cache_destroy(pool->slab);
982 kfree(pool->name);
983};
984
566/* I2O driver (OSM) functions */ 985/* I2O driver (OSM) functions */
567extern int i2o_driver_register(struct i2o_driver *); 986extern int i2o_driver_register(struct i2o_driver *);
568extern void i2o_driver_unregister(struct i2o_driver *); 987extern void i2o_driver_unregister(struct i2o_driver *);
@@ -638,39 +1057,89 @@ extern int i2o_exec_lct_get(struct i2o_controller *);
638#define kobj_to_i2o_device(kobj) to_i2o_device(container_of(kobj, struct device, kobj)) 1057#define kobj_to_i2o_device(kobj) to_i2o_device(container_of(kobj, struct device, kobj))
639 1058
640/** 1059/**
1060 * i2o_msg_out_to_virt - Turn an I2O message into a virtual address
1061 * @c: controller
1062 * @m: message engine value
1063 *
1064 * Turn a receive message from an I2O controller bus address into
1065 * a Linux virtual address. The shared page frame is a linear block
1066 * so we simply have to shift the offset. This function does not
1067 * work for sender side messages as they are ioremap objects
1068 * provided by the I2O controller.
1069 */
1070static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
1071 u32 m)
1072{
1073 BUG_ON(m < c->out_queue.phys
1074 || m >= c->out_queue.phys + c->out_queue.len);
1075
1076 return c->out_queue.virt + (m - c->out_queue.phys);
1077};
1078
1079/**
1080 * i2o_msg_in_to_virt - Turn an I2O message into a virtual address
1081 * @c: controller
1082 * @m: message engine value
1083 *
1084 * Turn a send message from an I2O controller bus address into
1085 * a Linux virtual address. The shared page frame is a linear block
1086 * so we simply have to shift the offset. This function does not
1087 * work for receive side messages as they are kmalloc objects
1088 * in a different pool.
1089 */
1090static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
1091 i2o_controller *c,
1092 u32 m)
1093{
1094 return c->in_queue.virt + m;
1095};
1096
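A sketch of the receive path these translations serve: an MFA read from the outbound port indexes the shared reply block, and the frame goes back through the outbound port once consumed:

static void example_handle_reply(struct i2o_controller *c)
{
	struct i2o_message *reply;
	u32 m = readl(c->out_port);

	if (m == I2O_QUEUE_EMPTY)
		return;

	reply = i2o_msg_out_to_virt(c, m);	/* bus address -> virtual */
	/* ... consume *reply ..., then hand the frame back to the IOP: */
	i2o_flush_reply(c, m);
}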
1097/**
641 * i2o_msg_get - obtain an I2O message from the IOP 1098 * i2o_msg_get - obtain an I2O message from the IOP
642 * @c: I2O controller 1099 * @c: I2O controller
643 * @msg: pointer to a I2O message pointer
644 * 1100 *
645 * This function tries to get a message slot. If no message slot is 1101 * This function tries to get a message frame. If no message frame is
646 * available, it does not wait for one (see also i2o_msg_get_wait). 1102 * available, it does not wait for one (see also i2o_msg_get_wait).
1103 * The returned pointer to the message frame is not in I/O memory; it is
1104 * allocated from a mempool. But because an MFA is allocated from the
1105 * controller too, it is guaranteed that i2o_msg_post() will never fail.
647 * 1106 *
648 * On success the message is returned and the pointer to the message is 1107 * On success a pointer to the message frame is returned. If the message
649 * set in msg. The returned message is the physical page frame offset 1108 * queue is empty -EBUSY is returned and if no memory is available -ENOMEM
650 * address from the read port (see the i2o spec). If no message is 1109 * is returned.
651 * available, I2O_QUEUE_EMPTY is returned and msg is left untouched.
652 */ 1110 */
653static inline u32 i2o_msg_get(struct i2o_controller *c, 1111static inline struct i2o_message *i2o_msg_get(struct i2o_controller *c)
654 struct i2o_message __iomem ** msg)
655{ 1112{
656 u32 m = readl(c->in_port); 1113 struct i2o_msg_mfa *mmsg = mempool_alloc(c->in_msg.mempool, GFP_ATOMIC);
657 1114 if (!mmsg)
658 if (m != I2O_QUEUE_EMPTY) 1115 return ERR_PTR(-ENOMEM);
659 *msg = c->in_queue.virt + m; 1116
1117 mmsg->mfa = readl(c->in_port);
1118 if (mmsg->mfa == I2O_QUEUE_EMPTY) {
1119 mempool_free(mmsg, c->in_msg.mempool);
1120 return ERR_PTR(-EBUSY);
1121 }
660 1122
661 return m; 1123 return &mmsg->msg;
662}; 1124};
663 1125
664/** 1126/**
665 * i2o_msg_post - Post I2O message to I2O controller 1127 * i2o_msg_post - Post I2O message to I2O controller
666 * @c: I2O controller to which the message should be sent 1128 * @msg: message returned by i2o_msg_get()
667 * @m: the message identifier 1129 * @msg: message returned by i2o_msg_get()
668 * 1130 *
669 * Post the message to the I2O controller. 1131 * Post the message to the I2O controller and return immediately.
670 */ 1132 */
671static inline void i2o_msg_post(struct i2o_controller *c, u32 m) 1133static inline void i2o_msg_post(struct i2o_controller *c,
1134 struct i2o_message *msg)
672{ 1135{
673 writel(m, c->in_port); 1136 struct i2o_msg_mfa *mmsg;
1137
1138 mmsg = container_of(msg, struct i2o_msg_mfa, msg);
1139 memcpy_toio(i2o_msg_in_to_virt(c, mmsg->mfa), msg,
1140 (le32_to_cpu(msg->u.head[0]) >> 16) << 2);
1141 writel(mmsg->mfa, c->in_port);
1142 mempool_free(mmsg, c->in_msg.mempool);
674}; 1143};
675 1144
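Taken together, the new calls make a complete send look like the following hedged sketch, here a UtilNOP mirroring the frame that i2o_msg_nop_mfa() builds further down (assuming the usual four-word head[] union of struct i2o_message):

static int example_send_nop(struct i2o_controller *c)
{
	struct i2o_message *msg = i2o_msg_get(c);

	if (IS_ERR(msg))
		return PTR_ERR(msg);

	msg->u.head[0] = cpu_to_le32(THREE_WORD_MSG_SIZE | SGL_OFFSET_0);
	msg->u.head[1] = cpu_to_le32(I2O_CMD_UTIL_NOP << 24 |
				     HOST_TID << 12 | ADAPTER_TID);
	msg->u.head[2] = 0;
	i2o_msg_post(c, msg);	/* copies to the IOP, frees the pool object */
	return 0;
}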
676/** 1145/**
@@ -685,62 +1154,66 @@ static inline void i2o_msg_post(struct i2o_controller *c, u32 m)
685 * 1154 *
686 * Returns 0 on success or negative error code on failure. 1155 * Returns 0 on success or negative error code on failure.
687 */ 1156 */
688static inline int i2o_msg_post_wait(struct i2o_controller *c, u32 m, 1157static inline int i2o_msg_post_wait(struct i2o_controller *c,
1158 struct i2o_message *msg,
689 unsigned long timeout) 1159 unsigned long timeout)
690{ 1160{
691 return i2o_msg_post_wait_mem(c, m, timeout, NULL); 1161 return i2o_msg_post_wait_mem(c, msg, timeout, NULL);
692}; 1162};
693 1163
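For synchronous requests the same frame goes to the _wait variant with one of the timeout constants above (in seconds); a brief hedged sketch:

static int example_status_get(struct i2o_controller *c,
			      struct i2o_message *msg)
{
	int rc = i2o_msg_post_wait(c, msg, I2O_TIMEOUT_STATUS_GET);

	if (rc)
		printk(KERN_ERR "i2o: StatusGet failed: %d\n", rc);
	return rc;
}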
694/** 1164/**
695 * i2o_flush_reply - Flush reply from I2O controller 1165 * i2o_msg_nop_mfa - Returns a fetched MFA back to the controller
696 * @c: I2O controller 1166 * @c: I2O controller from which the MFA was fetched
697 * @m: the message identifier 1167 * @mfa: MFA which should be returned
698 * 1168 *
699 * The I2O controller must be informed that the reply message is not needed 1169 * This function must be used for preserved messages, because i2o_msg_nop()
700 * anymore. If you forget to flush the reply, the message frame can't be 1170 * also returns the allocated memory back to the msg_pool mempool.
701 * used by the controller anymore and is therefore lost.
702 */ 1171 */
703static inline void i2o_flush_reply(struct i2o_controller *c, u32 m) 1172static inline void i2o_msg_nop_mfa(struct i2o_controller *c, u32 mfa)
704{ 1173{
705 writel(m, c->out_port); 1174 struct i2o_message __iomem *msg;
1175 u32 nop[3] = {
1176 THREE_WORD_MSG_SIZE | SGL_OFFSET_0,
1177 I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
1178 0x00000000
1179 };
1180
1181 msg = i2o_msg_in_to_virt(c, mfa);
1182 memcpy_toio(msg, nop, sizeof(nop));
1183 writel(mfa, c->in_port);
706}; 1184};
707 1185
708/** 1186/**
709 * i2o_msg_out_to_virt - Turn an I2O message into a virtual address 1187 * i2o_msg_nop - Returns a message which is not used
710 * @c: controller 1188 * @c: I2O controller from which the message was created
711 * @m: message engine value 1189 * @msg: message which should be returned
712 * 1190 *
713 * Turn a receive message from an I2O controller bus address into 1191 * If you fetch a message via i2o_msg_get, and can't use it, you must
714 * a Linux virtual address. The shared page frame is a linear block 1192 * return the message with this function. Otherwise the MFA is lost as well
715 * so we simply have to shift the offset. This function does not 1193 * as the allocated memory from the mempool.
716 * work for sender side messages as they are ioremap objects
717 * provided by the I2O controller.
718 */ 1194 */
719static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c, 1195static inline void i2o_msg_nop(struct i2o_controller *c,
720 u32 m) 1196 struct i2o_message *msg)
721{ 1197{
722 BUG_ON(m < c->out_queue.phys 1198 struct i2o_msg_mfa *mmsg;
723 || m >= c->out_queue.phys + c->out_queue.len); 1199 mmsg = container_of(msg, struct i2o_msg_mfa, msg);
724 1200
725 return c->out_queue.virt + (m - c->out_queue.phys); 1201 i2o_msg_nop_mfa(c, mmsg->mfa);
1202 mempool_free(mmsg, c->in_msg.mempool);
726}; 1203};
727 1204
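The canonical caller is an error path: once a frame has been fetched it must either be posted or handed back, or both the MFA and the pool object leak. A sketch with a hypothetical failure condition:

static int example_abort_send(struct i2o_controller *c, int setup_failed)
{
	struct i2o_message *msg = i2o_msg_get(c);

	if (IS_ERR(msg))
		return PTR_ERR(msg);

	if (setup_failed) {		/* hypothetical failure condition */
		i2o_msg_nop(c, msg);	/* returns MFA and mempool object */
		return -EINVAL;
	}
	/* ... otherwise fill the frame ... */
	i2o_msg_post(c, msg);
	return 0;
}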
728/** 1205/**
729 * i2o_msg_in_to_virt - Turn an I2O message to a virtual address 1206 * i2o_flush_reply - Flush reply from I2O controller
730 * @c: controller 1207 * @c: I2O controller
731 * @m: message engine value 1208 * @m: the message identifier
732 * 1209 *
733 * Turn a send message from an I2O controller bus address into 1210 * The I2O controller must be informed that the reply message is not needed
734 * a Linux virtual address. The shared page frame is a linear block 1211 * anymore. If you forget to flush the reply, the message frame can't be
735 * so we simply have to shift the offset. This function does not 1212 * used by the controller anymore and is therefore lost.
736 * work for receive side messages as they are kmalloc objects
737 * in a different pool.
738 */ 1213 */
739static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct 1214static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
740 i2o_controller *c,
741 u32 m)
742{ 1215{
743 return c->in_queue.virt + m; 1216 writel(m, c->out_port);
744}; 1217};
745 1218
746/* 1219/*
@@ -779,350 +1252,5 @@ extern void i2o_dump_message(struct i2o_message *);
779extern void i2o_dump_hrt(struct i2o_controller *c); 1252extern void i2o_dump_hrt(struct i2o_controller *c);
780extern void i2o_debug_state(struct i2o_controller *c); 1253extern void i2o_debug_state(struct i2o_controller *c);
781 1254
782/*
783 * Cache strategies
784 */
785
786/* The NULL strategy leaves everything up to the controller. This tends to be a
787 * pessimal but functional choice.
788 */
789#define CACHE_NULL 0
790/* Prefetch data when reading. We continually attempt to load the next 32 sectors
791 * into the controller cache.
792 */
793#define CACHE_PREFETCH 1
794/* Prefetch data when reading. We sometimes attempt to load the next 32 sectors
795 * into the controller cache. When an I/O is <= 8K we assume it's probably
796 * not sequential and don't prefetch (default)
797 */
798#define CACHE_SMARTFETCH 2
799/* Data is written to the cache and then out on to the disk. The I/O must be
800 * physically on the medium before the write is acknowledged (default without
801 * NVRAM)
802 */
803#define CACHE_WRITETHROUGH 17
804/* Data is written to the cache and then out on to the disk. The controller
805 * is permitted to write back the cache any way it wants. (default if battery
806 * backed NVRAM is present). It can be useful to set this for swap regardless of
807 * battery state.
808 */
809#define CACHE_WRITEBACK 18
810/* Optimise for underpowered controllers, especially on RAID1 and RAID0. We
811 * write large I/Os directly to disk, bypassing the cache, to avoid the extra
812 * memory copy hits. Small writes are writeback cached
813 */
814#define CACHE_SMARTBACK 19
815/* Optimise for underpowered controllers, especially on RAID1 and RAID0. We
816 * write large I/Os directly to disk, bypassing the cache, to avoid the extra
817 * memory copy hits. Small writes are writethrough cached. Suitable for devices
818 * lacking battery backup
819 */
820#define CACHE_SMARTTHROUGH 20
821
822/*
823 * Ioctl structures
824 */
825
826#define BLKI2OGRSTRAT _IOR('2', 1, int)
827#define BLKI2OGWSTRAT _IOR('2', 2, int)
828#define BLKI2OSRSTRAT _IOW('2', 3, int)
829#define BLKI2OSWSTRAT _IOW('2', 4, int)
830
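These constants formed the legacy userspace control interface that this hunk deletes; a hedged sketch of how it was driven, assuming the pre-removal header definitions are still visible to the program:

#include <stdio.h>
#include <sys/ioctl.h>

static int set_writeback(int fd)
{
	int strategy = 18;	/* CACHE_WRITEBACK from the old header */

	if (ioctl(fd, BLKI2OSWSTRAT, &strategy) < 0) {
		perror("BLKI2OSWSTRAT");
		return -1;
	}
	return 0;
}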
831/*
832 * I2O Function codes
833 */
834
835/*
836 * Executive Class
837 */
838#define I2O_CMD_ADAPTER_ASSIGN 0xB3
839#define I2O_CMD_ADAPTER_READ 0xB2
840#define I2O_CMD_ADAPTER_RELEASE 0xB5
841#define I2O_CMD_BIOS_INFO_SET 0xA5
842#define I2O_CMD_BOOT_DEVICE_SET 0xA7
843#define I2O_CMD_CONFIG_VALIDATE 0xBB
844#define I2O_CMD_CONN_SETUP 0xCA
845#define I2O_CMD_DDM_DESTROY 0xB1
846#define I2O_CMD_DDM_ENABLE 0xD5
847#define I2O_CMD_DDM_QUIESCE 0xC7
848#define I2O_CMD_DDM_RESET 0xD9
849#define I2O_CMD_DDM_SUSPEND 0xAF
850#define I2O_CMD_DEVICE_ASSIGN 0xB7
851#define I2O_CMD_DEVICE_RELEASE 0xB9
852#define I2O_CMD_HRT_GET 0xA8
853#define I2O_CMD_ADAPTER_CLEAR 0xBE
854#define I2O_CMD_ADAPTER_CONNECT 0xC9
855#define I2O_CMD_ADAPTER_RESET 0xBD
856#define I2O_CMD_LCT_NOTIFY 0xA2
857#define I2O_CMD_OUTBOUND_INIT 0xA1
858#define I2O_CMD_PATH_ENABLE 0xD3
859#define I2O_CMD_PATH_QUIESCE 0xC5
860#define I2O_CMD_PATH_RESET 0xD7
861#define I2O_CMD_STATIC_MF_CREATE 0xDD
862#define I2O_CMD_STATIC_MF_RELEASE 0xDF
863#define I2O_CMD_STATUS_GET 0xA0
864#define I2O_CMD_SW_DOWNLOAD 0xA9
865#define I2O_CMD_SW_UPLOAD 0xAB
866#define I2O_CMD_SW_REMOVE 0xAD
867#define I2O_CMD_SYS_ENABLE 0xD1
868#define I2O_CMD_SYS_MODIFY 0xC1
869#define I2O_CMD_SYS_QUIESCE 0xC3
870#define I2O_CMD_SYS_TAB_SET 0xA3
871
872/*
873 * Utility Class
874 */
875#define I2O_CMD_UTIL_NOP 0x00
876#define I2O_CMD_UTIL_ABORT 0x01
877#define I2O_CMD_UTIL_CLAIM 0x09
878#define I2O_CMD_UTIL_RELEASE 0x0B
879#define I2O_CMD_UTIL_PARAMS_GET 0x06
880#define I2O_CMD_UTIL_PARAMS_SET 0x05
881#define I2O_CMD_UTIL_EVT_REGISTER 0x13
882#define I2O_CMD_UTIL_EVT_ACK 0x14
883#define I2O_CMD_UTIL_CONFIG_DIALOG 0x10
884#define I2O_CMD_UTIL_DEVICE_RESERVE 0x0D
885#define I2O_CMD_UTIL_DEVICE_RELEASE 0x0F
886#define I2O_CMD_UTIL_LOCK 0x17
887#define I2O_CMD_UTIL_LOCK_RELEASE 0x19
888#define I2O_CMD_UTIL_REPLY_FAULT_NOTIFY 0x15
889
890/*
891 * SCSI Host Bus Adapter Class
892 */
893#define I2O_CMD_SCSI_EXEC 0x81
894#define I2O_CMD_SCSI_ABORT 0x83
895#define I2O_CMD_SCSI_BUSRESET 0x27
896
897/*
898 * Bus Adapter Class
899 */
900#define I2O_CMD_BUS_ADAPTER_RESET 0x85
901#define I2O_CMD_BUS_RESET 0x87
902#define I2O_CMD_BUS_SCAN 0x89
903#define I2O_CMD_BUS_QUIESCE 0x8b
904
905/*
906 * Random Block Storage Class
907 */
908#define I2O_CMD_BLOCK_READ 0x30
909#define I2O_CMD_BLOCK_WRITE 0x31
910#define I2O_CMD_BLOCK_CFLUSH 0x37
911#define I2O_CMD_BLOCK_MLOCK 0x49
912#define I2O_CMD_BLOCK_MUNLOCK 0x4B
913#define I2O_CMD_BLOCK_MMOUNT 0x41
914#define I2O_CMD_BLOCK_MEJECT 0x43
915#define I2O_CMD_BLOCK_POWER 0x70
916
917#define I2O_CMD_PRIVATE 0xFF
918
919/* Command status values */
920
921#define I2O_CMD_IN_PROGRESS 0x01
922#define I2O_CMD_REJECTED 0x02
923#define I2O_CMD_FAILED 0x03
924#define I2O_CMD_COMPLETED 0x04
925
926/* I2O API function return values */
927
928#define I2O_RTN_NO_ERROR 0
929#define I2O_RTN_NOT_INIT 1
930#define I2O_RTN_FREE_Q_EMPTY 2
931#define I2O_RTN_TCB_ERROR 3
932#define I2O_RTN_TRANSACTION_ERROR 4
933#define I2O_RTN_ADAPTER_ALREADY_INIT 5
934#define I2O_RTN_MALLOC_ERROR 6
935#define I2O_RTN_ADPTR_NOT_REGISTERED 7
936#define I2O_RTN_MSG_REPLY_TIMEOUT 8
937#define I2O_RTN_NO_STATUS 9
938#define I2O_RTN_NO_FIRM_VER 10
939#define I2O_RTN_NO_LINK_SPEED 11
940
941/* Reply message status defines for all messages */
942
943#define I2O_REPLY_STATUS_SUCCESS 0x00
944#define I2O_REPLY_STATUS_ABORT_DIRTY 0x01
945#define I2O_REPLY_STATUS_ABORT_NO_DATA_TRANSFER 0x02
946#define I2O_REPLY_STATUS_ABORT_PARTIAL_TRANSFER 0x03
947#define I2O_REPLY_STATUS_ERROR_DIRTY 0x04
948#define I2O_REPLY_STATUS_ERROR_NO_DATA_TRANSFER 0x05
949#define I2O_REPLY_STATUS_ERROR_PARTIAL_TRANSFER 0x06
950#define I2O_REPLY_STATUS_PROCESS_ABORT_DIRTY 0x08
951#define I2O_REPLY_STATUS_PROCESS_ABORT_NO_DATA_TRANSFER 0x09
952#define I2O_REPLY_STATUS_PROCESS_ABORT_PARTIAL_TRANSFER 0x0A
953#define I2O_REPLY_STATUS_TRANSACTION_ERROR 0x0B
954#define I2O_REPLY_STATUS_PROGRESS_REPORT 0x80
955
956/* Status codes and Error Information for Parameter functions */
957
958#define I2O_PARAMS_STATUS_SUCCESS 0x00
959#define I2O_PARAMS_STATUS_BAD_KEY_ABORT 0x01
960#define I2O_PARAMS_STATUS_BAD_KEY_CONTINUE 0x02
961#define I2O_PARAMS_STATUS_BUFFER_FULL 0x03
962#define I2O_PARAMS_STATUS_BUFFER_TOO_SMALL 0x04
963#define I2O_PARAMS_STATUS_FIELD_UNREADABLE 0x05
964#define I2O_PARAMS_STATUS_FIELD_UNWRITEABLE 0x06
965#define I2O_PARAMS_STATUS_INSUFFICIENT_FIELDS 0x07
966#define I2O_PARAMS_STATUS_INVALID_GROUP_ID 0x08
967#define I2O_PARAMS_STATUS_INVALID_OPERATION 0x09
968#define I2O_PARAMS_STATUS_NO_KEY_FIELD 0x0A
969#define I2O_PARAMS_STATUS_NO_SUCH_FIELD 0x0B
970#define I2O_PARAMS_STATUS_NON_DYNAMIC_GROUP 0x0C
971#define I2O_PARAMS_STATUS_OPERATION_ERROR 0x0D
972#define I2O_PARAMS_STATUS_SCALAR_ERROR 0x0E
973#define I2O_PARAMS_STATUS_TABLE_ERROR 0x0F
974#define I2O_PARAMS_STATUS_WRONG_GROUP_TYPE 0x10
975
976/* DetailedStatusCode defines for Executive, DDM, Util and Transaction error
977 * messages: Table 3-2 Detailed Status Codes.*/
978
979#define I2O_DSC_SUCCESS 0x0000
980#define I2O_DSC_BAD_KEY 0x0002
981#define I2O_DSC_TCL_ERROR 0x0003
982#define I2O_DSC_REPLY_BUFFER_FULL 0x0004
983#define I2O_DSC_NO_SUCH_PAGE 0x0005
984#define I2O_DSC_INSUFFICIENT_RESOURCE_SOFT 0x0006
985#define I2O_DSC_INSUFFICIENT_RESOURCE_HARD 0x0007
986#define I2O_DSC_CHAIN_BUFFER_TOO_LARGE 0x0009
987#define I2O_DSC_UNSUPPORTED_FUNCTION 0x000A
988#define I2O_DSC_DEVICE_LOCKED 0x000B
989#define I2O_DSC_DEVICE_RESET 0x000C
990#define I2O_DSC_INAPPROPRIATE_FUNCTION 0x000D
991#define I2O_DSC_INVALID_INITIATOR_ADDRESS 0x000E
992#define I2O_DSC_INVALID_MESSAGE_FLAGS 0x000F
993#define I2O_DSC_INVALID_OFFSET 0x0010
994#define I2O_DSC_INVALID_PARAMETER 0x0011
995#define I2O_DSC_INVALID_REQUEST 0x0012
996#define I2O_DSC_INVALID_TARGET_ADDRESS 0x0013
997#define I2O_DSC_MESSAGE_TOO_LARGE 0x0014
998#define I2O_DSC_MESSAGE_TOO_SMALL 0x0015
999#define I2O_DSC_MISSING_PARAMETER 0x0016
1000#define I2O_DSC_TIMEOUT 0x0017
1001#define I2O_DSC_UNKNOWN_ERROR 0x0018
1002#define I2O_DSC_UNKNOWN_FUNCTION 0x0019
1003#define I2O_DSC_UNSUPPORTED_VERSION 0x001A
1004#define I2O_DSC_DEVICE_BUSY 0x001B
1005#define I2O_DSC_DEVICE_NOT_AVAILABLE 0x001C
1006
1007/* DetailedStatusCode defines for Block Storage Operation: Table 6-7 Detailed
1008 Status Codes.*/
1009
1010#define I2O_BSA_DSC_SUCCESS 0x0000
1011#define I2O_BSA_DSC_MEDIA_ERROR 0x0001
1012#define I2O_BSA_DSC_ACCESS_ERROR 0x0002
1013#define I2O_BSA_DSC_DEVICE_FAILURE 0x0003
1014#define I2O_BSA_DSC_DEVICE_NOT_READY 0x0004
1015#define I2O_BSA_DSC_MEDIA_NOT_PRESENT 0x0005
1016#define I2O_BSA_DSC_MEDIA_LOCKED 0x0006
1017#define I2O_BSA_DSC_MEDIA_FAILURE 0x0007
1018#define I2O_BSA_DSC_PROTOCOL_FAILURE 0x0008
1019#define I2O_BSA_DSC_BUS_FAILURE 0x0009
1020#define I2O_BSA_DSC_ACCESS_VIOLATION 0x000A
1021#define I2O_BSA_DSC_WRITE_PROTECTED 0x000B
1022#define I2O_BSA_DSC_DEVICE_RESET 0x000C
1023#define I2O_BSA_DSC_VOLUME_CHANGED 0x000D
1024#define I2O_BSA_DSC_TIMEOUT 0x000E
1025
1026/* FailureStatusCodes, Table 3-3 Message Failure Codes */
1027
1028#define I2O_FSC_TRANSPORT_SERVICE_SUSPENDED 0x81
1029#define I2O_FSC_TRANSPORT_SERVICE_TERMINATED 0x82
1030#define I2O_FSC_TRANSPORT_CONGESTION 0x83
1031#define I2O_FSC_TRANSPORT_FAILURE 0x84
1032#define I2O_FSC_TRANSPORT_STATE_ERROR 0x85
1033#define I2O_FSC_TRANSPORT_TIME_OUT 0x86
1034#define I2O_FSC_TRANSPORT_ROUTING_FAILURE 0x87
1035#define I2O_FSC_TRANSPORT_INVALID_VERSION 0x88
1036#define I2O_FSC_TRANSPORT_INVALID_OFFSET 0x89
1037#define I2O_FSC_TRANSPORT_INVALID_MSG_FLAGS 0x8A
1038#define I2O_FSC_TRANSPORT_FRAME_TOO_SMALL 0x8B
1039#define I2O_FSC_TRANSPORT_FRAME_TOO_LARGE 0x8C
1040#define I2O_FSC_TRANSPORT_INVALID_TARGET_ID 0x8D
1041#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_ID 0x8E
1042#define I2O_FSC_TRANSPORT_INVALID_INITIATOR_CONTEXT 0x8F
1043#define I2O_FSC_TRANSPORT_UNKNOWN_FAILURE 0xFF
1044
1045/* Device Claim Types */
1046#define I2O_CLAIM_PRIMARY 0x01000000
1047#define I2O_CLAIM_MANAGEMENT 0x02000000
1048#define I2O_CLAIM_AUTHORIZED 0x03000000
1049#define I2O_CLAIM_SECONDARY 0x04000000
1050
1051/* Message header defines for VersionOffset */
1052#define I2OVER15 0x0001
1053#define I2OVER20 0x0002
1054
1055/* Default is 1.5 */
1056#define I2OVERSION I2OVER15
1057
1058#define SGL_OFFSET_0 I2OVERSION
1059#define SGL_OFFSET_4 (0x0040 | I2OVERSION)
1060#define SGL_OFFSET_5 (0x0050 | I2OVERSION)
1061#define SGL_OFFSET_6 (0x0060 | I2OVERSION)
1062#define SGL_OFFSET_7 (0x0070 | I2OVERSION)
1063#define SGL_OFFSET_8 (0x0080 | I2OVERSION)
1064#define SGL_OFFSET_9 (0x0090 | I2OVERSION)
1065#define SGL_OFFSET_10 (0x00A0 | I2OVERSION)
1066#define SGL_OFFSET_11 (0x00B0 | I2OVERSION)
1067#define SGL_OFFSET_12 (0x00C0 | I2OVERSION)
1068#define SGL_OFFSET(x) (((x)<<4) | I2OVERSION)
1069
1070/* Transaction Reply Lists (TRL) Control Word structure */
1071#define TRL_SINGLE_FIXED_LENGTH 0x00
1072#define TRL_SINGLE_VARIABLE_LENGTH 0x40
1073#define TRL_MULTIPLE_FIXED_LENGTH 0x80
1074
1075 /* msg header defines for MsgFlags */
1076#define MSG_STATIC 0x0100
1077#define MSG_64BIT_CNTXT 0x0200
1078#define MSG_MULTI_TRANS 0x1000
1079#define MSG_FAIL 0x2000
1080#define MSG_FINAL 0x4000
1081#define MSG_REPLY 0x8000
1082
1083 /* minimum size msg */
1084#define THREE_WORD_MSG_SIZE 0x00030000
1085#define FOUR_WORD_MSG_SIZE 0x00040000
1086#define FIVE_WORD_MSG_SIZE 0x00050000
1087#define SIX_WORD_MSG_SIZE 0x00060000
1088#define SEVEN_WORD_MSG_SIZE 0x00070000
1089#define EIGHT_WORD_MSG_SIZE 0x00080000
1090#define NINE_WORD_MSG_SIZE 0x00090000
1091#define TEN_WORD_MSG_SIZE 0x000A0000
1092#define ELEVEN_WORD_MSG_SIZE 0x000B0000
1093#define I2O_MESSAGE_SIZE(x) ((x)<<16)
1094
1095/* special TID assignments */
1096#define ADAPTER_TID 0
1097#define HOST_TID 1
1098
1099/* outbound queue defines */
1100#define I2O_MAX_OUTBOUND_MSG_FRAMES 128
1101#define I2O_OUTBOUND_MSG_FRAME_SIZE 128 /* in 32-bit words */
1102
1103#define I2O_POST_WAIT_OK 0
1104#define I2O_POST_WAIT_TIMEOUT -ETIMEDOUT
1105
1106#define I2O_CONTEXT_LIST_MIN_LENGTH 15
1107#define I2O_CONTEXT_LIST_USED 0x01
1108#define I2O_CONTEXT_LIST_DELETED 0x02
1109
1110/* timeouts */
1111#define I2O_TIMEOUT_INIT_OUTBOUND_QUEUE 15
1112#define I2O_TIMEOUT_MESSAGE_GET 5
1113#define I2O_TIMEOUT_RESET 30
1114#define I2O_TIMEOUT_STATUS_GET 5
1115#define I2O_TIMEOUT_LCT_GET 360
1116#define I2O_TIMEOUT_SCSI_SCB_ABORT 240
1117
1118/* retries */
1119#define I2O_HRT_GET_TRIES 3
1120#define I2O_LCT_GET_TRIES 3
1121
1122/* defines for max_sectors and max_phys_segments */
1123#define I2O_MAX_SECTORS 1024
1124#define I2O_MAX_SECTORS_LIMITED 256
1125#define I2O_MAX_PHYS_SEGMENTS MAX_PHYS_SEGMENTS
1126
1127#endif /* __KERNEL__ */ 1255#endif /* __KERNEL__ */
1128#endif /* _I2O_H */ 1256#endif /* _I2O_H */
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f04ba20712a2..6c5d4c898ccb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -12,7 +12,7 @@
12#include <linux/config.h> 12#include <linux/config.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14 14
15#if !defined(CONFIG_ARCH_S390) 15#if !defined(CONFIG_S390)
16 16
17#include <linux/linkage.h> 17#include <linux/linkage.h>
18#include <linux/cache.h> 18#include <linux/cache.h>
@@ -221,6 +221,17 @@ extern void note_interrupt(unsigned int irq, irq_desc_t *desc,
221extern int can_request_irq(unsigned int irq, unsigned long irqflags); 221extern int can_request_irq(unsigned int irq, unsigned long irqflags);
222 222
223extern void init_irq_proc(void); 223extern void init_irq_proc(void);
224
225#ifdef CONFIG_AUTO_IRQ_AFFINITY
226extern int select_smp_affinity(unsigned int irq);
227#else
228static inline int
229select_smp_affinity(unsigned int irq)
230{
231 return 1;
232}
233#endif
234
224#endif 235#endif
225 236
226extern hw_irq_controller no_irq_type; /* needed in every arch ? */ 237extern hw_irq_controller no_irq_type; /* needed in every arch ? */
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index dcde7adfdce5..558cb4c26ec9 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -498,6 +498,12 @@ struct transaction_s
498 struct journal_head *t_checkpoint_list; 498 struct journal_head *t_checkpoint_list;
499 499
500 /* 500 /*
501 * Doubly-linked circular list of all buffers submitted for IO while
502 * checkpointing. [j_list_lock]
503 */
504 struct journal_head *t_checkpoint_io_list;
505
506 /*
501 * Doubly-linked circular list of temporary buffers currently undergoing 507 * Doubly-linked circular list of temporary buffers currently undergoing
502 * IO in the log [j_list_lock] 508 * IO in the log [j_list_lock]
503 */ 509 */
@@ -843,7 +849,7 @@ extern void journal_commit_transaction(journal_t *);
843 849
844/* Checkpoint list management */ 850/* Checkpoint list management */
845int __journal_clean_checkpoint_list(journal_t *journal); 851int __journal_clean_checkpoint_list(journal_t *journal);
846void __journal_remove_checkpoint(struct journal_head *); 852int __journal_remove_checkpoint(struct journal_head *);
847void __journal_insert_checkpoint(struct journal_head *, transaction_t *); 853void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
848 854
849/* Buffer IO */ 855/* Buffer IO */
diff --git a/include/linux/key.h b/include/linux/key.h
index 53513a3be53b..4d189e51bc6c 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -193,14 +193,6 @@ struct key_type {
193 */ 193 */
194 int (*instantiate)(struct key *key, const void *data, size_t datalen); 194 int (*instantiate)(struct key *key, const void *data, size_t datalen);
195 195
196 /* duplicate a key of this type (optional)
197 * - the source key will be locked against change
198 * - the new description will be attached
199 * - the quota will have been adjusted automatically from
200 * source->quotalen
201 */
202 int (*duplicate)(struct key *key, const struct key *source);
203
204 /* update a key of this type (optional) 196 /* update a key of this type (optional)
205 * - this method should call key_payload_reserve() to recalculate the 197 * - this method should call key_payload_reserve() to recalculate the
206 * quota consumption 198 * quota consumption
diff --git a/include/linux/libata.h b/include/linux/libata.h
index e828e172ccbf..a43c95f8f968 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -124,6 +124,8 @@ enum {
124 ATA_FLAG_DEBUGMSG = (1 << 10), 124 ATA_FLAG_DEBUGMSG = (1 << 10),
125 ATA_FLAG_NO_ATAPI = (1 << 11), /* No ATAPI support */ 125 ATA_FLAG_NO_ATAPI = (1 << 11), /* No ATAPI support */
126 126
127 ATA_FLAG_SUSPENDED = (1 << 12), /* port is suspended */
128
127 ATA_QCFLAG_ACTIVE = (1 << 1), /* cmd not yet ack'd to scsi layer */ 129 ATA_QCFLAG_ACTIVE = (1 << 1), /* cmd not yet ack'd to scsi layer */
128 ATA_QCFLAG_SG = (1 << 3), /* have s/g table? */ 130 ATA_QCFLAG_SG = (1 << 3), /* have s/g table? */
129 ATA_QCFLAG_SINGLE = (1 << 4), /* no s/g, just a single buffer */ 131 ATA_QCFLAG_SINGLE = (1 << 4), /* no s/g, just a single buffer */
@@ -436,6 +438,8 @@ extern void ata_std_ports(struct ata_ioports *ioaddr);
436extern int ata_pci_init_one (struct pci_dev *pdev, struct ata_port_info **port_info, 438extern int ata_pci_init_one (struct pci_dev *pdev, struct ata_port_info **port_info,
437 unsigned int n_ports); 439 unsigned int n_ports);
438extern void ata_pci_remove_one (struct pci_dev *pdev); 440extern void ata_pci_remove_one (struct pci_dev *pdev);
441extern int ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t state);
442extern int ata_pci_device_resume(struct pci_dev *pdev);
439#endif /* CONFIG_PCI */ 443#endif /* CONFIG_PCI */
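A hedged sketch of how a libata PCI driver might wire the new power-management entry points into its struct pci_driver; the driver name and (empty) ID table are illustrative:

static const struct pci_device_id example_ids[] = {
	{ }	/* terminating entry; real drivers list their devices */
};

static struct pci_driver example_ata_driver = {
	.name		= "example_ata",
	.id_table	= example_ids,
	.suspend	= ata_pci_device_suspend,
	.resume		= ata_pci_device_resume,
};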
440extern int ata_device_add(const struct ata_probe_ent *ent); 444extern int ata_device_add(const struct ata_probe_ent *ent);
441extern void ata_host_set_remove(struct ata_host_set *host_set); 445extern void ata_host_set_remove(struct ata_host_set *host_set);
@@ -445,6 +449,10 @@ extern int ata_scsi_queuecmd(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmn
445extern int ata_scsi_error(struct Scsi_Host *host); 449extern int ata_scsi_error(struct Scsi_Host *host);
446extern int ata_scsi_release(struct Scsi_Host *host); 450extern int ata_scsi_release(struct Scsi_Host *host);
447extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc); 451extern unsigned int ata_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
452extern int ata_scsi_device_resume(struct scsi_device *);
453extern int ata_scsi_device_suspend(struct scsi_device *);
454extern int ata_device_resume(struct ata_port *, struct ata_device *);
455extern int ata_device_suspend(struct ata_port *, struct ata_device *);
448extern int ata_ratelimit(void); 456extern int ata_ratelimit(void);
449 457
450/* 458/*
@@ -480,7 +488,8 @@ extern u8 ata_bmdma_status(struct ata_port *ap);
480extern void ata_bmdma_irq_clear(struct ata_port *ap); 488extern void ata_bmdma_irq_clear(struct ata_port *ap);
481extern void ata_qc_complete(struct ata_queued_cmd *qc); 489extern void ata_qc_complete(struct ata_queued_cmd *qc);
482extern void ata_eng_timeout(struct ata_port *ap); 490extern void ata_eng_timeout(struct ata_port *ap);
483extern void ata_scsi_simulate(u16 *id, struct scsi_cmnd *cmd, 491extern void ata_scsi_simulate(struct ata_port *ap, struct ata_device *dev,
492 struct scsi_cmnd *cmd,
484 void (*done)(struct scsi_cmnd *)); 493 void (*done)(struct scsi_cmnd *));
485extern int ata_std_bios_param(struct scsi_device *sdev, 494extern int ata_std_bios_param(struct scsi_device *sdev,
486 struct block_device *bdev, 495 struct block_device *bdev,
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8b67cf837ca9..ed00b278cb93 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -110,14 +110,6 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
110#define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL) 110#define mpol_set_vma_default(vma) ((vma)->vm_policy = NULL)
111 111
112/* 112/*
113 * Hugetlb policy. i386 hugetlb so far works with node numbers
114 * instead of zone lists, so give it special interfaces for now.
115 */
116extern int mpol_first_node(struct vm_area_struct *vma, unsigned long addr);
117extern int mpol_node_valid(int nid, struct vm_area_struct *vma,
118 unsigned long addr);
119
120/*
121 * Tree of shared policies for a shared memory region. 113 * Tree of shared policies for a shared memory region.
122 * Maintain the policies in a pseudo mm that contains vmas. The vmas 114 * Maintain the policies in a pseudo mm that contains vmas. The vmas
123 * carry the policy. As a special twist the pseudo mm is indexed in pages, not 115 * carry the policy. As a special twist the pseudo mm is indexed in pages, not
@@ -156,6 +148,16 @@ extern void numa_default_policy(void);
156extern void numa_policy_init(void); 148extern void numa_policy_init(void);
157extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); 149extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
158extern struct mempolicy default_policy; 150extern struct mempolicy default_policy;
151extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
152 unsigned long addr);
153
154extern int policy_zone;
155
156static inline void check_highest_zone(int k)
157{
158 if (k > policy_zone)
159 policy_zone = k;
160}
159 161
160#else 162#else
161 163
@@ -182,17 +184,6 @@ static inline struct mempolicy *mpol_copy(struct mempolicy *old)
182 return NULL; 184 return NULL;
183} 185}
184 186
185static inline int mpol_first_node(struct vm_area_struct *vma, unsigned long a)
186{
187 return numa_node_id();
188}
189
190static inline int
191mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long a)
192{
193 return 1;
194}
195
196struct shared_policy {}; 187struct shared_policy {};
197 188
198static inline int mpol_set_shared_policy(struct shared_policy *info, 189static inline int mpol_set_shared_policy(struct shared_policy *info,
@@ -232,6 +223,15 @@ static inline void numa_policy_rebind(const nodemask_t *old,
232{ 223{
233} 224}
234 225
226static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
227 unsigned long addr)
228{
229 return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
230}
231
232static inline void check_highest_zone(int k)
233{
234}
235#endif /* CONFIG_NUMA */ 235#endif /* CONFIG_NUMA */
236#endif /* __KERNEL__ */ 236#endif /* __KERNEL__ */
237 237
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a06a84d347fb..bc01fff3aa01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -634,14 +634,38 @@ struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
634int shmem_lock(struct file *file, int lock, struct user_struct *user); 634int shmem_lock(struct file *file, int lock, struct user_struct *user);
635#else 635#else
636#define shmem_nopage filemap_nopage 636#define shmem_nopage filemap_nopage
637#define shmem_lock(a, b, c) ({0;}) /* always in memory, no need to lock */ 637
638#define shmem_set_policy(a, b) (0) 638static inline int shmem_lock(struct file *file, int lock,
639#define shmem_get_policy(a, b) (NULL) 639 struct user_struct *user)
640{
641 return 0;
642}
643
644static inline int shmem_set_policy(struct vm_area_struct *vma,
645 struct mempolicy *new)
646{
647 return 0;
648}
649
650static inline struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
651 unsigned long addr)
652{
653 return NULL;
654}
640#endif 655#endif
641struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); 656struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
657extern int shmem_mmap(struct file *file, struct vm_area_struct *vma);
642 658
643int shmem_zero_setup(struct vm_area_struct *); 659int shmem_zero_setup(struct vm_area_struct *);
644 660
661#ifndef CONFIG_MMU
662extern unsigned long shmem_get_unmapped_area(struct file *file,
663 unsigned long addr,
664 unsigned long len,
665 unsigned long pgoff,
666 unsigned long flags);
667#endif
668
645static inline int can_do_mlock(void) 669static inline int can_do_mlock(void)
646{ 670{
647 if (capable(CAP_IPC_LOCK)) 671 if (capable(CAP_IPC_LOCK))
@@ -690,14 +714,31 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
690} 714}
691 715
692extern int vmtruncate(struct inode * inode, loff_t offset); 716extern int vmtruncate(struct inode * inode, loff_t offset);
717extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
693extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); 718extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
694extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); 719extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
695extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
696 720
697static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) 721#ifdef CONFIG_MMU
722extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma,
723 unsigned long address, int write_access);
724
725static inline int handle_mm_fault(struct mm_struct *mm,
726 struct vm_area_struct *vma, unsigned long address,
727 int write_access)
698{ 728{
699 return __handle_mm_fault(mm, vma, address, write_access) & (~VM_FAULT_WRITE); 729 return __handle_mm_fault(mm, vma, address, write_access) &
730 (~VM_FAULT_WRITE);
700} 731}
732#else
733static inline int handle_mm_fault(struct mm_struct *mm,
734 struct vm_area_struct *vma, unsigned long address,
735 int write_access)
736{
737 /* should never happen if there's no MMU */
738 BUG();
739 return VM_FAULT_SIGBUS;
740}
741#endif
701 742
702extern int make_pages_present(unsigned long addr, unsigned long end); 743extern int make_pages_present(unsigned long addr, unsigned long end);
703extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); 744extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
@@ -896,6 +937,8 @@ extern unsigned long do_brk(unsigned long, unsigned long);
896/* filemap.c */ 937/* filemap.c */
897extern unsigned long page_unuse(struct page *); 938extern unsigned long page_unuse(struct page *);
898extern void truncate_inode_pages(struct address_space *, loff_t); 939extern void truncate_inode_pages(struct address_space *, loff_t);
940extern void truncate_inode_pages_range(struct address_space *,
941 loff_t lstart, loff_t lend);
899 942
900/* generic vm_area_ops exported for stackable file systems */ 943/* generic vm_area_ops exported for stackable file systems */
901extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); 944extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9f22090df7dd..c34f4a2c62f8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -46,7 +46,6 @@ struct zone_padding {
46 46
47struct per_cpu_pages { 47struct per_cpu_pages {
48 int count; /* number of pages in the list */ 48 int count; /* number of pages in the list */
49 int low; /* low watermark, refill needed */
50 int high; /* high watermark, emptying needed */ 49 int high; /* high watermark, emptying needed */
51 int batch; /* chunk size for buddy add/remove */ 50 int batch; /* chunk size for buddy add/remove */
52 struct list_head list; /* the list of pages */ 51 struct list_head list; /* the list of pages */
@@ -389,6 +388,11 @@ static inline struct zone *next_zone(struct zone *zone)
389#define for_each_zone(zone) \ 388#define for_each_zone(zone) \
390 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) 389 for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
391 390
391static inline int populated_zone(struct zone *zone)
392{
393 return (!!zone->present_pages);
394}
395
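The helper replaces open-coded zone->present_pages tests; a minimal sketch of its use in a zone walk:

static void example_walk_zones(void)
{
	struct zone *zone;

	for_each_zone(zone) {
		if (!populated_zone(zone))
			continue;
		/* ... operate only on zones that actually hold pages ... */
	}
}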
392static inline int is_highmem_idx(int idx) 396static inline int is_highmem_idx(int idx)
393{ 397{
394 return (idx == ZONE_HIGHMEM); 398 return (idx == ZONE_HIGHMEM);
@@ -398,6 +402,7 @@ static inline int is_normal_idx(int idx)
398{ 402{
399 return (idx == ZONE_NORMAL); 403 return (idx == ZONE_NORMAL);
400} 404}
405
401/** 406/**
402 * is_highmem - helper function to quickly check if a struct zone is a 407 * is_highmem - helper function to quickly check if a struct zone is a
403 * highmem zone or not. This is an attempt to keep references 408 * highmem zone or not. This is an attempt to keep references
@@ -414,6 +419,16 @@ static inline int is_normal(struct zone *zone)
414 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; 419 return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL;
415} 420}
416 421
422static inline int is_dma32(struct zone *zone)
423{
424 return zone == zone->zone_pgdat->node_zones + ZONE_DMA32;
425}
426
427static inline int is_dma(struct zone *zone)
428{
429 return zone == zone->zone_pgdat->node_zones + ZONE_DMA;
430}
431
417/* These two functions are used to setup the per zone pages min values */ 432/* These two functions are used to setup the per zone pages min values */
418struct ctl_table; 433struct ctl_table;
419struct file; 434struct file;
@@ -435,7 +450,6 @@ extern struct pglist_data contig_page_data;
435#define NODE_DATA(nid) (&contig_page_data) 450#define NODE_DATA(nid) (&contig_page_data)
436#define NODE_MEM_MAP(nid) mem_map 451#define NODE_MEM_MAP(nid) mem_map
437#define MAX_NODES_SHIFT 1 452#define MAX_NODES_SHIFT 1
438#define pfn_to_nid(pfn) (0)
439 453
440#else /* CONFIG_NEED_MULTIPLE_NODES */ 454#else /* CONFIG_NEED_MULTIPLE_NODES */
441 455
@@ -470,6 +484,10 @@ extern struct pglist_data contig_page_data;
470#define early_pfn_to_nid(nid) (0UL) 484#define early_pfn_to_nid(nid) (0UL)
471#endif 485#endif
472 486
487#ifdef CONFIG_FLATMEM
488#define pfn_to_nid(pfn) (0)
489#endif
490
473#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) 491#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
474#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) 492#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
475 493
@@ -564,11 +582,6 @@ static inline int valid_section_nr(unsigned long nr)
564 return valid_section(__nr_to_section(nr)); 582 return valid_section(__nr_to_section(nr));
565} 583}
566 584
567/*
568 * Given a kernel address, find the home node of the underlying memory.
569 */
570#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
571
572static inline struct mem_section *__pfn_to_section(unsigned long pfn) 585static inline struct mem_section *__pfn_to_section(unsigned long pfn)
573{ 586{
574 return __nr_to_section(pfn_to_section_nr(pfn)); 587 return __nr_to_section(pfn_to_section_nr(pfn));
@@ -598,13 +611,14 @@ static inline int pfn_valid(unsigned long pfn)
598 * this restriction. 611 * this restriction.
599 */ 612 */
600#ifdef CONFIG_NUMA 613#ifdef CONFIG_NUMA
601#define pfn_to_nid early_pfn_to_nid 614#define pfn_to_nid(pfn) \
602#endif
603
604#define pfn_to_pgdat(pfn) \
605({ \ 615({ \
606 NODE_DATA(pfn_to_nid(pfn)); \ 616 unsigned long __pfn_to_nid_pfn = (pfn); \
617 page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \
607}) 618})
619#else
620#define pfn_to_nid(pfn) (0)
621#endif
608 622
609#define early_pfn_valid(pfn) pfn_valid(pfn) 623#define early_pfn_valid(pfn) pfn_valid(pfn)
610void sparse_init(void); 624void sparse_init(void);
@@ -613,12 +627,6 @@ void sparse_init(void);
613#define sparse_index_init(_sec, _nid) do {} while (0) 627#define sparse_index_init(_sec, _nid) do {} while (0)
614#endif /* CONFIG_SPARSEMEM */ 628#endif /* CONFIG_SPARSEMEM */
615 629
616#ifdef CONFIG_NODES_SPAN_OTHER_NODES
617#define early_pfn_in_nid(pfn, nid) (early_pfn_to_nid(pfn) == (nid))
618#else
619#define early_pfn_in_nid(pfn, nid) (1)
620#endif
621
622#ifndef early_pfn_valid 630#ifndef early_pfn_valid
623#define early_pfn_valid(pfn) (1) 631#define early_pfn_valid(pfn) (1)
624#endif 632#endif
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index 090e210e98f0..f95d51fae733 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -37,18 +37,26 @@ enum {
37/* userspace doesn't need the nbd_device structure */ 37/* userspace doesn't need the nbd_device structure */
38#ifdef __KERNEL__ 38#ifdef __KERNEL__
39 39
40#include <linux/wait.h>
41
40/* values for flags field */ 42/* values for flags field */
41#define NBD_READ_ONLY 0x0001 43#define NBD_READ_ONLY 0x0001
42#define NBD_WRITE_NOCHK 0x0002 44#define NBD_WRITE_NOCHK 0x0002
43 45
46struct request;
47
44struct nbd_device { 48struct nbd_device {
45 int flags; 49 int flags;
46 int harderror; /* Code of hard error */ 50 int harderror; /* Code of hard error */
47 struct socket * sock; 51 struct socket * sock;
48 struct file * file; /* If == NULL, device is not ready, yet */ 52 struct file * file; /* If == NULL, device is not ready, yet */
49 int magic; 53 int magic;
54
50 spinlock_t queue_lock; 55 spinlock_t queue_lock;
51 struct list_head queue_head;/* Requests are added here... */ 56 struct list_head queue_head;/* Requests are added here... */
57 struct request *active_req;
58 wait_queue_head_t active_wq;
59
52 struct semaphore tx_lock; 60 struct semaphore tx_lock;
53 struct gendisk *disk; 61 struct gendisk *disk;
54 int blksize; 62 int blksize;
diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h
index 130d4f588a37..3f4f7142bbe3 100644
--- a/include/linux/nfsd/xdr.h
+++ b/include/linux/nfsd/xdr.h
@@ -88,10 +88,12 @@ struct nfsd_readdirargs {
88 88
89struct nfsd_attrstat { 89struct nfsd_attrstat {
90 struct svc_fh fh; 90 struct svc_fh fh;
91 struct kstat stat;
91}; 92};
92 93
93struct nfsd_diropres { 94struct nfsd_diropres {
94 struct svc_fh fh; 95 struct svc_fh fh;
96 struct kstat stat;
95}; 97};
96 98
97struct nfsd_readlinkres { 99struct nfsd_readlinkres {
@@ -101,6 +103,7 @@ struct nfsd_readlinkres {
101struct nfsd_readres { 103struct nfsd_readres {
102 struct svc_fh fh; 104 struct svc_fh fh;
103 unsigned long count; 105 unsigned long count;
106 struct kstat stat;
104}; 107};
105 108
106struct nfsd_readdirres { 109struct nfsd_readdirres {
diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h
index 3c2a71b43bac..a4322741f8b9 100644
--- a/include/linux/nfsd/xdr3.h
+++ b/include/linux/nfsd/xdr3.h
@@ -126,6 +126,7 @@ struct nfsd3_setaclargs {
126struct nfsd3_attrstat { 126struct nfsd3_attrstat {
127 __u32 status; 127 __u32 status;
128 struct svc_fh fh; 128 struct svc_fh fh;
129 struct kstat stat;
129}; 130};
130 131
131/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */ 132/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 343083fec258..d52999c43336 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -79,13 +79,23 @@
79/* 79/*
80 * Global page accounting. One instance per CPU. Only unsigned longs are 80 * Global page accounting. One instance per CPU. Only unsigned longs are
81 * allowed. 81 * allowed.
82 *
83 * - Fields can be modified with xxx_page_state and xxx_page_state_zone at
84 * any time safely (which protects the instance from modification by
85 * interrupt).
86 * - The __xxx_page_state variants can be used safely when interrupts are
87 * disabled.
88 * - The __xxx_page_state variants can be used if the field is only
89 * modified from process context, or only modified from interrupt context.
90 * In this case, the field should be commented here.
82 */ 91 */
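A hedged sketch of the rules above: nr_mapped is documented below as modified only from process context, so the interrupt-unsafe variant suffices for it, while a counter touched from any context takes the locked form:

static inline void example_account(void)
{
	__inc_page_state(nr_mapped);	/* process context only */
	inc_page_state(pgpgin);		/* safe from any context */
}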
83struct page_state { 92struct page_state {
84 unsigned long nr_dirty; /* Dirty writeable pages */ 93 unsigned long nr_dirty; /* Dirty writeable pages */
85 unsigned long nr_writeback; /* Pages under writeback */ 94 unsigned long nr_writeback; /* Pages under writeback */
86 unsigned long nr_unstable; /* NFS unstable pages */ 95 unsigned long nr_unstable; /* NFS unstable pages */
87 unsigned long nr_page_table_pages;/* Pages used for pagetables */ 96 unsigned long nr_page_table_pages;/* Pages used for pagetables */
88 unsigned long nr_mapped; /* mapped into pagetables */ 97 unsigned long nr_mapped; /* mapped into pagetables.
98 * only modified from process context */
89 unsigned long nr_slab; /* In slab */ 99 unsigned long nr_slab; /* In slab */
90#define GET_PAGE_STATE_LAST nr_slab 100#define GET_PAGE_STATE_LAST nr_slab
91 101
@@ -97,32 +107,40 @@ struct page_state {
97 unsigned long pgpgout; /* Disk writes */ 107 unsigned long pgpgout; /* Disk writes */
98 unsigned long pswpin; /* swap reads */ 108 unsigned long pswpin; /* swap reads */
99 unsigned long pswpout; /* swap writes */ 109 unsigned long pswpout; /* swap writes */
100 unsigned long pgalloc_high; /* page allocations */
101 110
111 unsigned long pgalloc_high; /* page allocations */
102 unsigned long pgalloc_normal; 112 unsigned long pgalloc_normal;
113 unsigned long pgalloc_dma32;
103 unsigned long pgalloc_dma; 114 unsigned long pgalloc_dma;
115
104 unsigned long pgfree; /* page freeings */ 116 unsigned long pgfree; /* page freeings */
105 unsigned long pgactivate; /* pages moved inactive->active */ 117 unsigned long pgactivate; /* pages moved inactive->active */
106 unsigned long pgdeactivate; /* pages moved active->inactive */ 118 unsigned long pgdeactivate; /* pages moved active->inactive */
107 119
108 unsigned long pgfault; /* faults (major+minor) */ 120 unsigned long pgfault; /* faults (major+minor) */
109 unsigned long pgmajfault; /* faults (major only) */ 121 unsigned long pgmajfault; /* faults (major only) */
122
110 unsigned long pgrefill_high; /* inspected in refill_inactive_zone */ 123 unsigned long pgrefill_high; /* inspected in refill_inactive_zone */
111 unsigned long pgrefill_normal; 124 unsigned long pgrefill_normal;
125 unsigned long pgrefill_dma32;
112 unsigned long pgrefill_dma; 126 unsigned long pgrefill_dma;
113 127
114 unsigned long pgsteal_high; /* total highmem pages reclaimed */ 128 unsigned long pgsteal_high; /* total highmem pages reclaimed */
115 unsigned long pgsteal_normal; 129 unsigned long pgsteal_normal;
130 unsigned long pgsteal_dma32;
116 unsigned long pgsteal_dma; 131 unsigned long pgsteal_dma;
132
117 unsigned long pgscan_kswapd_high;/* total highmem pages scanned */ 133 unsigned long pgscan_kswapd_high;/* total highmem pages scanned */
118 unsigned long pgscan_kswapd_normal; 134 unsigned long pgscan_kswapd_normal;
119 135 unsigned long pgscan_kswapd_dma32;
120 unsigned long pgscan_kswapd_dma; 136 unsigned long pgscan_kswapd_dma;
137
121 unsigned long pgscan_direct_high;/* total highmem pages scanned */ 138 unsigned long pgscan_direct_high;/* total highmem pages scanned */
122 unsigned long pgscan_direct_normal; 139 unsigned long pgscan_direct_normal;
140 unsigned long pgscan_direct_dma32;
123 unsigned long pgscan_direct_dma; 141 unsigned long pgscan_direct_dma;
124 unsigned long pginodesteal; /* pages reclaimed via inode freeing */
125 142
143 unsigned long pginodesteal; /* pages reclaimed via inode freeing */
126 unsigned long slabs_scanned; /* slab objects scanned */ 144 unsigned long slabs_scanned; /* slab objects scanned */
127 unsigned long kswapd_steal; /* pages reclaimed by kswapd */ 145 unsigned long kswapd_steal; /* pages reclaimed by kswapd */
128 unsigned long kswapd_inodesteal;/* reclaimed via kswapd inode freeing */ 146 unsigned long kswapd_inodesteal;/* reclaimed via kswapd inode freeing */
@@ -136,31 +154,54 @@ struct page_state {
136extern void get_page_state(struct page_state *ret); 154extern void get_page_state(struct page_state *ret);
137extern void get_page_state_node(struct page_state *ret, int node); 155extern void get_page_state_node(struct page_state *ret, int node);
138extern void get_full_page_state(struct page_state *ret); 156extern void get_full_page_state(struct page_state *ret);
139extern unsigned long __read_page_state(unsigned long offset); 157extern unsigned long read_page_state_offset(unsigned long offset);
140extern void __mod_page_state(unsigned long offset, unsigned long delta); 158extern void mod_page_state_offset(unsigned long offset, unsigned long delta);
159extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
141 160
142#define read_page_state(member) \ 161#define read_page_state(member) \
143 __read_page_state(offsetof(struct page_state, member)) 162 read_page_state_offset(offsetof(struct page_state, member))
144 163
145#define mod_page_state(member, delta) \ 164#define mod_page_state(member, delta) \
146 __mod_page_state(offsetof(struct page_state, member), (delta)) 165 mod_page_state_offset(offsetof(struct page_state, member), (delta))
147 166
148#define inc_page_state(member) mod_page_state(member, 1UL) 167#define __mod_page_state(member, delta) \
149#define dec_page_state(member) mod_page_state(member, 0UL - 1) 168 __mod_page_state_offset(offsetof(struct page_state, member), (delta))
150#define add_page_state(member,delta) mod_page_state(member, (delta)) 169
151#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta)) 170#define inc_page_state(member) mod_page_state(member, 1UL)
152 171#define dec_page_state(member) mod_page_state(member, 0UL - 1)
153#define mod_page_state_zone(zone, member, delta) \ 172#define add_page_state(member,delta) mod_page_state(member, (delta))
154 do { \ 173#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
155 unsigned offset; \ 174
156 if (is_highmem(zone)) \ 175#define __inc_page_state(member) __mod_page_state(member, 1UL)
157 offset = offsetof(struct page_state, member##_high); \ 176#define __dec_page_state(member) __mod_page_state(member, 0UL - 1)
158 else if (is_normal(zone)) \ 177#define __add_page_state(member,delta) __mod_page_state(member, (delta))
159 offset = offsetof(struct page_state, member##_normal); \ 178#define __sub_page_state(member,delta) __mod_page_state(member, 0UL - (delta))
160 else \ 179
161 offset = offsetof(struct page_state, member##_dma); \ 180#define page_state(member) (*__page_state(offsetof(struct page_state, member)))
162 __mod_page_state(offset, (delta)); \ 181
163 } while (0) 182#define state_zone_offset(zone, member) \
183({ \
184 unsigned offset; \
185 if (is_highmem(zone)) \
186 offset = offsetof(struct page_state, member##_high); \
187 else if (is_normal(zone)) \
188 offset = offsetof(struct page_state, member##_normal); \
189 else if (is_dma32(zone)) \
190 offset = offsetof(struct page_state, member##_dma32); \
191 else \
192 offset = offsetof(struct page_state, member##_dma); \
193 offset; \
194})
195
196#define __mod_page_state_zone(zone, member, delta) \
197 do { \
198 __mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
199 } while (0)
200
201#define mod_page_state_zone(zone, member, delta) \
202 do { \
203 mod_page_state_offset(state_zone_offset(zone, member), (delta)); \
204 } while (0)
164 205
165/* 206/*
166 * Manipulation of page state flags 207 * Manipulation of page state flags
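
The page-flags.h hunk above replaces an open-coded zone test with a statement-expression macro and splits each counter update into an offset lookup plus one generic modifier. A minimal userspace sketch of that shape, assuming nothing beyond GCC statement expressions; the names stats, mod_stat and STAT_ZONE_OFFSET are illustrative, not the kernel's, and the zone choice is reduced to a single highmem flag:

#include <stdio.h>
#include <stddef.h>

struct stats {
    unsigned long pgalloc_high;
    unsigned long pgalloc_normal;
    unsigned long pgalloc_dma32;
    unsigned long pgalloc_dma;
};

static struct stats stats;

/* generic modifier: bump whichever field lives 'offset' bytes in */
static void mod_stat(size_t offset, unsigned long delta)
{
    unsigned long *field = (unsigned long *)((char *)&stats + offset);
    *field += delta;
}

/* GCC statement expression, as in state_zone_offset() above: paste the
 * zone suffix onto the member name and hand back its byte offset */
#define STAT_ZONE_OFFSET(highmem, member)                       \
({                                                              \
    size_t off;                                                 \
    if (highmem)                                                \
        off = offsetof(struct stats, member##_high);            \
    else                                                        \
        off = offsetof(struct stats, member##_normal);          \
    off;                                                        \
})

int main(void)
{
    mod_stat(STAT_ZONE_OFFSET(1, pgalloc), 1);  /* pgalloc_high += 1 */
    mod_stat(STAT_ZONE_OFFSET(0, pgalloc), 2);  /* pgalloc_normal += 2 */
    printf("%lu %lu\n", stats.pgalloc_high, stats.pgalloc_normal);
    return 0;
}
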
diff --git a/include/linux/parport.h b/include/linux/parport.h
index d2a4d9e1e6d1..f7ff0b0c4031 100644
--- a/include/linux/parport.h
+++ b/include/linux/parport.h
@@ -242,7 +242,6 @@ enum ieee1284_phase {
242 IEEE1284_PH_FWD_IDLE, 242 IEEE1284_PH_FWD_IDLE,
243 IEEE1284_PH_TERMINATE, 243 IEEE1284_PH_TERMINATE,
244 IEEE1284_PH_NEGOTIATION, 244 IEEE1284_PH_NEGOTIATION,
245 IEEE1284_PH_HBUSY_DNA,
246 IEEE1284_PH_REV_IDLE, 245 IEEE1284_PH_REV_IDLE,
247 IEEE1284_PH_HBUSY_DAVAIL, 246 IEEE1284_PH_HBUSY_DAVAIL,
248 IEEE1284_PH_REV_DATA, 247 IEEE1284_PH_REV_DATA,
diff --git a/include/linux/parport_pc.h b/include/linux/parport_pc.h
index c6f762470879..1cc0f6b1a49a 100644
--- a/include/linux/parport_pc.h
+++ b/include/linux/parport_pc.h
@@ -79,13 +79,13 @@ static __inline__ unsigned char parport_pc_read_data(struct parport *p)
79} 79}
80 80
81#ifdef DEBUG_PARPORT 81#ifdef DEBUG_PARPORT
82extern __inline__ void dump_parport_state (char *str, struct parport *p) 82static inline void dump_parport_state (char *str, struct parport *p)
83{ 83{
84 /* here's hoping that reading these ports won't side-effect anything underneath */ 84 /* here's hoping that reading these ports won't side-effect anything underneath */
85 unsigned char ecr = inb (ECONTROL (p)); 85 unsigned char ecr = inb (ECONTROL (p));
86 unsigned char dcr = inb (CONTROL (p)); 86 unsigned char dcr = inb (CONTROL (p));
87 unsigned char dsr = inb (STATUS (p)); 87 unsigned char dsr = inb (STATUS (p));
88 static char *ecr_modes[] = {"SPP", "PS2", "PPFIFO", "ECP", "xXx", "yYy", "TST", "CFG"}; 88 static const char *const ecr_modes[] = {"SPP", "PS2", "PPFIFO", "ECP", "xXx", "yYy", "TST", "CFG"};
89 const struct parport_pc_private *priv = p->physport->private_data; 89 const struct parport_pc_private *priv = p->physport->private_data;
90 int i; 90 int i;
91 91
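
Two independent cleanups sit in the parport_pc.h hunk: 'extern __inline__' becomes 'static inline' (internal linkage, one private copy per translation unit, no GNU89/C99 inline ambiguity), and the mode-name table gains const on both the pointers and the strings they reference. A standalone sketch of the resulting shape; the ECR bit layout in mode_name() follows the usual ECP convention (mode in bits 7:5) and is illustrative here:

#include <stdio.h>

/* before: static char *modes[] = {...};  table and pointers writable
 * after: both levels immutable, so the data can live in rodata */
static const char *const modes[] = {
    "SPP", "PS2", "PPFIFO", "ECP", "xXx", "yYy", "TST", "CFG"
};

static inline const char *mode_name(unsigned int ecr)
{
    return modes[(ecr >> 5) & 0x7];   /* top three ECR bits pick the mode */
}

int main(void)
{
    printf("%s\n", mode_name(0x60));  /* prints "ECP" */
    return 0;
}
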
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 96a0403f61f6..a213e999de31 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -394,6 +394,13 @@
394#define PCI_DEVICE_ID_NS_87410 0xd001 394#define PCI_DEVICE_ID_NS_87410 0xd001
395#define PCI_DEVICE_ID_NS_CS5535_IDE 0x002d 395#define PCI_DEVICE_ID_NS_CS5535_IDE 0x002d
396 396
397#define PCI_DEVICE_ID_NS_CS5535_HOST_BRIDGE 0x0028
398#define PCI_DEVICE_ID_NS_CS5535_ISA_BRIDGE 0x002b
399#define PCI_DEVICE_ID_NS_CS5535_IDE 0x002d
400#define PCI_DEVICE_ID_NS_CS5535_AUDIO 0x002e
401#define PCI_DEVICE_ID_NS_CS5535_USB 0x002f
402#define PCI_DEVICE_ID_NS_CS5535_VIDEO 0x0030
403
397#define PCI_VENDOR_ID_TSENG 0x100c 404#define PCI_VENDOR_ID_TSENG 0x100c
398#define PCI_DEVICE_ID_TSENG_W32P_2 0x3202 405#define PCI_DEVICE_ID_TSENG_W32P_2 0x3202
399#define PCI_DEVICE_ID_TSENG_W32P_b 0x3205 406#define PCI_DEVICE_ID_TSENG_W32P_b 0x3205
@@ -496,6 +503,9 @@
496 503
497#define PCI_DEVICE_ID_AMD_CS5536_IDE 0x209A 504#define PCI_DEVICE_ID_AMD_CS5536_IDE 0x209A
498 505
506#define PCI_DEVICE_ID_AMD_LX_VIDEO 0x2081
507#define PCI_DEVICE_ID_AMD_LX_AES 0x2082
508
499#define PCI_VENDOR_ID_TRIDENT 0x1023 509#define PCI_VENDOR_ID_TRIDENT 0x1023
500#define PCI_DEVICE_ID_TRIDENT_4DWAVE_DX 0x2000 510#define PCI_DEVICE_ID_TRIDENT_4DWAVE_DX 0x2000
501#define PCI_DEVICE_ID_TRIDENT_4DWAVE_NX 0x2001 511#define PCI_DEVICE_ID_TRIDENT_4DWAVE_NX 0x2001
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 13e7c4b62367..b6e0bcad84e1 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -71,8 +71,8 @@
71 */ 71 */
72#define MD_PATCHLEVEL_VERSION 3 72#define MD_PATCHLEVEL_VERSION 3
73 73
74extern int register_md_personality (int p_num, mdk_personality_t *p); 74extern int register_md_personality (struct mdk_personality *p);
75extern int unregister_md_personality (int p_num); 75extern int unregister_md_personality (struct mdk_personality *p);
76extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev), 76extern mdk_thread_t * md_register_thread (void (*run) (mddev_t *mddev),
77 mddev_t *mddev, const char *name); 77 mddev_t *mddev, const char *name);
78extern void md_unregister_thread (mdk_thread_t *thread); 78extern void md_unregister_thread (mdk_thread_t *thread);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 46629a275ba9..617b9506c760 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -18,62 +18,19 @@
18/* and dm-bio-list.h is not under include/linux because.... ??? */ 18/* and dm-bio-list.h is not under include/linux because.... ??? */
19#include "../../../drivers/md/dm-bio-list.h" 19#include "../../../drivers/md/dm-bio-list.h"
20 20
21#define MD_RESERVED 0UL
22#define LINEAR 1UL
23#define RAID0 2UL
24#define RAID1 3UL
25#define RAID5 4UL
26#define TRANSLUCENT 5UL
27#define HSM 6UL
28#define MULTIPATH 7UL
29#define RAID6 8UL
30#define RAID10 9UL
31#define FAULTY 10UL
32#define MAX_PERSONALITY 11UL
33
34#define LEVEL_MULTIPATH (-4) 21#define LEVEL_MULTIPATH (-4)
35#define LEVEL_LINEAR (-1) 22#define LEVEL_LINEAR (-1)
36#define LEVEL_FAULTY (-5) 23#define LEVEL_FAULTY (-5)
37 24
25/* we need a value for 'no level specified' and 0
26 * means 'raid0', so we need something else. This is
27 * for internal use only
28 */
29#define LEVEL_NONE (-1000000)
30
38#define MaxSector (~(sector_t)0) 31#define MaxSector (~(sector_t)0)
39#define MD_THREAD_NAME_MAX 14 32#define MD_THREAD_NAME_MAX 14
40 33
41static inline int pers_to_level (int pers)
42{
43 switch (pers) {
44 case FAULTY: return LEVEL_FAULTY;
45 case MULTIPATH: return LEVEL_MULTIPATH;
46 case HSM: return -3;
47 case TRANSLUCENT: return -2;
48 case LINEAR: return LEVEL_LINEAR;
49 case RAID0: return 0;
50 case RAID1: return 1;
51 case RAID5: return 5;
52 case RAID6: return 6;
53 case RAID10: return 10;
54 }
55 BUG();
56 return MD_RESERVED;
57}
58
59static inline int level_to_pers (int level)
60{
61 switch (level) {
62 case LEVEL_FAULTY: return FAULTY;
63 case LEVEL_MULTIPATH: return MULTIPATH;
64 case -3: return HSM;
65 case -2: return TRANSLUCENT;
66 case LEVEL_LINEAR: return LINEAR;
67 case 0: return RAID0;
68 case 1: return RAID1;
69 case 4:
70 case 5: return RAID5;
71 case 6: return RAID6;
72 case 10: return RAID10;
73 }
74 return MD_RESERVED;
75}
76
77typedef struct mddev_s mddev_t; 34typedef struct mddev_s mddev_t;
78typedef struct mdk_rdev_s mdk_rdev_t; 35typedef struct mdk_rdev_s mdk_rdev_t;
79 36
@@ -138,14 +95,16 @@ struct mdk_rdev_s
138 atomic_t read_errors; /* number of consecutive read errors that 95 atomic_t read_errors; /* number of consecutive read errors that
139 * we have tried to ignore. 96 * we have tried to ignore.
140 */ 97 */
98 atomic_t corrected_errors; /* number of corrected read errors,
99 * for reporting to userspace and storing
100 * in superblock.
101 */
141}; 102};
142 103
143typedef struct mdk_personality_s mdk_personality_t;
144
145struct mddev_s 104struct mddev_s
146{ 105{
147 void *private; 106 void *private;
148 mdk_personality_t *pers; 107 struct mdk_personality *pers;
149 dev_t unit; 108 dev_t unit;
150 int md_minor; 109 int md_minor;
151 struct list_head disks; 110 struct list_head disks;
@@ -164,6 +123,7 @@ struct mddev_s
164 int chunk_size; 123 int chunk_size;
165 time_t ctime, utime; 124 time_t ctime, utime;
166 int level, layout; 125 int level, layout;
126 char clevel[16];
167 int raid_disks; 127 int raid_disks;
168 int max_disks; 128 int max_disks;
169 sector_t size; /* used size of component devices */ 129 sector_t size; /* used size of component devices */
@@ -183,6 +143,11 @@ struct mddev_s
183 sector_t resync_mismatches; /* count of sectors where 143 sector_t resync_mismatches; /* count of sectors where
184 * parity/replica mismatch found 144 * parity/replica mismatch found
185 */ 145 */
146 /* if zero, use the system-wide default */
147 int sync_speed_min;
148 int sync_speed_max;
149
150 int ok_start_degraded;
186 /* recovery/resync flags 151 /* recovery/resync flags
187 * NEEDED: we might need to start a resync/recover 152 * NEEDED: we might need to start a resync/recover
188 * RUNNING: a thread is running, or about to be started 153 * RUNNING: a thread is running, or about to be started
@@ -265,9 +230,11 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect
265 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); 230 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
266} 231}
267 232
268struct mdk_personality_s 233struct mdk_personality
269{ 234{
270 char *name; 235 char *name;
236 int level;
237 struct list_head list;
271 struct module *owner; 238 struct module *owner;
272 int (*make_request)(request_queue_t *q, struct bio *bio); 239 int (*make_request)(request_queue_t *q, struct bio *bio);
273 int (*run)(mddev_t *mddev); 240 int (*run)(mddev_t *mddev);
@@ -305,8 +272,6 @@ static inline char * mdname (mddev_t * mddev)
305 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; 272 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
306} 273}
307 274
308extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
309
310/* 275/*
311 * iterates through some rdev ringlist. It's safe to remove the 276 * iterates through some rdev ringlist. It's safe to remove the
312 * current 'rdev'. Don't touch 'tmp' though. 277
@@ -366,5 +331,10 @@ do { \
366 __wait_event_lock_irq(wq, condition, lock, cmd); \ 331 __wait_event_lock_irq(wq, condition, lock, cmd); \
367} while (0) 332} while (0)
368 333
334static inline void safe_put_page(struct page *p)
335{
336 if (p) put_page(p);
337}
338
369#endif 339#endif
370 340
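
The md_k.h changes drop the fixed personality-number table (pers_to_level()/level_to_pers()) in favour of a struct mdk_personality that carries its own level and sits on a list searched at run time. A simplified userspace stand-in for that lookup model, with plain next pointers instead of the kernel's list_head and names invented for illustration:

#include <stdio.h>

struct personality {
    const char *name;
    int level;
    struct personality *next;
};

static struct personality *pers_list;

static void register_personality(struct personality *p)
{
    p->next = pers_list;    /* push onto the list, no fixed slots */
    pers_list = p;
}

static struct personality *find_personality(int level)
{
    struct personality *p;

    for (p = pers_list; p; p = p->next)
        if (p->level == level)
            return p;
    return NULL;    /* unknown level: caller may try loading a module */
}

int main(void)
{
    static struct personality raid1 = { "raid1", 1 };
    static struct personality linear = { "linear", -1 };

    register_personality(&raid1);
    register_personality(&linear);
    printf("%s\n", find_personality(1)->name);   /* raid1 */
    return 0;
}
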
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 292b98f2b408..9d5494aaac0f 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -45,6 +45,8 @@ struct r1_private_data_s {
45 45
46 spinlock_t resync_lock; 46 spinlock_t resync_lock;
47 int nr_pending; 47 int nr_pending;
48 int nr_waiting;
49 int nr_queued;
48 int barrier; 50 int barrier;
49 sector_t next_resync; 51 sector_t next_resync;
50 int fullsync; /* set to 1 if a full sync is needed, 52 int fullsync; /* set to 1 if a full sync is needed,
@@ -52,11 +54,12 @@ struct r1_private_data_s {
52 * Cleared when a sync completes. 54 * Cleared when a sync completes.
53 */ 55 */
54 56
55 wait_queue_head_t wait_idle; 57 wait_queue_head_t wait_barrier;
56 wait_queue_head_t wait_resume;
57 58
58 struct pool_info *poolinfo; 59 struct pool_info *poolinfo;
59 60
61 struct page *tmppage;
62
60 mempool_t *r1bio_pool; 63 mempool_t *r1bio_pool;
61 mempool_t *r1buf_pool; 64 mempool_t *r1buf_pool;
62}; 65};
@@ -106,6 +109,13 @@ struct r1bio_s {
106 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ 109 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
107}; 110};
108 111
112/* when we get a read error on a read-only array, we redirect to another
113 * device without failing the first device, or trying to over-write to
114 * correct the read error. To keep track of bad blocks on a per-bio
115 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
116 */
117#define IO_BLOCKED ((struct bio*)1)
118
109/* bits for r1bio.state */ 119/* bits for r1bio.state */
110#define R1BIO_Uptodate 0 120#define R1BIO_Uptodate 0
111#define R1BIO_IsSync 1 121#define R1BIO_IsSync 1
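
IO_BLOCKED above is a sentinel pointer: a bios[] slot normally holds NULL or a valid struct bio *, so casting a small integer to the pointer type yields a third state at zero storage cost, safe because no real object is ever allocated at address 1. A sketch of the idiom with invented helpers:

#include <stdio.h>

struct bio;                           /* opaque, as in the kernel */
#define IO_BLOCKED ((struct bio *)1)  /* "this device is bad for this bio" */

static int slot_usable(struct bio *b)
{
    /* usable means a real bio: neither empty nor marked blocked */
    return b != NULL && b != IO_BLOCKED;
}

int main(void)
{
    struct bio *slots[3] = { NULL, IO_BLOCKED, (struct bio *)0x1000 };

    for (int i = 0; i < 3; i++)
        printf("slot %d usable: %d\n", i, slot_usable(slots[i]));
    return 0;
}
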
diff --git a/include/linux/raid/raid10.h b/include/linux/raid/raid10.h
index 60708789c8f9..b1103298a8c2 100644
--- a/include/linux/raid/raid10.h
+++ b/include/linux/raid/raid10.h
@@ -35,18 +35,26 @@ struct r10_private_data_s {
35 sector_t chunk_mask; 35 sector_t chunk_mask;
36 36
37 struct list_head retry_list; 37 struct list_head retry_list;
38 /* for use when syncing mirrors: */ 38 /* queue pending writes and submit them on unplug */
39 struct bio_list pending_bio_list;
40
39 41
40 spinlock_t resync_lock; 42 spinlock_t resync_lock;
41 int nr_pending; 43 int nr_pending;
44 int nr_waiting;
45 int nr_queued;
42 int barrier; 46 int barrier;
43 sector_t next_resync; 47 sector_t next_resync;
48 int fullsync; /* set to 1 if a full sync is needed,
49 * (fresh device added).
50 * Cleared when a sync completes.
51 */
44 52
45 wait_queue_head_t wait_idle; 53 wait_queue_head_t wait_barrier;
46 wait_queue_head_t wait_resume;
47 54
48 mempool_t *r10bio_pool; 55 mempool_t *r10bio_pool;
49 mempool_t *r10buf_pool; 56 mempool_t *r10buf_pool;
57 struct page *tmppage;
50}; 58};
51 59
52typedef struct r10_private_data_s conf_t; 60typedef struct r10_private_data_s conf_t;
@@ -96,8 +104,16 @@ struct r10bio_s {
96 } devs[0]; 104 } devs[0];
97}; 105};
98 106
107/* when we get a read error on a read-only array, we redirect to another
108 * device without failing the first device, or trying to over-write to
109 * correct the read error. To keep track of bad blocks on a per-bio
110 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
111 */
112#define IO_BLOCKED ((struct bio*)1)
113
99/* bits for r10bio.state */ 114/* bits for r10bio.state */
100#define R10BIO_Uptodate 0 115#define R10BIO_Uptodate 0
101#define R10BIO_IsSync 1 116#define R10BIO_IsSync 1
102#define R10BIO_IsRecover 2 117#define R10BIO_IsRecover 2
118#define R10BIO_Degraded 3
103#endif 119#endif
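
The r1/r10 hunks replace the wait_idle/wait_resume pair with a single wait_barrier queue driven by three counters: nr_pending (I/O in flight), nr_waiting (I/O parked behind a raised barrier) and barrier itself. A userspace pthread sketch of that scheme's shape, not the kernel code:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_barrier = PTHREAD_COND_INITIALIZER;
static int nr_pending, nr_waiting, barrier;

static void wait_for_io(void)            /* normal I/O entry */
{
    pthread_mutex_lock(&lock);
    while (barrier) {                    /* park behind a raised barrier */
        nr_waiting++;
        pthread_cond_wait(&wait_barrier, &lock);
        nr_waiting--;
    }
    nr_pending++;
    pthread_mutex_unlock(&lock);
}

static void io_done(void)
{
    pthread_mutex_lock(&lock);
    nr_pending--;
    pthread_cond_broadcast(&wait_barrier);
    pthread_mutex_unlock(&lock);
}

static void raise_barrier(void)          /* resync entry */
{
    pthread_mutex_lock(&lock);
    barrier = 1;
    while (nr_pending)                   /* drain in-flight I/O first */
        pthread_cond_wait(&wait_barrier, &lock);
    pthread_mutex_unlock(&lock);
}

static void lower_barrier(void)
{
    pthread_mutex_lock(&lock);
    barrier = 0;
    pthread_cond_broadcast(&wait_barrier);
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    wait_for_io();
    io_done();          /* one I/O through the gate */
    raise_barrier();    /* returns once nr_pending has drained */
    lower_barrier();
    return 0;
}
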
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index f025ba6fb14c..394da8207b34 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -126,7 +126,7 @@
126 */ 126 */
127 127
128struct stripe_head { 128struct stripe_head {
129 struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ 129 struct hlist_node hash;
130 struct list_head lru; /* inactive_list or handle_list */ 130 struct list_head lru; /* inactive_list or handle_list */
131 struct raid5_private_data *raid_conf; 131 struct raid5_private_data *raid_conf;
132 sector_t sector; /* sector of this row */ 132 sector_t sector; /* sector of this row */
@@ -152,7 +152,6 @@ struct stripe_head {
152#define R5_Insync 3 /* rdev && rdev->in_sync at start */ 152#define R5_Insync 3 /* rdev && rdev->in_sync at start */
153#define R5_Wantread 4 /* want to schedule a read */ 153#define R5_Wantread 4 /* want to schedule a read */
154#define R5_Wantwrite 5 154#define R5_Wantwrite 5
155#define R5_Syncio 6 /* this io need to be accounted as resync io */
156#define R5_Overlap 7 /* There is a pending overlapping request on this block */ 155#define R5_Overlap 7 /* There is a pending overlapping request on this block */
157#define R5_ReadError 8 /* seen a read error here recently */ 156#define R5_ReadError 8 /* seen a read error here recently */
158#define R5_ReWrite 9 /* have tried to over-write the readerror */ 157#define R5_ReWrite 9 /* have tried to over-write the readerror */
@@ -205,7 +204,7 @@ struct disk_info {
205}; 204};
206 205
207struct raid5_private_data { 206struct raid5_private_data {
208 struct stripe_head **stripe_hashtbl; 207 struct hlist_head *stripe_hashtbl;
209 mddev_t *mddev; 208 mddev_t *mddev;
210 struct disk_info *spare; 209 struct disk_info *spare;
211 int chunk_size, level, algorithm; 210 int chunk_size, level, algorithm;
@@ -228,6 +227,8 @@ struct raid5_private_data {
228 * Cleared when a sync completes. 227 * Cleared when a sync completes.
229 */ 228 */
230 229
230 struct page *spare_page; /* Used when checking P/Q in raid6 */
231
231 /* 232 /*
232 * Free stripes pool 233 * Free stripes pool
233 */ 234 */
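
The stripe hash switches from a hand-rolled hash_next/hash_pprev pair to the generic hlist, which encodes exactly that pattern: a singly linked chain whose back-pointer aims at the previous node's next field, so unlinking needs no reference to the list head. A minimal reimplementation for illustration; the kernel's versions live in <linux/list.h>:

#include <stdio.h>

struct hlist_node { struct hlist_node *next, **pprev; };
struct hlist_head { struct hlist_node *first; };

static void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
    n->next = h->first;
    if (h->first)
        h->first->pprev = &n->next;
    h->first = n;
    n->pprev = &h->first;
}

static void hlist_del(struct hlist_node *n)
{
    *n->pprev = n->next;            /* works anywhere in the chain */
    if (n->next)
        n->next->pprev = n->pprev;
}

int main(void)
{
    struct hlist_head head = { 0 };
    struct hlist_node a, b;

    hlist_add_head(&a, &head);
    hlist_add_head(&b, &head);      /* list: b -> a */
    hlist_del(&b);                  /* no head pointer needed to unlink */
    printf("%d\n", head.first == &a);   /* 1 */
    return 0;
}
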
diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h
index e0a4faa9610c..953b6df5d037 100644
--- a/include/linux/ramfs.h
+++ b/include/linux/ramfs.h
@@ -5,6 +5,16 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev);
5struct super_block *ramfs_get_sb(struct file_system_type *fs_type, 5struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
6 int flags, const char *dev_name, void *data); 6 int flags, const char *dev_name, void *data);
7 7
8#ifndef CONFIG_MMU
9extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
10 unsigned long addr,
11 unsigned long len,
12 unsigned long pgoff,
13 unsigned long flags);
14
15extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
16#endif
17
8extern struct file_operations ramfs_file_operations; 18extern struct file_operations ramfs_file_operations;
9extern struct vm_operations_struct generic_file_vm_ops; 19extern struct vm_operations_struct generic_file_vm_ops;
10 20
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 33261f1d2239..9d6fbeef2104 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -71,6 +71,7 @@ void __anon_vma_link(struct vm_area_struct *);
71 * rmap interfaces called when adding or removing pte of page 71 * rmap interfaces called when adding or removing pte of page
72 */ 72 */
73void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); 73void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
74void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
74void page_add_file_rmap(struct page *); 75void page_add_file_rmap(struct page *);
75void page_remove_rmap(struct page *); 76void page_remove_rmap(struct page *);
76 77
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0ad6f30679e..7da33619d5d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -254,25 +254,12 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
254 * The mm counters are not protected by its page_table_lock, 254 * The mm counters are not protected by its page_table_lock,
255 * so must be incremented atomically. 255 * so must be incremented atomically.
256 */ 256 */
257#ifdef ATOMIC64_INIT 257#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
258#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value) 258#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
259#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member)) 259#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
260#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member) 260#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
261#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member) 261#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
262#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member) 262typedef atomic_long_t mm_counter_t;
263typedef atomic64_t mm_counter_t;
264#else /* !ATOMIC64_INIT */
265/*
266 * The counters wrap back to 0 at 2^32 * PAGE_SIZE,
267 * that is, at 16TB if using 4kB page size.
268 */
269#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value)
270#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member))
271#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member)
272#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member)
273#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member)
274typedef atomic_t mm_counter_t;
275#endif /* !ATOMIC64_INIT */
276 263
277#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 264#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
278/* 265/*
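
The sched.h hunk collapses the #ifdef ATOMIC64_INIT fork into atomic_long_t, an atomic that is always the native word size, so the mm counters are 64-bit on 64-bit kernels and wrap at 2^32 pages on 32-bit ones. The same idea expressed in C11, with atomic_long standing in for the kernel type:

#include <stdatomic.h>
#include <stdio.h>

typedef atomic_long mm_counter_t;    /* word-sized by definition */

static mm_counter_t rss;

int main(void)
{
    atomic_store(&rss, 0);           /* set_mm_counter(mm, rss, 0) */
    atomic_fetch_add(&rss, 3);       /* add_mm_counter(mm, rss, 3) */
    atomic_fetch_sub(&rss, 1);       /* dec_mm_counter(mm, rss) */
    printf("%ld\n", atomic_load(&rss));   /* 2 */
    return 0;
}
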
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index a61c04f804b2..5dc94e777fab 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -14,11 +14,7 @@
14typedef struct pbe { 14typedef struct pbe {
15 unsigned long address; /* address of the copy */ 15 unsigned long address; /* address of the copy */
16 unsigned long orig_address; /* original address of page */ 16 unsigned long orig_address; /* original address of page */
17 swp_entry_t swap_address; 17 struct pbe *next;
18
19 struct pbe *next; /* also used as scratch space at
20 * end of page (see link, diskpage)
21 */
22} suspend_pagedir_t; 18} suspend_pagedir_t;
23 19
24#define for_each_pbe(pbe, pblist) \ 20#define for_each_pbe(pbe, pblist) \
@@ -77,6 +73,6 @@ unsigned long get_safe_page(gfp_t gfp_mask);
77 * XXX: We try to keep some more pages free so that I/O operations succeed 73 * XXX: We try to keep some more pages free so that I/O operations succeed
78 * without paging. Might this be more? 74 * without paging. Might this be more?
79 */ 75 */
80#define PAGES_FOR_IO 512 76#define PAGES_FOR_IO 1024
81 77
82#endif /* _LINUX_SWSUSP_H */ 78#endif /* _LINUX_SWSUSP_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 508668f840b6..556617bcf7ac 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,7 +172,6 @@ extern void swap_setup(void);
172 172
173/* linux/mm/vmscan.c */ 173/* linux/mm/vmscan.c */
174extern int try_to_free_pages(struct zone **, gfp_t); 174extern int try_to_free_pages(struct zone **, gfp_t);
175extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
176extern int shrink_all_memory(int); 175extern int shrink_all_memory(int);
177extern int vm_swappiness; 176extern int vm_swappiness;
178 177
@@ -210,6 +209,7 @@ extern unsigned int nr_swapfiles;
210extern struct swap_info_struct swap_info[]; 209extern struct swap_info_struct swap_info[];
211extern void si_swapinfo(struct sysinfo *); 210extern void si_swapinfo(struct sysinfo *);
212extern swp_entry_t get_swap_page(void); 211extern swp_entry_t get_swap_page(void);
212extern swp_entry_t get_swap_page_of_type(int type);
213extern int swap_duplicate(swp_entry_t); 213extern int swap_duplicate(swp_entry_t);
214extern int valid_swaphandles(swp_entry_t, unsigned long *); 214extern int valid_swaphandles(swp_entry_t, unsigned long *);
215extern void swap_free(swp_entry_t); 215extern void swap_free(swp_entry_t);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 343d883d69c5..64a36ba43b2f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -60,12 +60,6 @@ struct writeback_control {
60}; 60};
61 61
62/* 62/*
63 * ->writepage() return values (make these much larger than a pagesize, in
64 * case some fs is returning number-of-bytes-written from writepage)
65 */
66#define WRITEPAGE_ACTIVATE 0x80000 /* IO was not started: activate page */
67
68/*
69 * fs/fs-writeback.c 63 * fs/fs-writeback.c
70 */ 64 */
71void writeback_inodes(struct writeback_control *wbc); 65void writeback_inodes(struct writeback_control *wbc);
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index 850dfa877fda..02e26c1672bf 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -15,7 +15,6 @@ struct scsi_driver {
15 void (*rescan)(struct device *); 15 void (*rescan)(struct device *);
16 int (*issue_flush)(struct device *, sector_t *); 16 int (*issue_flush)(struct device *, sector_t *);
17 int (*prepare_flush)(struct request_queue *, struct request *); 17 int (*prepare_flush)(struct request_queue *, struct request *);
18 void (*end_flush)(struct request_queue *, struct request *);
19}; 18};
20#define to_scsi_driver(drv) \ 19#define to_scsi_driver(drv) \
21 container_of((drv), struct scsi_driver, gendrv) 20 container_of((drv), struct scsi_driver, gendrv)
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 6cbb1982ed03..230bc55c0bfa 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -296,6 +296,12 @@ struct scsi_host_template {
296 int (*proc_info)(struct Scsi_Host *, char *, char **, off_t, int, int); 296 int (*proc_info)(struct Scsi_Host *, char *, char **, off_t, int, int);
297 297
298 /* 298 /*
299 * suspend support
300 */
301 int (*resume)(struct scsi_device *);
302 int (*suspend)(struct scsi_device *);
303
304 /*
299 * Name of proc directory 305 * Name of proc directory
300 */ 306 */
301 char *proc_name; 307 char *proc_name;
@@ -392,7 +398,6 @@ struct scsi_host_template {
392 /* 398 /*
393 * ordered write support 399 * ordered write support
394 */ 400 */
395 unsigned ordered_flush:1;
396 unsigned ordered_tag:1; 401 unsigned ordered_tag:1;
397 402
398 /* 403 /*
diff --git a/init/Kconfig b/init/Kconfig
index ce737e02c5a2..ba42f3793a84 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -105,7 +105,6 @@ config SWAP
105 105
106config SYSVIPC 106config SYSVIPC
107 bool "System V IPC" 107 bool "System V IPC"
108 depends on MMU
109 ---help--- 108 ---help---
110 Inter Process Communication is a suite of library functions and 109 Inter Process Communication is a suite of library functions and
111 system calls which let processes (running programs) synchronize and 110 system calls which let processes (running programs) synchronize and
@@ -190,7 +189,7 @@ config AUDIT
190 189
191config AUDITSYSCALL 190config AUDITSYSCALL
192 bool "Enable system-call auditing support" 191 bool "Enable system-call auditing support"
193 depends on AUDIT && (X86 || PPC || PPC64 || ARCH_S390 || IA64 || UML || SPARC64) 192 depends on AUDIT && (X86 || PPC || PPC64 || S390 || IA64 || UML || SPARC64)
194 default y if SECURITY_SELINUX 193 default y if SECURITY_SELINUX
195 help 194 help
196 Enable low-overhead system-call auditing infrastructure that 195 Enable low-overhead system-call auditing infrastructure that
diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c
index 3fbc3555ce96..f6f36806f84a 100644
--- a/init/do_mounts_md.c
+++ b/init/do_mounts_md.c
@@ -17,7 +17,7 @@ static int __initdata raid_noautodetect, raid_autopart;
17static struct { 17static struct {
18 int minor; 18 int minor;
19 int partitioned; 19 int partitioned;
20 int pers; 20 int level;
21 int chunk; 21 int chunk;
22 char *device_names; 22 char *device_names;
23} md_setup_args[MAX_MD_DEVS] __initdata; 23} md_setup_args[MAX_MD_DEVS] __initdata;
@@ -47,7 +47,7 @@ extern int mdp_major;
47 */ 47 */
48static int __init md_setup(char *str) 48static int __init md_setup(char *str)
49{ 49{
50 int minor, level, factor, fault, pers, partitioned = 0; 50 int minor, level, factor, fault, partitioned = 0;
51 char *pername = ""; 51 char *pername = "";
52 char *str1; 52 char *str1;
53 int ent; 53 int ent;
@@ -78,7 +78,7 @@ static int __init md_setup(char *str)
78 } 78 }
79 if (ent >= md_setup_ents) 79 if (ent >= md_setup_ents)
80 md_setup_ents++; 80 md_setup_ents++;
81 switch (get_option(&str, &level)) { /* RAID Personality */ 81 switch (get_option(&str, &level)) { /* RAID level */
82 case 2: /* could be 0 or -1.. */ 82 case 2: /* could be 0 or -1.. */
83 if (level == 0 || level == LEVEL_LINEAR) { 83 if (level == 0 || level == LEVEL_LINEAR) {
84 if (get_option(&str, &factor) != 2 || /* Chunk Size */ 84 if (get_option(&str, &factor) != 2 || /* Chunk Size */
@@ -86,16 +86,12 @@ static int __init md_setup(char *str)
86 printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); 86 printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
87 return 0; 87 return 0;
88 } 88 }
89 md_setup_args[ent].pers = level; 89 md_setup_args[ent].level = level;
90 md_setup_args[ent].chunk = 1 << (factor+12); 90 md_setup_args[ent].chunk = 1 << (factor+12);
91 if (level == LEVEL_LINEAR) { 91 if (level == LEVEL_LINEAR)
92 pers = LINEAR;
93 pername = "linear"; 92 pername = "linear";
94 } else { 93 else
95 pers = RAID0;
96 pername = "raid0"; 94 pername = "raid0";
97 }
98 md_setup_args[ent].pers = pers;
99 break; 95 break;
100 } 96 }
101 /* FALL THROUGH */ 97 /* FALL THROUGH */
@@ -103,7 +99,7 @@ static int __init md_setup(char *str)
103 str = str1; 99 str = str1;
104 /* FALL THROUGH */ 100 /* FALL THROUGH */
105 case 0: 101 case 0:
106 md_setup_args[ent].pers = 0; 102 md_setup_args[ent].level = LEVEL_NONE;
107 pername="super-block"; 103 pername="super-block";
108 } 104 }
109 105
@@ -190,10 +186,10 @@ static void __init md_setup_drive(void)
190 continue; 186 continue;
191 } 187 }
192 188
193 if (md_setup_args[ent].pers) { 189 if (md_setup_args[ent].level != LEVEL_NONE) {
194 /* non-persistent */ 190 /* non-persistent */
195 mdu_array_info_t ainfo; 191 mdu_array_info_t ainfo;
196 ainfo.level = pers_to_level(md_setup_args[ent].pers); 192 ainfo.level = md_setup_args[ent].level;
197 ainfo.size = 0; 193 ainfo.size = 0;
198 ainfo.nr_disks =0; 194 ainfo.nr_disks =0;
199 ainfo.raid_disks =0; 195 ainfo.raid_disks =0;
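
md_setup() leans on the get_option() convention, inferred here from the call sites above rather than the kernel source: return 0 when no number is parsed, 1 for a number at end of string, 2 for a number followed by a comma with the cursor advanced past it. A simplified userspace reimplementation of that contract, an assumption offered for illustration only:

#include <stdio.h>
#include <stdlib.h>

static int get_option(char **str, int *val)
{
    char *end;

    if (!*str || !**str)
        return 0;
    *val = (int)strtol(*str, &end, 0);
    if (end == *str)
        return 0;                /* no number here */
    *str = end;
    if (**str == ',') {
        (*str)++;
        return 2;                /* number, more fields follow */
    }
    return 1;                    /* number, end of string */
}

int main(void)
{
    char buf[] = "0,0,3", *s = buf;   /* shape of an md= argument */
    int v, ret;

    while ((ret = get_option(&s, &v)))
        printf("got %d (ret=%d)\n", v, ret);
    return 0;
}
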
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index c10b08a80982..c2683fcd792d 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -145,7 +145,7 @@ int __init rd_load_image(char *from)
145 int nblocks, i, disk; 145 int nblocks, i, disk;
146 char *buf = NULL; 146 char *buf = NULL;
147 unsigned short rotate = 0; 147 unsigned short rotate = 0;
148#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES) 148#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES)
149 char rotator[4] = { '|' , '/' , '-' , '\\' }; 149 char rotator[4] = { '|' , '/' , '-' , '\\' };
150#endif 150#endif
151 151
@@ -237,7 +237,7 @@ int __init rd_load_image(char *from)
237 } 237 }
238 sys_read(in_fd, buf, BLOCK_SIZE); 238 sys_read(in_fd, buf, BLOCK_SIZE);
239 sys_write(out_fd, buf, BLOCK_SIZE); 239 sys_write(out_fd, buf, BLOCK_SIZE);
240#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_PPC_ISERIES) 240#if !defined(CONFIG_S390) && !defined(CONFIG_PPC_ISERIES)
241 if (!(i % 16)) { 241 if (!(i % 16)) {
242 printk("%c\b", rotator[rotate & 0x3]); 242 printk("%c\b", rotator[rotate & 0x3]);
243 rotate++; 243 rotate++;
diff --git a/init/main.c b/init/main.c
index 54aaf561cf66..2ed3638deec7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -52,6 +52,7 @@
52#include <asm/bugs.h> 52#include <asm/bugs.h>
53#include <asm/setup.h> 53#include <asm/setup.h>
54#include <asm/sections.h> 54#include <asm/sections.h>
55#include <asm/cacheflush.h>
55 56
56/* 57/*
57 * This is one of the first .c files built. Error out early 58 * This is one of the first .c files built. Error out early
@@ -99,6 +100,9 @@ extern void acpi_early_init(void);
99#else 100#else
100static inline void acpi_early_init(void) { } 101static inline void acpi_early_init(void) { }
101#endif 102#endif
103#ifndef CONFIG_DEBUG_RODATA
104static inline void mark_rodata_ro(void) { }
105#endif
102 106
103#ifdef CONFIG_TC 107#ifdef CONFIG_TC
104extern void tc_init(void); 108extern void tc_init(void);
@@ -708,6 +712,7 @@ static int init(void * unused)
708 */ 712 */
709 free_initmem(); 713 free_initmem();
710 unlock_kernel(); 714 unlock_kernel();
715 mark_rodata_ro();
711 system_state = SYSTEM_RUNNING; 716 system_state = SYSTEM_RUNNING;
712 numa_default_policy(); 717 numa_default_policy();
713 718
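
The mark_rodata_ro() stub added to init/main.c is the usual compiled-out-feature idiom: when the config option is off, supply an empty static inline with the same signature so call sites stay free of #ifdef and the compiler discards the call entirely. The generic shape of the pattern, with CONFIG_FEATURE and feature_hook() as placeholders:

#ifdef CONFIG_FEATURE
extern void feature_hook(void);             /* real version elsewhere */
#else
static inline void feature_hook(void) { }   /* compiles to nothing */
#endif

void start(void)
{
    feature_hook();   /* unconditional call site, no #ifdef of its own */
}
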
diff --git a/ipc/shm.c b/ipc/shm.c
index 587d836d80d9..0ef4a1cf3e27 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -157,14 +157,22 @@ static void shm_close (struct vm_area_struct *shmd)
157 157
158static int shm_mmap(struct file * file, struct vm_area_struct * vma) 158static int shm_mmap(struct file * file, struct vm_area_struct * vma)
159{ 159{
160 file_accessed(file); 160 int ret;
161 vma->vm_ops = &shm_vm_ops; 161
162 shm_inc(file->f_dentry->d_inode->i_ino); 162 ret = shmem_mmap(file, vma);
163 return 0; 163 if (ret == 0) {
164 vma->vm_ops = &shm_vm_ops;
165 shm_inc(file->f_dentry->d_inode->i_ino);
166 }
167
168 return ret;
164} 169}
165 170
166static struct file_operations shm_file_operations = { 171static struct file_operations shm_file_operations = {
167 .mmap = shm_mmap 172 .mmap = shm_mmap,
173#ifndef CONFIG_MMU
174 .get_unmapped_area = shmem_get_unmapped_area,
175#endif
168}; 176};
169 177
170static struct vm_operations_struct shm_vm_ops = { 178static struct vm_operations_struct shm_vm_ops = {
diff --git a/kernel/acct.c b/kernel/acct.c
index 6312d6bd43e3..38d57fa6b78f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -427,6 +427,7 @@ static void do_acct_process(long exitcode, struct file *file)
427 u64 elapsed; 427 u64 elapsed;
428 u64 run_time; 428 u64 run_time;
429 struct timespec uptime; 429 struct timespec uptime;
430 unsigned long jiffies;
430 431
431 /* 432 /*
432 * First check to see if there is enough free_space to continue 433 * First check to see if there is enough free_space to continue
@@ -467,12 +468,12 @@ static void do_acct_process(long exitcode, struct file *file)
467#endif 468#endif
468 do_div(elapsed, AHZ); 469 do_div(elapsed, AHZ);
469 ac.ac_btime = xtime.tv_sec - elapsed; 470 ac.ac_btime = xtime.tv_sec - elapsed;
470 ac.ac_utime = encode_comp_t(jiffies_to_AHZ( 471 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
471 current->signal->utime + 472 current->signal->utime));
472 current->group_leader->utime)); 473 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
473 ac.ac_stime = encode_comp_t(jiffies_to_AHZ( 474 jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
474 current->signal->stime + 475 current->signal->stime));
475 current->group_leader->stime)); 476 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
476 /* we really need to bite the bullet and change layout */ 477 /* we really need to bite the bullet and change layout */
477 ac.ac_uid = current->uid; 478 ac.ac_uid = current->uid;
478 ac.ac_gid = current->gid; 479 ac.ac_gid = current->gid;
@@ -580,7 +581,8 @@ void acct_process(long exitcode)
580void acct_update_integrals(struct task_struct *tsk) 581void acct_update_integrals(struct task_struct *tsk)
581{ 582{
582 if (likely(tsk->mm)) { 583 if (likely(tsk->mm)) {
583 long delta = tsk->stime - tsk->acct_stimexpd; 584 long delta =
585 cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
584 586
585 if (delta == 0) 587 if (delta == 0)
586 return; 588 return;
diff --git a/kernel/futex.c b/kernel/futex.c
index 5e71a6bf6f6b..5efa2f978032 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -356,6 +356,13 @@ retry:
356 if (bh1 != bh2) 356 if (bh1 != bh2)
357 spin_unlock(&bh2->lock); 357 spin_unlock(&bh2->lock);
358 358
359#ifndef CONFIG_MMU
360 /* we don't get EFAULT from MMU faults if we don't have an MMU,
361 * but we might get them from range checking */
362 ret = op_ret;
363 goto out;
364#endif
365
359 if (unlikely(op_ret != -EFAULT)) { 366 if (unlikely(op_ret != -EFAULT)) {
360 ret = op_ret; 367 ret = op_ret;
361 goto out; 368 goto out;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 81c49a4d679e..97d5559997d2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -366,6 +366,8 @@ int request_irq(unsigned int irq,
366 action->next = NULL; 366 action->next = NULL;
367 action->dev_id = dev_id; 367 action->dev_id = dev_id;
368 368
369 select_smp_affinity(irq);
370
369 retval = setup_irq(irq, action); 371 retval = setup_irq(irq, action);
370 if (retval) 372 if (retval)
371 kfree(action); 373 kfree(action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f26e534c6585..8a64a4844cde 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -68,7 +68,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
68 */ 68 */
69 cpus_and(tmp, new_value, cpu_online_map); 69 cpus_and(tmp, new_value, cpu_online_map);
70 if (cpus_empty(tmp)) 70 if (cpus_empty(tmp))
71 return -EINVAL; 71 /* Special case for empty set - allow the architecture
72 code to set default SMP affinity. */
73 return select_smp_affinity(irq) ? -EINVAL : full_count;
72 74
73 proc_set_irq_affinity(irq, new_value); 75 proc_set_irq_affinity(irq, new_value);
74 76
diff --git a/kernel/module.c b/kernel/module.c
index 2ea929d51ad0..4b06bbad49c2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1854,8 +1854,7 @@ static struct module *load_module(void __user *umod,
1854 kfree(args); 1854 kfree(args);
1855 free_hdr: 1855 free_hdr:
1856 vfree(hdr); 1856 vfree(hdr);
1857 if (err < 0) return ERR_PTR(err); 1857 return ERR_PTR(err);
1858 else return ptr;
1859 1858
1860 truncated: 1859 truncated:
1861 printk(KERN_ERR "Module len %lu truncated\n", len); 1860 printk(KERN_ERR "Module len %lu truncated\n", len);
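
The module.c simplification works because the free_hdr label is only reached with err < 0, so the err >= 0 branch was dead and the error can be returned encoded in the pointer. A sketch of the ERR_PTR/IS_ERR encoding itself, which reserves the topmost 4095 values of the address space for errno codes; the constants and helpers are reimplemented here for illustration:

#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
    return (void *)error;            /* small negative errno as pointer */
}

static inline long PTR_ERR(const void *ptr)
{
    return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *load_thing(int fail)
{
    static int thing;

    if (fail)
        return ERR_PTR(-12);         /* -ENOMEM */
    return &thing;                   /* a real pointer on success */
}

int main(void)
{
    void *p = load_thing(1);

    if (IS_ERR(p))
        printf("failed: %ld\n", PTR_ERR(p));   /* failed: -12 */
    return 0;
}
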
diff --git a/kernel/panic.c b/kernel/panic.c
index aabc5f86fa3f..c5c4ab255834 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...)
60 long i; 60 long i;
61 static char buf[1024]; 61 static char buf[1024];
62 va_list args; 62 va_list args;
63#if defined(CONFIG_ARCH_S390) 63#if defined(CONFIG_S390)
64 unsigned long caller = (unsigned long) __builtin_return_address(0); 64 unsigned long caller = (unsigned long) __builtin_return_address(0);
65#endif 65#endif
66 66
@@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...)
125 printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); 125 printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n");
126 } 126 }
127#endif 127#endif
128#if defined(CONFIG_ARCH_S390) 128#if defined(CONFIG_S390)
129 disabled_wait(caller); 129 disabled_wait(caller);
130#endif 130#endif
131 local_irq_enable(); 131 local_irq_enable();
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 027322a564f4..e24446f8d8cd 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -24,10 +24,11 @@
24 24
25extern suspend_disk_method_t pm_disk_mode; 25extern suspend_disk_method_t pm_disk_mode;
26 26
27extern int swsusp_shrink_memory(void);
27extern int swsusp_suspend(void); 28extern int swsusp_suspend(void);
28extern int swsusp_write(void); 29extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
29extern int swsusp_check(void); 30extern int swsusp_check(void);
30extern int swsusp_read(void); 31extern int swsusp_read(struct pbe **pblist_ptr);
31extern void swsusp_close(void); 32extern void swsusp_close(void);
32extern int swsusp_resume(void); 33extern int swsusp_resume(void);
33 34
@@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode)
73static int in_suspend __nosavedata = 0; 74static int in_suspend __nosavedata = 0;
74 75
75 76
76/**
77 * free_some_memory - Try to free as much memory as possible
78 *
79 * ... but do not OOM-kill anyone
80 *
81 * Notice: all userland should be stopped at this point, or
82 * livelock is possible.
83 */
84
85static void free_some_memory(void)
86{
87 unsigned int i = 0;
88 unsigned int tmp;
89 unsigned long pages = 0;
90 char *p = "-\\|/";
91
92 printk("Freeing memory... ");
93 while ((tmp = shrink_all_memory(10000))) {
94 pages += tmp;
95 printk("\b%c", p[i++ % 4]);
96 }
97 printk("\bdone (%li pages freed)\n", pages);
98}
99
100
101static inline void platform_finish(void) 77static inline void platform_finish(void)
102{ 78{
103 if (pm_disk_mode == PM_DISK_PLATFORM) { 79 if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -127,8 +103,8 @@ static int prepare_processes(void)
127 } 103 }
128 104
129 /* Free memory before shutting down devices. */ 105 /* Free memory before shutting down devices. */
130 free_some_memory(); 106 if (!(error = swsusp_shrink_memory()))
131 return 0; 107 return 0;
132thaw: 108thaw:
133 thaw_processes(); 109 thaw_processes();
134 enable_nonboot_cpus(); 110 enable_nonboot_cpus();
@@ -176,7 +152,7 @@ int pm_suspend_disk(void)
176 if (in_suspend) { 152 if (in_suspend) {
177 device_resume(); 153 device_resume();
178 pr_debug("PM: writing image.\n"); 154 pr_debug("PM: writing image.\n");
179 error = swsusp_write(); 155 error = swsusp_write(pagedir_nosave, nr_copy_pages);
180 if (!error) 156 if (!error)
181 power_down(pm_disk_mode); 157 power_down(pm_disk_mode);
182 else { 158 else {
@@ -247,7 +223,7 @@ static int software_resume(void)
247 223
248 pr_debug("PM: Reading swsusp image.\n"); 224 pr_debug("PM: Reading swsusp image.\n");
249 225
250 if ((error = swsusp_read())) { 226 if ((error = swsusp_read(&pagedir_nosave))) {
251 swsusp_free(); 227 swsusp_free();
252 goto Thaw; 228 goto Thaw;
253 } 229 }
@@ -363,37 +339,55 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf)
363 MINOR(swsusp_resume_device)); 339 MINOR(swsusp_resume_device));
364} 340}
365 341
366static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) 342static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
367{ 343{
368 int len;
369 char *p;
370 unsigned int maj, min; 344 unsigned int maj, min;
371 int error = -EINVAL;
372 dev_t res; 345 dev_t res;
346 int ret = -EINVAL;
373 347
374 p = memchr(buf, '\n', n); 348 if (sscanf(buf, "%u:%u", &maj, &min) != 2)
375 len = p ? p - buf : n; 349 goto out;
376 350
377 if (sscanf(buf, "%u:%u", &maj, &min) == 2) { 351 res = MKDEV(maj,min);
378 res = MKDEV(maj,min); 352 if (maj != MAJOR(res) || min != MINOR(res))
379 if (maj == MAJOR(res) && min == MINOR(res)) { 353 goto out;
380 down(&pm_sem);
381 swsusp_resume_device = res;
382 up(&pm_sem);
383 printk("Attempting manual resume\n");
384 noresume = 0;
385 software_resume();
386 }
387 }
388 354
389 return error >= 0 ? n : error; 355 down(&pm_sem);
356 swsusp_resume_device = res;
357 up(&pm_sem);
358 printk("Attempting manual resume\n");
359 noresume = 0;
360 software_resume();
361 ret = n;
362out:
363 return ret;
390} 364}
391 365
392power_attr(resume); 366power_attr(resume);
393 367
368static ssize_t image_size_show(struct subsystem * subsys, char *buf)
369{
370 return sprintf(buf, "%u\n", image_size);
371}
372
373static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
374{
375 unsigned int size;
376
377 if (sscanf(buf, "%u", &size) == 1) {
378 image_size = size;
379 return n;
380 }
381
382 return -EINVAL;
383}
384
385power_attr(image_size);
386
394static struct attribute * g[] = { 387static struct attribute * g[] = {
395 &disk_attr.attr, 388 &disk_attr.attr,
396 &resume_attr.attr, 389 &resume_attr.attr,
390 &image_size_attr.attr,
397 NULL, 391 NULL,
398}; 392};
399 393
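
resume_store() is rewritten above around a single sscanf() and an early-exit error path; the MAJOR/MINOR round-trip check rejects input that does not survive packing into a dev_t. The same shape in miniature, with the device-number macros reimplemented using the classic 8-bit split purely for illustration:

#include <stdio.h>

typedef unsigned int devt;
#define MKDEV(ma, mi)   (((ma) << 8) | (mi))
#define MAJOR(d)        ((d) >> 8)
#define MINOR(d)        ((d) & 0xff)

static int resume_store(const char *buf)
{
    unsigned int maj, min;
    devt res;
    int ret = -22;                   /* -EINVAL */

    if (sscanf(buf, "%u:%u", &maj, &min) != 2)
        goto out;
    res = MKDEV(maj, min);
    if (maj != MAJOR(res) || min != MINOR(res))
        goto out;                    /* didn't fit in the device number */
    printf("resume device %u:%u accepted\n", maj, min);
    ret = 0;
out:
    return ret;
}

int main(void)
{
    resume_store("8:3");             /* accepted */
    resume_store("8:999");           /* rejected: minor overflows */
    return 0;
}
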
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6c042b5ee14b..7e8492fd1423 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -9,19 +9,13 @@
9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
10#endif 10#endif
11 11
12#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \
13 - 4 - 3*sizeof(unsigned long) - sizeof(int) \
14 - sizeof(void *)) / sizeof(swp_entry_t))
15
16struct swsusp_info { 12struct swsusp_info {
17 struct new_utsname uts; 13 struct new_utsname uts;
18 u32 version_code; 14 u32 version_code;
19 unsigned long num_physpages; 15 unsigned long num_physpages;
20 int cpus; 16 int cpus;
21 unsigned long image_pages; 17 unsigned long image_pages;
22 unsigned long pagedir_pages; 18 unsigned long pages;
23 suspend_pagedir_t * suspend_pagedir;
24 swp_entry_t pagedir[MAX_PBES];
25} __attribute__((aligned(PAGE_SIZE))); 19} __attribute__((aligned(PAGE_SIZE)));
26 20
27 21
@@ -48,25 +42,27 @@ static struct subsys_attribute _name##_attr = { \
48 42
49extern struct subsystem power_subsys; 43extern struct subsystem power_subsys;
50 44
51extern int freeze_processes(void);
52extern void thaw_processes(void);
53
54extern int pm_prepare_console(void); 45extern int pm_prepare_console(void);
55extern void pm_restore_console(void); 46extern void pm_restore_console(void);
56 47
57
58/* References to section boundaries */ 48/* References to section boundaries */
59extern const void __nosave_begin, __nosave_end; 49extern const void __nosave_begin, __nosave_end;
60 50
61extern unsigned int nr_copy_pages; 51extern unsigned int nr_copy_pages;
62extern suspend_pagedir_t *pagedir_nosave; 52extern struct pbe *pagedir_nosave;
63extern suspend_pagedir_t *pagedir_save; 53
54/* Preferred image size in MB (default 500) */
55extern unsigned int image_size;
64 56
65extern asmlinkage int swsusp_arch_suspend(void); 57extern asmlinkage int swsusp_arch_suspend(void);
66extern asmlinkage int swsusp_arch_resume(void); 58extern asmlinkage int swsusp_arch_resume(void);
67 59
60extern unsigned int count_data_pages(void);
68extern void free_pagedir(struct pbe *pblist); 61extern void free_pagedir(struct pbe *pblist);
62extern void release_eaten_pages(void);
69extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); 63extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
70extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
71extern void swsusp_free(void); 64extern void swsusp_free(void);
72extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); 65extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
66extern unsigned int snapshot_nr_pages(void);
67extern struct pbe *snapshot_pblist(void);
68extern void snapshot_pblist_set(struct pbe *pblist);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4a6dbcefd378..41f66365f0d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -33,7 +33,35 @@
33 33
34#include "power.h" 34#include "power.h"
35 35
36struct pbe *pagedir_nosave;
37unsigned int nr_copy_pages;
38
36#ifdef CONFIG_HIGHMEM 39#ifdef CONFIG_HIGHMEM
40unsigned int count_highmem_pages(void)
41{
42 struct zone *zone;
43 unsigned long zone_pfn;
44 unsigned int n = 0;
45
46 for_each_zone (zone)
47 if (is_highmem(zone)) {
48 mark_free_pages(zone);
49 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
50 struct page *page;
51 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
52 if (!pfn_valid(pfn))
53 continue;
54 page = pfn_to_page(pfn);
55 if (PageReserved(page))
56 continue;
57 if (PageNosaveFree(page))
58 continue;
59 n++;
60 }
61 }
62 return n;
63}
64
37struct highmem_page { 65struct highmem_page {
38 char *data; 66 char *data;
39 struct page *page; 67 struct page *page;
@@ -149,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn)
149 BUG_ON(PageReserved(page) && PageNosave(page)); 177 BUG_ON(PageReserved(page) && PageNosave(page));
150 if (PageNosave(page)) 178 if (PageNosave(page))
151 return 0; 179 return 0;
152 if (PageReserved(page) && pfn_is_nosave(pfn)) { 180 if (PageReserved(page) && pfn_is_nosave(pfn))
153 pr_debug("[nosave pfn 0x%lx]", pfn);
154 return 0; 181 return 0;
155 }
156 if (PageNosaveFree(page)) 182 if (PageNosaveFree(page))
157 return 0; 183 return 0;
158 184
159 return 1; 185 return 1;
160} 186}
161 187
162static unsigned count_data_pages(void) 188unsigned int count_data_pages(void)
163{ 189{
164 struct zone *zone; 190 struct zone *zone;
165 unsigned long zone_pfn; 191 unsigned long zone_pfn;
@@ -244,7 +270,7 @@ static inline void fill_pb_page(struct pbe *pbpage)
244 * of memory pages allocated with alloc_pagedir() 270 * of memory pages allocated with alloc_pagedir()
245 */ 271 */
246 272
247void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) 273static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
248{ 274{
249 struct pbe *pbpage, *p; 275 struct pbe *pbpage, *p;
250 unsigned int num = PBES_PER_PAGE; 276 unsigned int num = PBES_PER_PAGE;
@@ -261,7 +287,35 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
261 p->next = p + 1; 287 p->next = p + 1;
262 p->next = NULL; 288 p->next = NULL;
263 } 289 }
264 pr_debug("create_pbe_list(): initialized %d PBEs\n", num); 290}
291
292/**
293 * On resume it is necessary to trace and eventually free the unsafe
294 * pages that have been allocated, because they are needed for I/O
295 * (on x86-64 we likely will "eat" these pages once again while
296 * creating the temporary page translation tables)
297 */
298
299struct eaten_page {
300 struct eaten_page *next;
301 char padding[PAGE_SIZE - sizeof(void *)];
302};
303
304static struct eaten_page *eaten_pages = NULL;
305
306void release_eaten_pages(void)
307{
308 struct eaten_page *p, *q;
309
310 p = eaten_pages;
311 while (p) {
312 q = p->next;
313 /* We don't want swsusp_free() to free this page again */
314 ClearPageNosave(virt_to_page(p));
315 free_page((unsigned long)p);
316 p = q;
317 }
318 eaten_pages = NULL;
265} 319}
266 320
267/** 321/**
@@ -282,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
282 if (safe_needed) 336 if (safe_needed)
283 do { 337 do {
284 res = (void *)get_zeroed_page(gfp_mask); 338 res = (void *)get_zeroed_page(gfp_mask);
285 if (res && PageNosaveFree(virt_to_page(res))) 339 if (res && PageNosaveFree(virt_to_page(res))) {
286 /* This is for swsusp_free() */ 340 /* This is for swsusp_free() */
287 SetPageNosave(virt_to_page(res)); 341 SetPageNosave(virt_to_page(res));
342 ((struct eaten_page *)res)->next = eaten_pages;
343 eaten_pages = res;
344 }
288 } while (res && PageNosaveFree(virt_to_page(res))); 345 } while (res && PageNosaveFree(virt_to_page(res)));
289 else 346 else
290 res = (void *)get_zeroed_page(gfp_mask); 347 res = (void *)get_zeroed_page(gfp_mask);
@@ -332,7 +389,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
332 if (!pbe) { /* get_zeroed_page() failed */ 389 if (!pbe) { /* get_zeroed_page() failed */
333 free_pagedir(pblist); 390 free_pagedir(pblist);
334 pblist = NULL; 391 pblist = NULL;
335 } 392 } else
393 create_pbe_list(pblist, nr_pages);
336 return pblist; 394 return pblist;
337} 395}
338 396
@@ -370,8 +428,14 @@ void swsusp_free(void)
370 428
371static int enough_free_mem(unsigned int nr_pages) 429static int enough_free_mem(unsigned int nr_pages)
372{ 430{
373 pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); 431 struct zone *zone;
374 return nr_free_pages() > (nr_pages + PAGES_FOR_IO + 432 unsigned int n = 0;
433
434 for_each_zone (zone)
435 if (!is_highmem(zone))
436 n += zone->free_pages;
437 pr_debug("swsusp: available memory: %u pages\n", n);
438 return n > (nr_pages + PAGES_FOR_IO +
375 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 439 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
376} 440}
377 441
@@ -395,7 +459,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages)
395 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 459 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
396 return NULL; 460 return NULL;
397 } 461 }
398 create_pbe_list(pblist, nr_pages);
399 462
400 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { 463 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
401 printk(KERN_ERR "suspend: Allocating image pages failed.\n"); 464 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
@@ -421,10 +484,6 @@ asmlinkage int swsusp_save(void)
421 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, 484 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
422 PAGES_FOR_IO, nr_free_pages()); 485 PAGES_FOR_IO, nr_free_pages());
423 486
424 /* This is needed because of the fixed size of swsusp_info */
425 if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
426 return -ENOSPC;
427
428 if (!enough_free_mem(nr_pages)) { 487 if (!enough_free_mem(nr_pages)) {
429 printk(KERN_ERR "swsusp: Not enough free memory\n"); 488 printk(KERN_ERR "swsusp: Not enough free memory\n");
430 return -ENOMEM; 489 return -ENOMEM;
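
The eaten_page list in the snapshot.c hunk tracks allocated pages with zero extra bookkeeping memory: each tracked page stores the next pointer in its own first bytes, and the padding member sizes the struct to exactly one page. A userspace sketch with calloc() standing in for get_zeroed_page():

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096

struct eaten_page {
    struct eaten_page *next;
    char padding[PAGE_SIZE - sizeof(void *)];  /* fill out one page */
};

static struct eaten_page *eaten_pages;

static void *grab_page(void)
{
    void *p = calloc(1, PAGE_SIZE);

    if (p) {                         /* record the page inside itself */
        ((struct eaten_page *)p)->next = eaten_pages;
        eaten_pages = p;
    }
    return p;
}

static void release_eaten_pages(void)
{
    struct eaten_page *p = eaten_pages, *q;

    while (p) {
        q = p->next;                 /* read the link before freeing */
        free(p);
        p = q;
    }
    eaten_pages = NULL;
}

int main(void)
{
    grab_page();
    grab_page();
    release_eaten_pages();           /* walks and frees both pages */
    printf("done\n");
    return 0;
}
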
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c05f46e7348f..55a18d26abed 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -30,8 +30,8 @@
30 * Alex Badea <vampire@go.ro>: 30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init 31 * Fixed runaway init
32 * 32 *
33 * Andreas Steinmetz <ast@domdv.de>: 33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Added encrypted suspend option 34 * Added the swap map data structure and reworked the handling of swap
35 * 35 *
36 * More state savers are welcome. Especially for the scsi layer... 36 * More state savers are welcome. Especially for the scsi layer...
37 * 37 *
@@ -67,44 +67,33 @@
67#include <asm/tlbflush.h> 67#include <asm/tlbflush.h>
68#include <asm/io.h> 68#include <asm/io.h>
69 69
70#include <linux/random.h>
71#include <linux/crypto.h>
72#include <asm/scatterlist.h>
73
74#include "power.h" 70#include "power.h"
75 71
72/*
73 * Preferred image size in MB (tunable via /sys/power/image_size).
74 * When it is set to N, swsusp will do its best to ensure the image
75 * size will not exceed N MB, but if that is impossible, it will
76 * try to create the smallest image possible.
77 */
78unsigned int image_size = 500;
79
76#ifdef CONFIG_HIGHMEM 80#ifdef CONFIG_HIGHMEM
81unsigned int count_highmem_pages(void);
77int save_highmem(void); 82int save_highmem(void);
78int restore_highmem(void); 83int restore_highmem(void);
79#else 84#else
80static int save_highmem(void) { return 0; } 85static int save_highmem(void) { return 0; }
81static int restore_highmem(void) { return 0; } 86static int restore_highmem(void) { return 0; }
87static unsigned int count_highmem_pages(void) { return 0; }
82#endif 88#endif
83 89
84#define CIPHER "aes"
85#define MAXKEY 32
86#define MAXIV 32
87
88extern char resume_file[]; 90extern char resume_file[];
89 91
90/* Local variables that should not be affected by save */
91unsigned int nr_copy_pages __nosavedata = 0;
92
93/* Suspend pagedir is allocated before final copy, therefore it
94 must be freed after resume
95
96 Warning: this is even more evil than it seems. Pagedirs this file
97 talks about are completely different from page directories used by
98 MMU hardware.
99 */
100suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
101
102#define SWSUSP_SIG "S1SUSPEND" 92#define SWSUSP_SIG "S1SUSPEND"
103 93
104static struct swsusp_header { 94static struct swsusp_header {
105 char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; 95 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
106 u8 key_iv[MAXKEY+MAXIV]; 96 swp_entry_t image;
107 swp_entry_t swsusp_info;
108 char orig_sig[10]; 97 char orig_sig[10];
109 char sig[10]; 98 char sig[10];
110} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 99} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
@@ -115,140 +104,9 @@ static struct swsusp_info swsusp_info;
115 * Saving part... 104 * Saving part...
116 */ 105 */
117 106
118/* We memorize in swapfile_used what swap devices are used for suspension */ 107static unsigned short root_swap = 0xffff;
119#define SWAPFILE_UNUSED 0
120#define SWAPFILE_SUSPEND 1 /* This is the suspending device */
121#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */
122
123static unsigned short swapfile_used[MAX_SWAPFILES];
124static unsigned short root_swap;
125
126static int write_page(unsigned long addr, swp_entry_t *loc);
127static int bio_read_page(pgoff_t page_off, void *page);
128
129static u8 key_iv[MAXKEY+MAXIV];
130
131#ifdef CONFIG_SWSUSP_ENCRYPT
132
133static int crypto_init(int mode, void **mem)
134{
135 int error = 0;
136 int len;
137 char *modemsg;
138 struct crypto_tfm *tfm;
139
140 modemsg = mode ? "suspend not possible" : "resume not possible";
141
142 tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
143 if(!tfm) {
144 printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
145 error = -EINVAL;
146 goto out;
147 }
148
149 if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
150 printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
151 error = -ENOKEY;
152 goto fail;
153 }
154
155 if (mode)
156 get_random_bytes(key_iv, MAXKEY+MAXIV);
157
158 len = crypto_tfm_alg_max_keysize(tfm);
159 if (len > MAXKEY)
160 len = MAXKEY;
161
162 if (crypto_cipher_setkey(tfm, key_iv, len)) {
163 printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
164 error = -EKEYREJECTED;
165 goto fail;
166 }
167
168 len = crypto_tfm_alg_ivsize(tfm);
169
170 if (MAXIV < len) {
171 printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
172 error = -EOVERFLOW;
173 goto fail;
174 }
175
176 crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
177
178 *mem=(void *)tfm;
179
180 goto out;
181
182fail: crypto_free_tfm(tfm);
183out: return error;
184}
185
186static __inline__ void crypto_exit(void *mem)
187{
188 crypto_free_tfm((struct crypto_tfm *)mem);
189}
190
191static __inline__ int crypto_write(struct pbe *p, void *mem)
192{
193 int error = 0;
194 struct scatterlist src, dst;
195
196 src.page = virt_to_page(p->address);
197 src.offset = 0;
198 src.length = PAGE_SIZE;
199 dst.page = virt_to_page((void *)&swsusp_header);
200 dst.offset = 0;
201 dst.length = PAGE_SIZE;
202
203 error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
204 PAGE_SIZE);
205
206 if (!error)
207 error = write_page((unsigned long)&swsusp_header,
208 &(p->swap_address));
209 return error;
210}
211
212static __inline__ int crypto_read(struct pbe *p, void *mem)
213{
214 int error = 0;
215 struct scatterlist src, dst;
216
217 error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
218 if (!error) {
219 src.offset = 0;
220 src.length = PAGE_SIZE;
221 dst.offset = 0;
222 dst.length = PAGE_SIZE;
223 src.page = dst.page = virt_to_page((void *)p->address);
224
225 error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
226 &src, PAGE_SIZE);
227 }
228 return error;
229}
230#else
231static __inline__ int crypto_init(int mode, void *mem)
232{
233 return 0;
234}
235
236static __inline__ void crypto_exit(void *mem)
237{
238}
239
240static __inline__ int crypto_write(struct pbe *p, void *mem)
241{
242 return write_page(p->address, &(p->swap_address));
243}
244 108
245static __inline__ int crypto_read(struct pbe *p, void *mem) 109static int mark_swapfiles(swp_entry_t start)
246{
247 return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
248}
249#endif
250
251static int mark_swapfiles(swp_entry_t prev)
252{ 110{
253 int error; 111 int error;
254 112
@@ -259,8 +117,7 @@ static int mark_swapfiles(swp_entry_t prev)
259 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 117 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
260 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 118 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
261 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 119 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
262 memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); 120 swsusp_header.image = start;
263 swsusp_header.swsusp_info = prev;
264 error = rw_swap_page_sync(WRITE, 121 error = rw_swap_page_sync(WRITE,
265 swp_entry(root_swap, 0), 122 swp_entry(root_swap, 0),
266 virt_to_page((unsigned long) 123 virt_to_page((unsigned long)
@@ -283,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
283 * devfs, since the resume code can only recognize the form /dev/hda4, 140 * devfs, since the resume code can only recognize the form /dev/hda4,
284 * but the suspend code would see the long name.) 141 * but the suspend code would see the long name.)
285 */ 142 */
286static int is_resume_device(const struct swap_info_struct *swap_info) 143static inline int is_resume_device(const struct swap_info_struct *swap_info)
287{ 144{
288 struct file *file = swap_info->swap_file; 145 struct file *file = swap_info->swap_file;
289 struct inode *inode = file->f_dentry->d_inode; 146 struct inode *inode = file->f_dentry->d_inode;
@@ -294,54 +151,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
294 151
295static int swsusp_swap_check(void) /* This is called before saving image */ 152static int swsusp_swap_check(void) /* This is called before saving image */
296{ 153{
297 int i, len;
298
299 len=strlen(resume_file);
300 root_swap = 0xFFFF;
301
302 spin_lock(&swap_lock);
303 for (i=0; i<MAX_SWAPFILES; i++) {
304 if (!(swap_info[i].flags & SWP_WRITEOK)) {
305 swapfile_used[i]=SWAPFILE_UNUSED;
306 } else {
307 if (!len) {
308 printk(KERN_WARNING "resume= option should be used to set suspend device" );
309 if (root_swap == 0xFFFF) {
310 swapfile_used[i] = SWAPFILE_SUSPEND;
311 root_swap = i;
312 } else
313 swapfile_used[i] = SWAPFILE_IGNORED;
314 } else {
315 /* we ignore all swap devices that are not the resume_file */
316 if (is_resume_device(&swap_info[i])) {
317 swapfile_used[i] = SWAPFILE_SUSPEND;
318 root_swap = i;
319 } else {
320 swapfile_used[i] = SWAPFILE_IGNORED;
321 }
322 }
323 }
324 }
325 spin_unlock(&swap_lock);
326 return (root_swap != 0xffff) ? 0 : -ENODEV;
327}
328
329/**
330 * This is called after saving image so modification
331 * will be lost after resume... and that's what we want.
332 * we make the device unusable. A new call to
333 * lock_swapdevices can unlock the devices.
334 */
335static void lock_swapdevices(void)
336{
337 int i; 154 int i;
338 155
156 if (!swsusp_resume_device)
157 return -ENODEV;
339 spin_lock(&swap_lock); 158 spin_lock(&swap_lock);
340 for (i = 0; i< MAX_SWAPFILES; i++) 159 for (i = 0; i < MAX_SWAPFILES; i++) {
341 if (swapfile_used[i] == SWAPFILE_IGNORED) { 160 if (!(swap_info[i].flags & SWP_WRITEOK))
342 swap_info[i].flags ^= SWP_WRITEOK; 161 continue;
162 if (is_resume_device(swap_info + i)) {
163 spin_unlock(&swap_lock);
164 root_swap = i;
165 return 0;
343 } 166 }
167 }
344 spin_unlock(&swap_lock); 168 spin_unlock(&swap_lock);
169 return -ENODEV;
345} 170}
346 171
347/** 172/**
@@ -359,72 +184,217 @@ static void lock_swapdevices(void)
359static int write_page(unsigned long addr, swp_entry_t *loc) 184static int write_page(unsigned long addr, swp_entry_t *loc)
360{ 185{
361 swp_entry_t entry; 186 swp_entry_t entry;
362 int error = 0; 187 int error = -ENOSPC;
363 188
364 entry = get_swap_page(); 189 entry = get_swap_page_of_type(root_swap);
365 if (swp_offset(entry) && 190 if (swp_offset(entry)) {
366 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { 191 error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
367 error = rw_swap_page_sync(WRITE, entry, 192 if (!error || error == -EIO)
368 virt_to_page(addr));
369 if (error == -EIO)
370 error = 0;
371 if (!error)
372 *loc = entry; 193 *loc = entry;
373 } else 194 }
374 error = -ENOSPC;
375 return error; 195 return error;
376} 196}
377 197
378/** 198/**
379 * data_free - Free the swap entries used by the saved image. 199 * Swap map-handling functions
200 *
201 * The swap map is a data structure used for keeping track of each page
202 * written to the swap. It consists of many swap_map_page structures
203 * that contain each an array of MAP_PAGE_SIZE swap entries.
204 * These structures are linked together with the help of either the
205 * .next (in memory) or the .next_swap (in swap) member.
380 * 206 *
381 * Walk the list of used swap entries and free each one. 207 * The swap map is created during suspend. At that time we need to keep
382 * This is only used for cleanup when suspend fails. 208 * it in memory, because we have to free all of the allocated swap
209 * entries if an error occurs. The memory needed is preallocated
210 * so that we know in advance if there's enough of it.
211 *
212 * The first swap_map_page structure is filled with the swap entries that
213 * correspond to the first MAP_PAGE_SIZE data pages written to swap and
 214 * so on. After all of the data pages have been written, the order
215 * of the swap_map_page structures in the map is reversed so that they
216 * can be read from swap in the original order. This causes the data
217 * pages to be loaded in exactly the same order in which they have been
218 * saved.
219 *
220 * During resume we only need to use one swap_map_page structure
221 * at a time, which means that we only need to use two memory pages for
222 * reading the image - one for reading the swap_map_page structures
223 * and the second for reading the data pages from swap.
383 */ 224 */
384static void data_free(void) 225
226#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
227 / sizeof(swp_entry_t))
228
229struct swap_map_page {
230 swp_entry_t entries[MAP_PAGE_SIZE];
231 swp_entry_t next_swap;
232 struct swap_map_page *next;
233};
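
A minimal userspace model of the layout above, assuming 4 KiB pages and 8-byte swap entries and pointers (the kernel's actual sizes are architecture-dependent):

/* Standalone model of struct swap_map_page, assuming 4 KiB pages and
 * 8-byte entries/pointers; a sketch, not the kernel definition. */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
typedef unsigned long swp_entry_t;      /* stand-in for the kernel type */

#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
                        / sizeof(swp_entry_t))

struct swap_map_page {
        swp_entry_t entries[MAP_PAGE_SIZE];
        swp_entry_t next_swap;          /* chain link used on swap */
        struct swap_map_page *next;     /* chain link used in memory */
};

int main(void)
{
        /* MAP_PAGE_SIZE is chosen so one map page fills a page exactly,
         * leaving room for both link members at the end. */
        printf("entries per map page: %lu\n", (unsigned long)MAP_PAGE_SIZE);
        assert(sizeof(struct swap_map_page) == PAGE_SIZE);  /* under these assumptions */
        return 0;
}

With 4 KiB pages this gives 510 entries per map page, so one map page indexes 510 image pages.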
234
235static inline void free_swap_map(struct swap_map_page *swap_map)
385{ 236{
386 swp_entry_t entry; 237 struct swap_map_page *swp;
387 struct pbe *p;
388 238
389 for_each_pbe (p, pagedir_nosave) { 239 while (swap_map) {
390 entry = p->swap_address; 240 swp = swap_map->next;
391 if (entry.val) 241 free_page((unsigned long)swap_map);
392 swap_free(entry); 242 swap_map = swp;
393 else
394 break;
395 } 243 }
396} 244}
397 245
246static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
247{
248 struct swap_map_page *swap_map, *swp;
249 unsigned n = 0;
250
251 if (!nr_pages)
252 return NULL;
253
254 pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
255 swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
256 swp = swap_map;
257 for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
258 swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
259 swp = swp->next;
260 if (!swp) {
261 free_swap_map(swap_map);
262 return NULL;
263 }
264 }
265 return swap_map;
266}
267
398/** 268/**
399 * data_write - Write saved image to swap. 269 * reverse_swap_map - reverse the order of pages in the swap map
400 * 270 * @swap_map
401 * Walk the list of pages in the image and sync each one to swap.
402 */ 271 */
403static int data_write(void) 272
273static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
404{ 274{
405 int error = 0, i = 0; 275 struct swap_map_page *prev, *next;
406 unsigned int mod = nr_copy_pages / 100; 276
407 struct pbe *p; 277 prev = NULL;
408 void *tfm; 278 while (swap_map) {
279 next = swap_map->next;
280 swap_map->next = prev;
281 prev = swap_map;
282 swap_map = next;
283 }
284 return prev;
285}
409 286
410 if ((error = crypto_init(1, &tfm))) 287/**
411 return error; 288 * free_swap_map_entries - free the swap entries allocated to store
289 * the swap map @swap_map (this is only called in case of an error)
290 */
291static inline void free_swap_map_entries(struct swap_map_page *swap_map)
292{
293 while (swap_map) {
294 if (swap_map->next_swap.val)
295 swap_free(swap_map->next_swap);
296 swap_map = swap_map->next;
297 }
298}
412 299
413 if (!mod) 300/**
414 mod = 1; 301 * save_swap_map - save the swap map used for tracing the data pages
302 * stored in the swap
303 */
415 304
416 printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); 305static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
417 for_each_pbe (p, pagedir_nosave) { 306{
418 if (!(i%mod)) 307 swp_entry_t entry = (swp_entry_t){0};
419 printk( "\b\b\b\b%3d%%", i / mod ); 308 int error;
420 if ((error = crypto_write(p, tfm))) { 309
421 crypto_exit(tfm); 310 while (swap_map) {
311 swap_map->next_swap = entry;
312 if ((error = write_page((unsigned long)swap_map, &entry)))
422 return error; 313 return error;
423 } 314 swap_map = swap_map->next;
424 i++;
425 } 315 }
426 printk("\b\b\b\bdone\n"); 316 *start = entry;
427 crypto_exit(tfm); 317 return 0;
318}
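
Together, reverse_swap_map() and save_swap_map() implement the ordering trick described in the comment block above: the in-memory chain is flipped so the saver emits map pages last-to-first, each one recording in .next_swap the location of the page written just before it. The entry returned in *start then names the logically first map page, and resume can follow .next_swap forward in the original order. A small userspace model of the pointer reversal (node and field names are hypothetical; only the pointer manipulation mirrors the kernel):

/* Userspace model of reverse_swap_map(). */
#include <stdio.h>
#include <stdlib.h>

struct node { int seq; struct node *next; };

static struct node *reverse(struct node *n)
{
        struct node *prev = NULL, *next;

        while (n) {
                next = n->next;
                n->next = prev;
                prev = n;
                n = next;
        }
        return prev;
}

int main(void)
{
        struct node *head = NULL, *n;
        int i;

        /* build a chain whose walk order is 0, 1, 2 */
        for (i = 2; i >= 0; i--) {
                n = malloc(sizeof(*n));
                n->seq = i;
                n->next = head;
                head = n;
        }
        /* after reversal the walk order is 2, 1, 0, so the last map
         * page is emitted first and chains back toward the first one */
        for (n = reverse(head); n; n = n->next)
                printf("%d ", n->seq);
        printf("\n");
        return 0;
}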
319
320/**
321 * free_image_entries - free the swap entries allocated to store
322 * the image data pages (this is only called in case of an error)
323 */
324
325static inline void free_image_entries(struct swap_map_page *swp)
326{
327 unsigned k;
328
329 while (swp) {
330 for (k = 0; k < MAP_PAGE_SIZE; k++)
331 if (swp->entries[k].val)
332 swap_free(swp->entries[k]);
333 swp = swp->next;
334 }
335}
336
337/**
338 * The swap_map_handle structure is used for handling the swap map in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 unsigned int k;
345};
346
347static inline void init_swap_map_handle(struct swap_map_handle *handle,
348 struct swap_map_page *map)
349{
350 handle->cur = map;
351 handle->k = 0;
352}
353
354static inline int swap_map_write_page(struct swap_map_handle *handle,
355 unsigned long addr)
356{
357 int error;
358
359 error = write_page(addr, handle->cur->entries + handle->k);
360 if (error)
361 return error;
362 if (++handle->k >= MAP_PAGE_SIZE) {
363 handle->cur = handle->cur->next;
364 handle->k = 0;
365 }
366 return 0;
367}
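
The swap_map_handle acts as a write cursor: entries[k] of the current map page receives the swap location of each data page, and when k wraps the cursor hops to the next preallocated map page. A shrunken userspace model of the same cursor, with three slots per map page and all names hypothetical:

/* Write-cursor model of swap_map_write_page(); sizes shrunk for
 * illustration, error handling omitted. */
#include <stdio.h>

#define SLOTS 3

struct map_page { int entries[SLOTS]; struct map_page *next; };
struct handle { struct map_page *cur; unsigned int k; };

static void record(struct handle *h, int swap_location)
{
        h->cur->entries[h->k] = swap_location;
        if (++h->k >= SLOTS) {          /* current map page is full */
                h->cur = h->cur->next;
                h->k = 0;
        }
}

int main(void)
{
        struct map_page second = { { 0 }, NULL };
        struct map_page first = { { 0 }, &second };
        struct handle h = { &first, 0 };
        int loc;

        for (loc = 100; loc < 105; loc++)
                record(&h, loc);        /* fills first, spills into second */
        printf("first: %d %d %d, second: %d %d\n",
               first.entries[0], first.entries[1], first.entries[2],
               second.entries[0], second.entries[1]);
        return 0;
}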
368
369/**
370 * save_image_data - save the data pages pointed to by the PBEs
371 * from the list @pblist using the swap map handle @handle
372 * (assume there are @nr_pages data pages to save)
373 */
374
375static int save_image_data(struct pbe *pblist,
376 struct swap_map_handle *handle,
377 unsigned int nr_pages)
378{
379 unsigned int m;
380 struct pbe *p;
381 int error = 0;
382
383 printk("Saving image data pages (%u pages) ... ", nr_pages);
384 m = nr_pages / 100;
385 if (!m)
386 m = 1;
387 nr_pages = 0;
388 for_each_pbe (p, pblist) {
389 error = swap_map_write_page(handle, p->address);
390 if (error)
391 break;
392 if (!(nr_pages % m))
393 printk("\b\b\b\b%3d%%", nr_pages / m);
394 nr_pages++;
395 }
396 if (!error)
397 printk("\b\b\b\bdone\n");
428 return error; 398 return error;
429} 399}
430 400
@@ -440,70 +410,70 @@ static void dump_info(void)
440 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); 410 pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
441 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); 411 pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
442 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); 412 pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
443 pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); 413 pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
444} 414}
445 415
446static void init_header(void) 416static void init_header(unsigned int nr_pages)
447{ 417{
448 memset(&swsusp_info, 0, sizeof(swsusp_info)); 418 memset(&swsusp_info, 0, sizeof(swsusp_info));
449 swsusp_info.version_code = LINUX_VERSION_CODE; 419 swsusp_info.version_code = LINUX_VERSION_CODE;
450 swsusp_info.num_physpages = num_physpages; 420 swsusp_info.num_physpages = num_physpages;
451 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); 421 memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
452 422
453 swsusp_info.suspend_pagedir = pagedir_nosave;
454 swsusp_info.cpus = num_online_cpus(); 423 swsusp_info.cpus = num_online_cpus();
455 swsusp_info.image_pages = nr_copy_pages; 424 swsusp_info.image_pages = nr_pages;
456} 425 swsusp_info.pages = nr_pages +
457 426 ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
458static int close_swap(void)
459{
460 swp_entry_t entry;
461 int error;
462
463 dump_info();
464 error = write_page((unsigned long)&swsusp_info, &entry);
465 if (!error) {
466 printk( "S" );
467 error = mark_swapfiles(entry);
468 printk( "|\n" );
469 }
470 return error;
471} 427}
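
The new swsusp_info.pages field counts everything the writer will emit: the data pages themselves, the metadata pages that pack one original address (an unsigned long) per data page, and one page for the header. A worked instance of the arithmetic, assuming 4 KiB pages and 8-byte longs (the input count is made up):

/* Sketch of the image-size accounting done by init_header() above. */
#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

int main(void)
{
        unsigned long nr_pages = 50000;   /* data pages in the image */
        unsigned long meta, total;

        /* One long per data page, PAGE_SIZE/sizeof(long) = 512 addresses
         * per metadata page, rounded up. */
        meta = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
        total = nr_pages + meta + 1;      /* +1 for the swsusp_info header */
        printf("%lu data + %lu metadata + 1 header = %lu pages\n",
               nr_pages, meta, total);    /* 50000 + 98 + 1 = 50099 */
        return 0;
}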
472 428
473/** 429/**
474 * free_pagedir_entries - Free pages used by the page directory. 430 * pack_orig_addresses - the .orig_address fields of the PBEs from the
475 * 431 * list starting at @pbe are stored in the array @buf[] (1 page)
476 * This is used during suspend for error recovery.
477 */ 432 */
478 433
479static void free_pagedir_entries(void) 434static inline struct pbe *pack_orig_addresses(unsigned long *buf,
435 struct pbe *pbe)
480{ 436{
481 int i; 437 int j;
482 438
483 for (i = 0; i < swsusp_info.pagedir_pages; i++) 439 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
484 swap_free(swsusp_info.pagedir[i]); 440 buf[j] = pbe->orig_address;
441 pbe = pbe->next;
442 }
443 if (!pbe)
444 for (; j < PAGE_SIZE / sizeof(long); j++)
445 buf[j] = 0;
446 return pbe;
485} 447}
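
pack_orig_addresses() streams the .orig_address fields into page-sized buffers, PAGE_SIZE/sizeof(long) at a time, zero-padding the final buffer and returning the next unprocessed PBE (NULL once done); unpack_orig_addresses() further down reverses the operation at resume. A shrunken, self-contained round trip with four slots per buffer:

/* Round-trip model of pack_orig_addresses()/unpack_orig_addresses(). */
#include <assert.h>
#include <stddef.h>

#define SLOTS 4

struct pbe { unsigned long orig_address; struct pbe *next; };

static struct pbe *pack(unsigned long *buf, struct pbe *p)
{
        int j;

        for (j = 0; j < SLOTS && p; j++) {
                buf[j] = p->orig_address;
                p = p->next;
        }
        while (j < SLOTS)
                buf[j++] = 0;   /* zero-pad the final buffer */
        return p;               /* NULL once the list is exhausted */
}

static struct pbe *unpack(unsigned long *buf, struct pbe *p)
{
        int j;

        for (j = 0; j < SLOTS && p; j++) {
                p->orig_address = buf[j];
                p = p->next;
        }
        return p;
}

int main(void)
{
        struct pbe a = { 0x1000, NULL }, b = { 0x2000, &a };
        struct pbe x = { 0, NULL }, y = { 0, &x };
        unsigned long buf[SLOTS];

        assert(pack(buf, &b) == NULL);  /* two PBEs fit in one buffer */
        assert(unpack(buf, &y) == NULL);
        assert(y.orig_address == 0x2000 && x.orig_address == 0x1000);
        return 0;
}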
486 448
487
488/** 449/**
489 * write_pagedir - Write the array of pages holding the page directory. 450 * save_image_metadata - save the .orig_address fields of the PBEs
490 * @last: Last swap entry we write (needed for header). 451 * from the list @pblist using the swap map handle @handle
491 */ 452 */
492 453
493static int write_pagedir(void) 454static int save_image_metadata(struct pbe *pblist,
455 struct swap_map_handle *handle)
494{ 456{
495 int error = 0; 457 unsigned long *buf;
496 unsigned int n = 0; 458 unsigned int n = 0;
497 struct pbe *pbe; 459 struct pbe *p;
460 int error = 0;
498 461
499 printk( "Writing pagedir..."); 462 printk("Saving image metadata ... ");
500 for_each_pb_page (pbe, pagedir_nosave) { 463 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
501 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) 464 if (!buf)
502 return error; 465 return -ENOMEM;
466 p = pblist;
467 while (p) {
468 p = pack_orig_addresses(buf, p);
469 error = swap_map_write_page(handle, (unsigned long)buf);
470 if (error)
471 break;
472 n++;
503 } 473 }
504 474 free_page((unsigned long)buf);
505 swsusp_info.pagedir_pages = n; 475 if (!error)
506 printk("done (%u pages)\n", n); 476 printk("done (%u pages saved)\n", n);
507 return error; 477 return error;
508} 478}
509 479
@@ -511,75 +481,125 @@ static int write_pagedir(void)
511 * enough_swap - Make sure we have enough swap to save the image. 481 * enough_swap - Make sure we have enough swap to save the image.
512 * 482 *
513 * Returns TRUE or FALSE after checking the total amount of swap 483 * Returns TRUE or FALSE after checking the total amount of swap
514 * space available. 484 * space available from the resume partition.
515 *
516 * FIXME: si_swapinfo(&i) returns all swap devices information.
517 * We should only consider resume_device.
518 */ 485 */
519 486
520static int enough_swap(unsigned int nr_pages) 487static int enough_swap(unsigned int nr_pages)
521{ 488{
522 struct sysinfo i; 489 unsigned int free_swap = swap_info[root_swap].pages -
490 swap_info[root_swap].inuse_pages;
523 491
524 si_swapinfo(&i); 492 pr_debug("swsusp: free swap pages: %u\n", free_swap);
525 pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); 493 return free_swap > (nr_pages + PAGES_FOR_IO +
526 return i.freeswap > (nr_pages + PAGES_FOR_IO +
527 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); 494 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
528} 495}
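
The reserve demanded by enough_swap() (and by enough_free_mem() at the top of this diff) is the data pages plus PAGES_FOR_IO for the block layer plus one PBE page per PBES_PER_PAGE data pages. Plugging in hypothetical numbers, since the real constants vary by configuration:

/* Worked instance of the enough_swap() reserve check; all numbers
 * are made up (4 KiB pages, PBES_PER_PAGE = 340, PAGES_FOR_IO = 1024). */
#include <stdio.h>

int main(void)
{
        unsigned int nr_pages = 50000, pbes_per_page = 340,
                     pages_for_io = 1024;
        unsigned int need = nr_pages + pages_for_io +
                (nr_pages + pbes_per_page - 1) / pbes_per_page;

        printf("need %u free swap pages for a %u-page image\n",
               need, nr_pages);         /* 51172 */
        return 0;
}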
529 496
530/** 497/**
531 * write_suspend_image - Write entire image and metadata. 498 * swsusp_write - Write entire image and metadata.
532 * 499 *
500 * It is important _NOT_ to umount filesystems at this point. We want
501 * them synced (in case something goes wrong) but we DO not want to mark
502 * filesystem clean: it is not. (And it does not matter, if we resume
503 * correctly, we'll mark system clean, anyway.)
533 */ 504 */
534static int write_suspend_image(void) 505
506int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
535{ 507{
508 struct swap_map_page *swap_map;
509 struct swap_map_handle handle;
510 swp_entry_t start;
536 int error; 511 int error;
537 512
538 if (!enough_swap(nr_copy_pages)) { 513 if ((error = swsusp_swap_check())) {
514 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
515 return error;
516 }
517 if (!enough_swap(nr_pages)) {
539 printk(KERN_ERR "swsusp: Not enough free swap\n"); 518 printk(KERN_ERR "swsusp: Not enough free swap\n");
540 return -ENOSPC; 519 return -ENOSPC;
541 } 520 }
542 521
543 init_header(); 522 init_header(nr_pages);
544 if ((error = data_write())) 523 swap_map = alloc_swap_map(swsusp_info.pages);
545 goto FreeData; 524 if (!swap_map)
525 return -ENOMEM;
526 init_swap_map_handle(&handle, swap_map);
527
528 error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
529 if (!error)
530 error = save_image_metadata(pblist, &handle);
531 if (!error)
532 error = save_image_data(pblist, &handle, nr_pages);
533 if (error)
534 goto Free_image_entries;
546 535
547 if ((error = write_pagedir())) 536 swap_map = reverse_swap_map(swap_map);
548 goto FreePagedir; 537 error = save_swap_map(swap_map, &start);
538 if (error)
539 goto Free_map_entries;
549 540
550 if ((error = close_swap())) 541 dump_info();
551 goto FreePagedir; 542 printk( "S" );
552 Done: 543 error = mark_swapfiles(start);
553 memset(key_iv, 0, MAXKEY+MAXIV); 544 printk( "|\n" );
545 if (error)
546 goto Free_map_entries;
547
548Free_swap_map:
549 free_swap_map(swap_map);
554 return error; 550 return error;
555 FreePagedir: 551
556 free_pagedir_entries(); 552Free_map_entries:
557 FreeData: 553 free_swap_map_entries(swap_map);
558 data_free(); 554Free_image_entries:
559 goto Done; 555 free_image_entries(swap_map);
556 goto Free_swap_map;
560} 557}
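
The error unwinding in swsusp_write() is the usual kernel goto ladder, with the twist that the in-memory map is freed on success and failure alike while the swap entries are released only on failure. A compilable skeleton of just that control flow; every helper here is a stub, not kernel API:

/* Skeleton of the goto-based unwind used by swsusp_write() above. */
#include <stdio.h>

static int write_image(void)      { return 0; }
static int write_map(void)        { return 0; }
static int stamp_signature(void)  { return 0; }
static void free_map_memory(void) { puts("map memory freed"); }
static void undo_map_swap(void)   { puts("map swap entries freed"); }
static void undo_data_swap(void)  { puts("data swap entries freed"); }

static int save_everything(void)
{
        int error;

        if ((error = write_image()))
                goto Free_image_entries;
        if ((error = write_map()))
                goto Free_map_entries;
        if ((error = stamp_signature()))
                goto Free_map_entries;
Free_swap_map:
        free_map_memory();      /* reached on success and failure alike */
        return error;
Free_map_entries:
        undo_map_swap();        /* failure paths fall through ... */
Free_image_entries:
        undo_data_swap();       /* ... releasing swap in reverse order */
        goto Free_swap_map;
}

int main(void)
{
        return save_everything();
}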
561 558
562/* It is important _NOT_ to umount filesystems at this point. We want 559/**
563 * them synced (in case something goes wrong) but we DO not want to mark 560 * swsusp_shrink_memory - Try to free as much memory as needed
564 * filesystem clean: it is not. (And it does not matter, if we resume 561 *
565 * correctly, we'll mark system clean, anyway.) 562 * ... but do not OOM-kill anyone
563 *
564 * Notice: all userland should be stopped before it is called, or
565 * livelock is possible.
566 */ 566 */
567int swsusp_write(void)
568{
569 int error;
570 567
571 if ((error = swsusp_swap_check())) { 568#define SHRINK_BITE 10000
572 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
573 return error;
574 }
575 lock_swapdevices();
576 error = write_suspend_image();
577 /* This will unlock ignored swap devices since writing is finished */
578 lock_swapdevices();
579 return error;
580}
581 569
570int swsusp_shrink_memory(void)
571{
572 long size, tmp;
573 struct zone *zone;
574 unsigned long pages = 0;
575 unsigned int i = 0;
576 char *p = "-\\|/";
577
578 printk("Shrinking memory... ");
579 do {
580 size = 2 * count_highmem_pages();
581 size += size / 50 + count_data_pages();
582 size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
583 PAGES_FOR_IO;
584 tmp = size;
585 for_each_zone (zone)
586 if (!is_highmem(zone))
587 tmp -= zone->free_pages;
588 if (tmp > 0) {
589 tmp = shrink_all_memory(SHRINK_BITE);
590 if (!tmp)
591 return -ENOMEM;
592 pages += tmp;
593 } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) {
594 tmp = shrink_all_memory(SHRINK_BITE);
595 pages += tmp;
596 }
597 printk("\b%c", p[i++%4]);
598 } while (tmp > 0);
599 printk("\bdone (%lu pages freed)\n", pages);
582 600
601 return 0;
602}
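
swsusp_shrink_memory() keeps calling shrink_all_memory() in SHRINK_BITE chunks until the projected image, two copies of highmem plus 2% slack, the data pages, the map overhead and the I/O reserve, fits in free lowmem, and then keeps going while the image is still larger than image_size. The projection itself, with hypothetical inputs and 4 KiB pages:

/* Worked instance of the target-size projection above. */
#include <stdio.h>

int main(void)
{
        long highmem = 20000, data = 60000;     /* pages, made up */
        long pbes_per_page = 340, pages_for_io = 1024;
        long size;

        size = 2 * highmem;                     /* highmem gets copied */
        size += size / 50 + data;               /* +2% slack, +data pages */
        size += (size + pbes_per_page - 1) / pbes_per_page + pages_for_io;
        printf("need %ld free lowmem pages\n", size);   /* 102121 */
        return 0;
}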
583 603
584int swsusp_suspend(void) 604int swsusp_suspend(void)
585{ 605{
@@ -677,7 +697,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
677 /* We assume both lists contain the same number of elements */ 697 /* We assume both lists contain the same number of elements */
678 while (src) { 698 while (src) {
679 dst->orig_address = src->orig_address; 699 dst->orig_address = src->orig_address;
680 dst->swap_address = src->swap_address;
681 dst = dst->next; 700 dst = dst->next;
682 src = src->next; 701 src = src->next;
683 } 702 }
@@ -757,198 +776,224 @@ static int bio_write_page(pgoff_t page_off, void *page)
757 return submit(WRITE, page_off, page); 776 return submit(WRITE, page_off, page);
758} 777}
759 778
760/* 779/**
761 * Sanity check if this image makes sense with this kernel/swap context 780 * The following functions allow us to read data using a swap map
762 * I really don't think that it's foolproof but more than nothing.. 781 * in a file-alike way
763 */ 782 */
764 783
765static const char *sanity_check(void) 784static inline void release_swap_map_reader(struct swap_map_handle *handle)
766{ 785{
767 dump_info(); 786 if (handle->cur)
768 if (swsusp_info.version_code != LINUX_VERSION_CODE) 787 free_page((unsigned long)handle->cur);
769 return "kernel version"; 788 handle->cur = NULL;
770 if (swsusp_info.num_physpages != num_physpages)
771 return "memory size";
772 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
773 return "system type";
774 if (strcmp(swsusp_info.uts.release,system_utsname.release))
775 return "kernel release";
776 if (strcmp(swsusp_info.uts.version,system_utsname.version))
777 return "version";
778 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
779 return "machine";
780#if 0
781 /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
782 if (swsusp_info.cpus != num_possible_cpus())
783 return "number of cpus";
784#endif
785 return NULL;
786} 789}
787 790
788 791static inline int get_swap_map_reader(struct swap_map_handle *handle,
789static int check_header(void) 792 swp_entry_t start)
790{ 793{
791 const char *reason = NULL;
792 int error; 794 int error;
793 795
794 if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) 796 if (!swp_offset(start))
797 return -EINVAL;
798 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
799 if (!handle->cur)
800 return -ENOMEM;
801 error = bio_read_page(swp_offset(start), handle->cur);
802 if (error) {
803 release_swap_map_reader(handle);
795 return error; 804 return error;
796
797 /* Is this same machine? */
798 if ((reason = sanity_check())) {
799 printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason);
800 return -EPERM;
801 } 805 }
802 nr_copy_pages = swsusp_info.image_pages; 806 handle->k = 0;
803 return error; 807 return 0;
804} 808}
805 809
806static int check_sig(void) 810static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
807{ 811{
812 unsigned long offset;
808 int error; 813 int error;
809 814
810 memset(&swsusp_header, 0, sizeof(swsusp_header)); 815 if (!handle->cur)
811 if ((error = bio_read_page(0, &swsusp_header))) 816 return -EINVAL;
812 return error; 817 offset = swp_offset(handle->cur->entries[handle->k]);
813 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 818 if (!offset)
814 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
815 memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
816 memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
817
818 /*
819 * Reset swap signature now.
820 */
821 error = bio_write_page(0, &swsusp_header);
822 } else {
823 return -EINVAL; 819 return -EINVAL;
820 error = bio_read_page(offset, buf);
821 if (error)
822 return error;
823 if (++handle->k >= MAP_PAGE_SIZE) {
824 handle->k = 0;
825 offset = swp_offset(handle->cur->next_swap);
826 if (!offset)
827 release_swap_map_reader(handle);
828 else
829 error = bio_read_page(offset, handle->cur);
824 } 830 }
825 if (!error)
826 pr_debug("swsusp: Signature found, resuming\n");
827 return error; 831 return error;
828} 832}
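
The reader side delivers the memory saving promised in the big comment block: swap_map_read_page() keeps exactly one map page resident in handle->cur and streams data blocks through a second buffer, following .next_swap whenever the k index wraps. A shrunken model with a fake two-map-page "disk" (three entries per map page, contents made up):

/* Reader model of swap_map_read_page(); not the kernel interface. */
#include <stdio.h>

#define ENTRIES 3

struct map_page { int entries[ENTRIES]; int next; };

/* fake swap device: map "blocks" 1 and 2, data lives elsewhere */
static const struct map_page disk_map[2] = {
        { { 10, 11, 12 }, 2 },  /* first map page chains to block 2 */
        { { 13, 14, 0 }, 0 },   /* second map page ends the chain */
};

int main(void)
{
        struct map_page cur = disk_map[0];      /* "read" map block 1 */
        int k = 0;

        for (;;) {
                int off = cur.entries[k];

                if (!off)
                        break;                  /* zero entry: image ends */
                printf("load data block %d\n", off);
                if (++k >= ENTRIES) {
                        k = 0;
                        if (!cur.next)
                                break;
                        cur = disk_map[cur.next - 1];   /* follow next_swap */
                }
        }
        return 0;
}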
829 833
830/** 834static int check_header(void)
831 * data_read - Read image pages from swap.
832 *
833 * You do not need to check for overlaps, check_pagedir()
834 * already did that.
835 */
836
837static int data_read(struct pbe *pblist)
838{ 835{
839 struct pbe *p; 836 char *reason = NULL;
840 int error = 0;
841 int i = 0;
842 int mod = swsusp_info.image_pages / 100;
843 void *tfm;
844
845 if ((error = crypto_init(0, &tfm)))
846 return error;
847
848 if (!mod)
849 mod = 1;
850
851 printk("swsusp: Reading image data (%lu pages): ",
852 swsusp_info.image_pages);
853
854 for_each_pbe (p, pblist) {
855 if (!(i % mod))
856 printk("\b\b\b\b%3d%%", i / mod);
857 837
858 if ((error = crypto_read(p, tfm))) { 838 dump_info();
859 crypto_exit(tfm); 839 if (swsusp_info.version_code != LINUX_VERSION_CODE)
860 return error; 840 reason = "kernel version";
861 } 841 if (swsusp_info.num_physpages != num_physpages)
862 842 reason = "memory size";
863 i++; 843 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
844 reason = "system type";
845 if (strcmp(swsusp_info.uts.release,system_utsname.release))
846 reason = "kernel release";
847 if (strcmp(swsusp_info.uts.version,system_utsname.version))
848 reason = "version";
849 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
850 reason = "machine";
851 if (reason) {
852 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
853 return -EPERM;
864 } 854 }
865 printk("\b\b\b\bdone\n"); 855 return 0;
866 crypto_exit(tfm);
867 return error;
868} 856}
869 857
870/** 858/**
871 * read_pagedir - Read page backup list pages from swap 859 * load_image_data - load the image data using the swap map handle
860 * @handle and store them using the page backup list @pblist
861 * (assume there are @nr_pages pages to load)
872 */ 862 */
873 863
874static int read_pagedir(struct pbe *pblist) 864static int load_image_data(struct pbe *pblist,
865 struct swap_map_handle *handle,
866 unsigned int nr_pages)
875{ 867{
876 struct pbe *pbpage, *p;
877 unsigned int i = 0;
878 int error; 868 int error;
869 unsigned int m;
870 struct pbe *p;
879 871
880 if (!pblist) 872 if (!pblist)
881 return -EFAULT; 873 return -EINVAL;
882 874 printk("Loading image data pages (%u pages) ... ", nr_pages);
883 printk("swsusp: Reading pagedir (%lu pages)\n", 875 m = nr_pages / 100;
884 swsusp_info.pagedir_pages); 876 if (!m)
885 877 m = 1;
886 for_each_pb_page (pbpage, pblist) { 878 nr_pages = 0;
887 unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); 879 p = pblist;
888 880 while (p) {
889 error = -EFAULT; 881 error = swap_map_read_page(handle, (void *)p->address);
890 if (offset) {
891 p = (pbpage + PB_PAGE_SKIP)->next;
892 error = bio_read_page(offset, (void *)pbpage);
893 (pbpage + PB_PAGE_SKIP)->next = p;
894 }
895 if (error) 882 if (error)
896 break; 883 break;
884 p = p->next;
885 if (!(nr_pages % m))
886 printk("\b\b\b\b%3d%%", nr_pages / m);
887 nr_pages++;
897 } 888 }
898
899 if (!error) 889 if (!error)
900 BUG_ON(i != swsusp_info.pagedir_pages); 890 printk("\b\b\b\bdone\n");
901
902 return error; 891 return error;
903} 892}
904 893
894/**
895 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to
896 * the PBEs in the list starting at @pbe
897 */
905 898
906static int check_suspend_image(void) 899static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
900 struct pbe *pbe)
907{ 901{
908 int error = 0; 902 int j;
909 903
910 if ((error = check_sig())) 904 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
911 return error; 905 pbe->orig_address = buf[j];
912 906 pbe = pbe->next;
913 if ((error = check_header())) 907 }
914 return error; 908 return pbe;
915
916 return 0;
917} 909}
918 910
919static int read_suspend_image(void) 911/**
912 * load_image_metadata - load the image metadata using the swap map
913 * handle @handle and put them into the PBEs in the list @pblist
914 */
915
916static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
920{ 917{
921 int error = 0;
922 struct pbe *p; 918 struct pbe *p;
919 unsigned long *buf;
920 unsigned int n = 0;
921 int error = 0;
923 922
924 if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) 923 printk("Loading image metadata ... ");
924 buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
925 if (!buf)
925 return -ENOMEM; 926 return -ENOMEM;
926 927 p = pblist;
927 if ((error = read_pagedir(p))) 928 while (p) {
928 return error; 929 error = swap_map_read_page(handle, buf);
929 create_pbe_list(p, nr_copy_pages); 930 if (error)
930 mark_unsafe_pages(p); 931 break;
931 pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 932 p = unpack_orig_addresses(buf, p);
932 if (pagedir_nosave) { 933 n++;
933 create_pbe_list(pagedir_nosave, nr_copy_pages);
934 copy_page_backup_list(pagedir_nosave, p);
935 } 934 }
936 free_pagedir(p); 935 free_page((unsigned long)buf);
937 if (!pagedir_nosave) 936 if (!error)
938 return -ENOMEM; 937 printk("done (%u pages loaded)\n", n);
938 return error;
939}
939 940
940 /* Allocate memory for the image and read the data from swap */ 941int swsusp_read(struct pbe **pblist_ptr)
942{
943 int error;
944 struct pbe *p, *pblist;
945 struct swap_map_handle handle;
946 unsigned int nr_pages;
941 947
942 error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); 948 if (IS_ERR(resume_bdev)) {
949 pr_debug("swsusp: block device not initialised\n");
950 return PTR_ERR(resume_bdev);
951 }
943 952
953 error = get_swap_map_reader(&handle, swsusp_header.image);
944 if (!error) 954 if (!error)
945 error = data_read(pagedir_nosave); 955 error = swap_map_read_page(&handle, &swsusp_info);
956 if (!error)
957 error = check_header();
958 if (error)
959 return error;
960 nr_pages = swsusp_info.image_pages;
961 p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
962 if (!p)
963 return -ENOMEM;
964 error = load_image_metadata(p, &handle);
965 if (!error) {
966 mark_unsafe_pages(p);
967 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
968 if (pblist)
969 copy_page_backup_list(pblist, p);
970 free_pagedir(p);
971 if (!pblist)
972 error = -ENOMEM;
973
974 /* Allocate memory for the image and read the data from swap */
975 if (!error)
976 error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
977 if (!error) {
978 release_eaten_pages();
979 error = load_image_data(pblist, &handle, nr_pages);
980 }
981 if (!error)
982 *pblist_ptr = pblist;
983 }
984 release_swap_map_reader(&handle);
946 985
986 blkdev_put(resume_bdev);
987
988 if (!error)
989 pr_debug("swsusp: Reading resume file was successful\n");
990 else
991 pr_debug("swsusp: Error %d resuming\n", error);
947 return error; 992 return error;
948} 993}
949 994
950/** 995/**
951 * swsusp_check - Check for saved image in swap 996 * swsusp_check - Check for swsusp signature in the resume device
952 */ 997 */
953 998
954int swsusp_check(void) 999int swsusp_check(void)
@@ -958,40 +1003,27 @@ int swsusp_check(void)
958 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 1003 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
959 if (!IS_ERR(resume_bdev)) { 1004 if (!IS_ERR(resume_bdev)) {
960 set_blocksize(resume_bdev, PAGE_SIZE); 1005 set_blocksize(resume_bdev, PAGE_SIZE);
961 error = check_suspend_image(); 1006 memset(&swsusp_header, 0, sizeof(swsusp_header));
1007 if ((error = bio_read_page(0, &swsusp_header)))
1008 return error;
1009 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1010 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1011 /* Reset swap signature now */
1012 error = bio_write_page(0, &swsusp_header);
1013 } else {
1014 return -EINVAL;
1015 }
962 if (error) 1016 if (error)
963 blkdev_put(resume_bdev); 1017 blkdev_put(resume_bdev);
964 } else 1018 else
1019 pr_debug("swsusp: Signature found, resuming\n");
1020 } else {
965 error = PTR_ERR(resume_bdev); 1021 error = PTR_ERR(resume_bdev);
966
967 if (!error)
968 pr_debug("swsusp: resume file found\n");
969 else
970 pr_debug("swsusp: Error %d check for resume file\n", error);
971 return error;
972}
973
974/**
975 * swsusp_read - Read saved image from swap.
976 */
977
978int swsusp_read(void)
979{
980 int error;
981
982 if (IS_ERR(resume_bdev)) {
983 pr_debug("swsusp: block device not initialised\n");
984 return PTR_ERR(resume_bdev);
985 } 1022 }
986 1023
987 error = read_suspend_image(); 1024 if (error)
988 blkdev_put(resume_bdev); 1025 pr_debug("swsusp: Error %d check for resume file\n", error);
989 memset(key_iv, 0, MAXKEY+MAXIV);
990 1026
991 if (!error)
992 pr_debug("swsusp: Reading resume file was successful\n");
993 else
994 pr_debug("swsusp: Error %d resuming\n", error);
995 return error; 1027 return error;
996} 1028}
997 1029
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 345f4a1d533f..a85047bb5739 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -108,7 +108,7 @@ extern int pwrsw_enabled;
108extern int unaligned_enabled; 108extern int unaligned_enabled;
109#endif 109#endif
110 110
111#ifdef CONFIG_ARCH_S390 111#ifdef CONFIG_S390
112#ifdef CONFIG_MATHEMU 112#ifdef CONFIG_MATHEMU
113extern int sysctl_ieee_emulation_warnings; 113extern int sysctl_ieee_emulation_warnings;
114#endif 114#endif
@@ -542,7 +542,7 @@ static ctl_table kern_table[] = {
542 .extra1 = &minolduid, 542 .extra1 = &minolduid,
543 .extra2 = &maxolduid, 543 .extra2 = &maxolduid,
544 }, 544 },
545#ifdef CONFIG_ARCH_S390 545#ifdef CONFIG_S390
546#ifdef CONFIG_MATHEMU 546#ifdef CONFIG_MATHEMU
547 { 547 {
548 .ctl_name = KERN_IEEE_EMULATION_WARNINGS, 548 .ctl_name = KERN_IEEE_EMULATION_WARNINGS,
@@ -644,7 +644,7 @@ static ctl_table kern_table[] = {
644 .mode = 0644, 644 .mode = 0644,
645 .proc_handler = &proc_dointvec, 645 .proc_handler = &proc_dointvec,
646 }, 646 },
647#if defined(CONFIG_ARCH_S390) 647#if defined(CONFIG_S390)
648 { 648 {
649 .ctl_name = KERN_SPIN_RETRY, 649 .ctl_name = KERN_SPIN_RETRY,
650 .procname = "spin_retry", 650 .procname = "spin_retry",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 156822e3cc79..80598cfd728c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -32,7 +32,7 @@ config MAGIC_SYSRQ
32config LOG_BUF_SHIFT 32config LOG_BUF_SHIFT
33 int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL 33 int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL
34 range 12 21 34 range 12 21
35 default 17 if ARCH_S390 35 default 17 if S390
36 default 16 if X86_NUMAQ || IA64 36 default 16 if X86_NUMAQ || IA64
37 default 15 if SMP 37 default 15 if SMP
38 default 14 38 default 14
@@ -172,7 +172,8 @@ config DEBUG_VM
172 bool "Debug VM" 172 bool "Debug VM"
173 depends on DEBUG_KERNEL 173 depends on DEBUG_KERNEL
174 help 174 help
175 Enable this to debug the virtual-memory system. 175 Enable this to turn on extended checks in the virtual-memory system
176 that may impact performance.
176 177
177 If unsure, say N. 178 If unsure, say N.
178 179
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 1ff8dcebf7c6..3b482052f403 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -142,8 +142,7 @@ swiotlb_init_with_default_size (size_t default_size)
142 /* 142 /*
143 * Get IO TLB memory from the low pages 143 * Get IO TLB memory from the low pages
144 */ 144 */
145 io_tlb_start = alloc_bootmem_low_pages_limit(io_tlb_nslabs * 145 io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT));
146 (1 << IO_TLB_SHIFT), 0x100000000);
147 if (!io_tlb_start) 146 if (!io_tlb_start)
148 panic("Cannot allocate SWIOTLB buffer"); 147 panic("Cannot allocate SWIOTLB buffer");
149 io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT); 148 io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT);
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8f..b3db11f137e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
11 11
12config FLATMEM_MANUAL 12config FLATMEM_MANUAL
13 bool "Flat Memory" 13 bool "Flat Memory"
14 depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE 14 depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
15 help 15 help
16 This option allows you to change some of the ways that 16 This option allows you to change some of the ways that
17 Linux manages its memory internally. Most users will 17 Linux manages its memory internally. Most users will
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4eb..35c32290f717 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
296 unsigned long v = ~map[i / BITS_PER_LONG]; 296 unsigned long v = ~map[i / BITS_PER_LONG];
297 297
298 if (gofast && v == ~0UL) { 298 if (gofast && v == ~0UL) {
299 int j, order; 299 int order;
300 300
301 page = pfn_to_page(pfn); 301 page = pfn_to_page(pfn);
302 count += BITS_PER_LONG; 302 count += BITS_PER_LONG;
303 __ClearPageReserved(page);
304 order = ffs(BITS_PER_LONG) - 1; 303 order = ffs(BITS_PER_LONG) - 1;
305 set_page_refs(page, order); 304 __free_pages_bootmem(page, order);
306 for (j = 1; j < BITS_PER_LONG; j++) {
307 if (j + 16 < BITS_PER_LONG)
308 prefetchw(page + j + 16);
309 __ClearPageReserved(page + j);
310 set_page_count(page + j, 0);
311 }
312 __free_pages(page, order);
313 i += BITS_PER_LONG; 305 i += BITS_PER_LONG;
314 page += BITS_PER_LONG; 306 page += BITS_PER_LONG;
315 } else if (v) { 307 } else if (v) {
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
319 for (m = 1; m && i < idx; m<<=1, page++, i++) { 311 for (m = 1; m && i < idx; m<<=1, page++, i++) {
320 if (v & m) { 312 if (v & m) {
321 count++; 313 count++;
322 __ClearPageReserved(page); 314 __free_pages_bootmem(page, 0);
323 set_page_refs(page, 0);
324 __free_page(page);
325 } 315 }
326 } 316 }
327 } else { 317 } else {
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
339 count = 0; 329 count = 0;
340 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 330 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
341 count++; 331 count++;
342 __ClearPageReserved(page); 332 __free_pages_bootmem(page, 0);
343 set_page_count(page, 1);
344 __free_page(page);
345 } 333 }
346 total += count; 334 total += count;
347 bdata->node_bootmem_map = NULL; 335 bdata->node_bootmem_map = NULL;
@@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
393 return(free_all_bootmem_core(NODE_DATA(0))); 381 return(free_all_bootmem_core(NODE_DATA(0)));
394} 382}
395 383
396void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, 384void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
397 unsigned long limit)
398{ 385{
399 pg_data_t *pgdat = pgdat_list; 386 pg_data_t *pgdat = pgdat_list;
400 void *ptr; 387 void *ptr;
401 388
402 for_each_pgdat(pgdat) 389 for_each_pgdat(pgdat)
403 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 390 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
404 align, goal, limit))) 391 align, goal, 0)))
405 return(ptr); 392 return(ptr);
406 393
407 /* 394 /*
@@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
413} 400}
414 401
415 402
416void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, 403void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
417 unsigned long goal, unsigned long limit) 404 unsigned long goal)
418{ 405{
419 void *ptr; 406 void *ptr;
420 407
421 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); 408 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
422 if (ptr) 409 if (ptr)
423 return (ptr); 410 return (ptr);
424 411
425 return __alloc_bootmem_limit(size, align, goal, limit); 412 return __alloc_bootmem(size, align, goal);
413}
414
415#define LOW32LIMIT 0xffffffff
416
417void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
418{
419 pg_data_t *pgdat = pgdat_list;
420 void *ptr;
421
422 for_each_pgdat(pgdat)
423 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
424 align, goal, LOW32LIMIT)))
425 return(ptr);
426
427 /*
428 * Whoops, we cannot satisfy the allocation request.
429 */
430 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
431 panic("Out of low memory");
432 return NULL;
426} 433}
427 434
435void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
436 unsigned long align, unsigned long goal)
437{
438 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
439}
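
The old limit argument to the bootmem allocators is replaced by dedicated _low variants hard-wired to LOW32LIMIT, which is why the swiotlb hunk below can return to plain alloc_bootmem_low_pages(): bounce buffers handed to 32-bit DMA engines must sit below 4 GiB. A standalone illustration of the bound (helper name and inputs are made up):

/* Why LOW32LIMIT is 0xffffffff: 32-bit DMA addressing. */
#include <stdint.h>
#include <stdio.h>

#define LOW32LIMIT 0xffffffffULL

static int dma32_ok(uint64_t phys, uint64_t len)
{
        return phys + len - 1 <= LOW32LIMIT;
}

int main(void)
{
        printf("%d\n", dma32_ok(0xfffff000ULL, 0x1000));  /* 1: ends at 4G-1 */
        printf("%d\n", dma32_ok(0x100000000ULL, 0x1000)); /* 0: above 4G */
        return 0;
}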
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde158..4ef24a397684 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -555,11 +555,12 @@ repeat:
555 page_cache_get(page); 555 page_cache_get(page);
556 if (TestSetPageLocked(page)) { 556 if (TestSetPageLocked(page)) {
557 read_unlock_irq(&mapping->tree_lock); 557 read_unlock_irq(&mapping->tree_lock);
558 lock_page(page); 558 __lock_page(page);
559 read_lock_irq(&mapping->tree_lock); 559 read_lock_irq(&mapping->tree_lock);
560 560
561 /* Has the page been truncated while we slept? */ 561 /* Has the page been truncated while we slept? */
562 if (page->mapping != mapping || page->index != offset) { 562 if (unlikely(page->mapping != mapping ||
563 page->index != offset)) {
563 unlock_page(page); 564 unlock_page(page);
564 page_cache_release(page); 565 page_cache_release(page);
565 goto repeat; 566 goto repeat;
@@ -831,8 +832,13 @@ readpage:
831 /* Start the actual read. The read will unlock the page. */ 832 /* Start the actual read. The read will unlock the page. */
832 error = mapping->a_ops->readpage(filp, page); 833 error = mapping->a_ops->readpage(filp, page);
833 834
834 if (unlikely(error)) 835 if (unlikely(error)) {
836 if (error == AOP_TRUNCATED_PAGE) {
837 page_cache_release(page);
838 goto find_page;
839 }
835 goto readpage_error; 840 goto readpage_error;
841 }
836 842
837 if (!PageUptodate(page)) { 843 if (!PageUptodate(page)) {
838 lock_page(page); 844 lock_page(page);
@@ -1152,26 +1158,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152{ 1158{
1153 struct address_space *mapping = file->f_mapping; 1159 struct address_space *mapping = file->f_mapping;
1154 struct page *page; 1160 struct page *page;
1155 int error; 1161 int ret;
1156 1162
1157 page = page_cache_alloc_cold(mapping); 1163 do {
1158 if (!page) 1164 page = page_cache_alloc_cold(mapping);
1159 return -ENOMEM; 1165 if (!page)
1166 return -ENOMEM;
1167
1168 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1169 if (ret == 0)
1170 ret = mapping->a_ops->readpage(file, page);
1171 else if (ret == -EEXIST)
1172 ret = 0; /* losing race to add is OK */
1160 1173
1161 error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1162 if (!error) {
1163 error = mapping->a_ops->readpage(file, page);
1164 page_cache_release(page); 1174 page_cache_release(page);
1165 return error;
1166 }
1167 1175
1168 /* 1176 } while (ret == AOP_TRUNCATED_PAGE);
1169 * We arrive here in the unlikely event that someone 1177
1170 * raced with us and added our page to the cache first 1178 return ret;
1171 * or we are out of memory for radix-tree nodes.
1172 */
1173 page_cache_release(page);
1174 return error == -EEXIST ? 0 : error;
1175} 1179}
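
AOP_TRUNCATED_PAGE is a new contract between ->readpage()/->prepare_write() and their callers: the address-space operation may have unlocked and invalidated the page after losing a race with truncation, in which case the caller must drop its page reference and retry from scratch, as the rewritten page_cache_read() above does. The bare retry shape as a userspace sketch; the stub and its behaviour are stand-ins, not the kernel interface:

/* Bare shape of the AOP_TRUNCATED_PAGE retry loop. */
#include <stdio.h>

#define AOP_TRUNCATED_PAGE 0x80001      /* stand-in value */

static int races_left = 2;

static int readpage_stub(void)
{
        /* pretend we lose the truncate race twice, then succeed */
        return races_left-- > 0 ? AOP_TRUNCATED_PAGE : 0;
}

int main(void)
{
        int ret, attempts = 0;

        do {
                /* allocate a page and add it to the cache here */
                attempts++;
                ret = readpage_stub();
                /* drop our page reference here, win or lose */
        } while (ret == AOP_TRUNCATED_PAGE);
        printf("succeeded on attempt %d, ret %d\n", attempts, ret);
        return 0;
}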
1176 1180
1177#define MMAP_LOTSAMISS (100) 1181#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1335,14 @@ page_not_uptodate:
1331 goto success; 1335 goto success;
1332 } 1336 }
1333 1337
1334 if (!mapping->a_ops->readpage(file, page)) { 1338 error = mapping->a_ops->readpage(file, page);
1339 if (!error) {
1335 wait_on_page_locked(page); 1340 wait_on_page_locked(page);
1336 if (PageUptodate(page)) 1341 if (PageUptodate(page))
1337 goto success; 1342 goto success;
1343 } else if (error == AOP_TRUNCATED_PAGE) {
1344 page_cache_release(page);
1345 goto retry_find;
1338 } 1346 }
1339 1347
1340 /* 1348 /*
@@ -1358,10 +1366,14 @@ page_not_uptodate:
1358 goto success; 1366 goto success;
1359 } 1367 }
1360 ClearPageError(page); 1368 ClearPageError(page);
1361 if (!mapping->a_ops->readpage(file, page)) { 1369 error = mapping->a_ops->readpage(file, page);
1370 if (!error) {
1362 wait_on_page_locked(page); 1371 wait_on_page_locked(page);
1363 if (PageUptodate(page)) 1372 if (PageUptodate(page))
1364 goto success; 1373 goto success;
1374 } else if (error == AOP_TRUNCATED_PAGE) {
1375 page_cache_release(page);
1376 goto retry_find;
1365 } 1377 }
1366 1378
1367 /* 1379 /*
@@ -1444,10 +1456,14 @@ page_not_uptodate:
1444 goto success; 1456 goto success;
1445 } 1457 }
1446 1458
1447 if (!mapping->a_ops->readpage(file, page)) { 1459 error = mapping->a_ops->readpage(file, page);
1460 if (!error) {
1448 wait_on_page_locked(page); 1461 wait_on_page_locked(page);
1449 if (PageUptodate(page)) 1462 if (PageUptodate(page))
1450 goto success; 1463 goto success;
1464 } else if (error == AOP_TRUNCATED_PAGE) {
1465 page_cache_release(page);
1466 goto retry_find;
1451 } 1467 }
1452 1468
1453 /* 1469 /*
@@ -1470,10 +1486,14 @@ page_not_uptodate:
1470 } 1486 }
1471 1487
1472 ClearPageError(page); 1488 ClearPageError(page);
1473 if (!mapping->a_ops->readpage(file, page)) { 1489 error = mapping->a_ops->readpage(file, page);
1490 if (!error) {
1474 wait_on_page_locked(page); 1491 wait_on_page_locked(page);
1475 if (PageUptodate(page)) 1492 if (PageUptodate(page))
1476 goto success; 1493 goto success;
1494 } else if (error == AOP_TRUNCATED_PAGE) {
1495 page_cache_release(page);
1496 goto retry_find;
1477 } 1497 }
1478 1498
1479 /* 1499 /*
@@ -1934,12 +1954,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1934 status = a_ops->prepare_write(file, page, offset, offset+bytes); 1954 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1935 if (unlikely(status)) { 1955 if (unlikely(status)) {
1936 loff_t isize = i_size_read(inode); 1956 loff_t isize = i_size_read(inode);
1957
1958 if (status != AOP_TRUNCATED_PAGE)
1959 unlock_page(page);
1960 page_cache_release(page);
1961 if (status == AOP_TRUNCATED_PAGE)
1962 continue;
1937 /* 1963 /*
1938 * prepare_write() may have instantiated a few blocks 1964 * prepare_write() may have instantiated a few blocks
1939 * outside i_size. Trim these off again. 1965 * outside i_size. Trim these off again.
1940 */ 1966 */
1941 unlock_page(page);
1942 page_cache_release(page);
1943 if (pos + bytes > isize) 1967 if (pos + bytes > isize)
1944 vmtruncate(inode, isize); 1968 vmtruncate(inode, isize);
1945 break; 1969 break;
@@ -1952,6 +1976,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1952 cur_iov, iov_base, bytes); 1976 cur_iov, iov_base, bytes);
1953 flush_dcache_page(page); 1977 flush_dcache_page(page);
1954 status = a_ops->commit_write(file, page, offset, offset+bytes); 1978 status = a_ops->commit_write(file, page, offset, offset+bytes);
1979 if (status == AOP_TRUNCATED_PAGE) {
1980 page_cache_release(page);
1981 continue;
1982 }
1955 if (likely(copied > 0)) { 1983 if (likely(copied > 0)) {
1956 if (!status) 1984 if (!status)
1957 status = copied; 1985 status = copied;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471b..f4c43d7980ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15
14#include <asm/page.h> 16#include <asm/page.h>
15#include <asm/pgtable.h> 17#include <asm/pgtable.h>
16 18
@@ -36,18 +38,21 @@ static void enqueue_huge_page(struct page *page)
36 free_huge_pages_node[nid]++; 38 free_huge_pages_node[nid]++;
37} 39}
38 40
39static struct page *dequeue_huge_page(void) 41static struct page *dequeue_huge_page(struct vm_area_struct *vma,
42 unsigned long address)
40{ 43{
41 int nid = numa_node_id(); 44 int nid = numa_node_id();
42 struct page *page = NULL; 45 struct page *page = NULL;
46 struct zonelist *zonelist = huge_zonelist(vma, address);
47 struct zone **z;
43 48
44 if (list_empty(&hugepage_freelists[nid])) { 49 for (z = zonelist->zones; *z; z++) {
45 for (nid = 0; nid < MAX_NUMNODES; ++nid) 50 nid = (*z)->zone_pgdat->node_id;
46 if (!list_empty(&hugepage_freelists[nid])) 51 if (!list_empty(&hugepage_freelists[nid]))
47 break; 52 break;
48 } 53 }
49 if (nid >= 0 && nid < MAX_NUMNODES && 54
50 !list_empty(&hugepage_freelists[nid])) { 55 if (*z) {
51 page = list_entry(hugepage_freelists[nid].next, 56 page = list_entry(hugepage_freelists[nid].next,
52 struct page, lru); 57 struct page, lru);
53 list_del(&page->lru); 58 list_del(&page->lru);
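
dequeue_huge_page() now walks the zonelist derived from the faulting VMA's memory policy instead of preferring numa_node_id() and falling back to a linear node scan, so huge-page placement honours NUMA policy. The scan reduces to a first-fit over the policy-ordered nodes, modelled here with made-up per-node counts:

/* Model of the zonelist-ordered free-list scan; data hypothetical. */
#include <stdio.h>

int main(void)
{
        int free_huge_pages_node[4] = { 0, 0, 3, 1 };   /* per-node counts */
        int zonelist[] = { 1, 2, 3, 0, -1 };            /* policy order */
        int i, nid = -1;

        /* take the first node in policy order that has a free page */
        for (i = 0; zonelist[i] >= 0; i++)
                if (free_huge_pages_node[zonelist[i]]) {
                        nid = zonelist[i];
                        break;
                }
        printf("allocating huge page from node %d\n", nid);     /* 2 */
        return 0;
}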
@@ -85,13 +90,13 @@ void free_huge_page(struct page *page)
85 spin_unlock(&hugetlb_lock); 90 spin_unlock(&hugetlb_lock);
86} 91}
87 92
88struct page *alloc_huge_page(void) 93struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
89{ 94{
90 struct page *page; 95 struct page *page;
91 int i; 96 int i;
92 97
93 spin_lock(&hugetlb_lock); 98 spin_lock(&hugetlb_lock);
94 page = dequeue_huge_page(); 99 page = dequeue_huge_page(vma, addr);
95 if (!page) { 100 if (!page) {
96 spin_unlock(&hugetlb_lock); 101 spin_unlock(&hugetlb_lock);
97 return NULL; 102 return NULL;
@@ -194,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
194 spin_lock(&hugetlb_lock); 199 spin_lock(&hugetlb_lock);
195 try_to_free_low(count); 200 try_to_free_low(count);
196 while (count < nr_huge_pages) { 201 while (count < nr_huge_pages) {
197 struct page *page = dequeue_huge_page(); 202 struct page *page = dequeue_huge_page(NULL, 0);
198 if (!page) 203 if (!page)
199 break; 204 break;
200 update_and_free_page(page); 205 update_and_free_page(page);
@@ -261,11 +266,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
261 .nopage = hugetlb_nopage, 266 .nopage = hugetlb_nopage,
262}; 267};
263 268
264static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) 269static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
270 int writable)
265{ 271{
266 pte_t entry; 272 pte_t entry;
267 273
268 if (vma->vm_flags & VM_WRITE) { 274 if (writable) {
269 entry = 275 entry =
270 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 276 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
271 } else { 277 } else {
@@ -277,12 +283,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
277 return entry; 283 return entry;
278} 284}
279 285
286static void set_huge_ptep_writable(struct vm_area_struct *vma,
287 unsigned long address, pte_t *ptep)
288{
289 pte_t entry;
290
291 entry = pte_mkwrite(pte_mkdirty(*ptep));
292 ptep_set_access_flags(vma, address, ptep, entry, 1);
293 update_mmu_cache(vma, address, entry);
294 lazy_mmu_prot_update(entry);
295}
296
297
280int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 298int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
281 struct vm_area_struct *vma) 299 struct vm_area_struct *vma)
282{ 300{
283 pte_t *src_pte, *dst_pte, entry; 301 pte_t *src_pte, *dst_pte, entry;
284 struct page *ptepage; 302 struct page *ptepage;
285 unsigned long addr; 303 unsigned long addr;
304 int cow;
305
306 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
286 307
287 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 308 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
288 src_pte = huge_pte_offset(src, addr); 309 src_pte = huge_pte_offset(src, addr);
@@ -294,6 +315,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
294 spin_lock(&dst->page_table_lock); 315 spin_lock(&dst->page_table_lock);
295 spin_lock(&src->page_table_lock); 316 spin_lock(&src->page_table_lock);
296 if (!pte_none(*src_pte)) { 317 if (!pte_none(*src_pte)) {
318 if (cow)
319 ptep_set_wrprotect(src, addr, src_pte);
297 entry = *src_pte; 320 entry = *src_pte;
298 ptepage = pte_page(entry); 321 ptepage = pte_page(entry);
299 get_page(ptepage); 322 get_page(ptepage);
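
The new cow flag makes fork write-protect huge-page PTEs for private mappings that may gain write permission, so a later write faults into hugetlb_cow() below; shared mappings are left alone. The predicate in isolation, with illustrative flag values:

/* The "cow" test added to copy_hugetlb_page_range() above. */
#include <stdio.h>

#define VM_SHARED   0x08        /* illustrative flag values */
#define VM_MAYWRITE 0x20

static int needs_cow(unsigned long vm_flags)
{
        /* true only when MAYWRITE is set and SHARED is clear */
        return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

int main(void)
{
        printf("private rw: %d\n", needs_cow(VM_MAYWRITE));             /* 1 */
        printf("shared  rw: %d\n", needs_cow(VM_SHARED | VM_MAYWRITE)); /* 0 */
        printf("private ro: %d\n", needs_cow(0));                       /* 0 */
        return 0;
}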
@@ -345,57 +368,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
345 flush_tlb_range(vma, start, end); 368 flush_tlb_range(vma, start, end);
346} 369}
347 370
348static struct page *find_lock_huge_page(struct address_space *mapping, 371static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
349 unsigned long idx) 372 unsigned long address, pte_t *ptep, pte_t pte)
350{ 373{
351 struct page *page; 374 struct page *old_page, *new_page;
352 int err; 375 int i, avoidcopy;
353 struct inode *inode = mapping->host;
354 unsigned long size;
355 376
356retry: 377 old_page = pte_page(pte);
357 page = find_lock_page(mapping, idx);
358 if (page)
359 goto out;
360 378
361 /* Check to make sure the mapping hasn't been truncated */ 379 /* If no-one else is actually using this page, avoid the copy
362 size = i_size_read(inode) >> HPAGE_SHIFT; 380 * and just make the page writable */
363 if (idx >= size) 381 avoidcopy = (page_count(old_page) == 1);
364 goto out; 382 if (avoidcopy) {
383 set_huge_ptep_writable(vma, address, ptep);
384 return VM_FAULT_MINOR;
385 }
365 386
366 if (hugetlb_get_quota(mapping)) 387 page_cache_get(old_page);
367 goto out; 388 new_page = alloc_huge_page(vma, address);
368 page = alloc_huge_page(); 389
369 if (!page) { 390 if (!new_page) {
370 hugetlb_put_quota(mapping); 391 page_cache_release(old_page);
371 goto out; 392
393 /* Logically this is OOM, not a SIGBUS, but an OOM
394 * could cause the kernel to go killing other
395 * processes which won't help the hugepage situation
396 * at all (?) */
397 return VM_FAULT_SIGBUS;
372 } 398 }
373 399
374 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 400 spin_unlock(&mm->page_table_lock);
375 if (err) { 401 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
376 put_page(page); 402 copy_user_highpage(new_page + i, old_page + i,
377 hugetlb_put_quota(mapping); 403 address + i*PAGE_SIZE);
378 if (err == -EEXIST) 404 spin_lock(&mm->page_table_lock);
379 goto retry; 405
380 page = NULL; 406 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
407 if (likely(pte_same(*ptep, pte))) {
408 /* Break COW */
409 set_huge_pte_at(mm, address, ptep,
410 make_huge_pte(vma, new_page, 1));
411 /* Make the old page be freed below */
412 new_page = old_page;
381 } 413 }
382out: 414 page_cache_release(new_page);
383 return page; 415 page_cache_release(old_page);
416 return VM_FAULT_MINOR;
384} 417}
385 418
386int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 419int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
387 unsigned long address, int write_access) 420 unsigned long address, pte_t *ptep, int write_access)
388{ 421{
389 int ret = VM_FAULT_SIGBUS; 422 int ret = VM_FAULT_SIGBUS;
390 unsigned long idx; 423 unsigned long idx;
391 unsigned long size; 424 unsigned long size;
392 pte_t *pte;
393 struct page *page; 425 struct page *page;
394 struct address_space *mapping; 426 struct address_space *mapping;
395 427 pte_t new_pte;
396 pte = huge_pte_alloc(mm, address);
397 if (!pte)
398 goto out;
399 428
400 mapping = vma->vm_file->f_mapping; 429 mapping = vma->vm_file->f_mapping;
401 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 430 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -405,9 +434,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
405 * Use page lock to guard against racing truncation 434 * Use page lock to guard against racing truncation
406 * before we get page_table_lock. 435 * before we get page_table_lock.
407 */ 436 */
408 page = find_lock_huge_page(mapping, idx); 437retry:
409 if (!page) 438 page = find_lock_page(mapping, idx);
410 goto out; 439 if (!page) {
440 if (hugetlb_get_quota(mapping))
441 goto out;
442 page = alloc_huge_page(vma, address);
443 if (!page) {
444 hugetlb_put_quota(mapping);
445 goto out;
446 }
447
448 if (vma->vm_flags & VM_SHARED) {
449 int err;
450
451 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
452 if (err) {
453 put_page(page);
454 hugetlb_put_quota(mapping);
455 if (err == -EEXIST)
456 goto retry;
457 goto out;
458 }
459 } else
460 lock_page(page);
461 }
411 462
412 spin_lock(&mm->page_table_lock); 463 spin_lock(&mm->page_table_lock);
413 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 464 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +466,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
415 goto backout; 466 goto backout;
416 467
417 ret = VM_FAULT_MINOR; 468 ret = VM_FAULT_MINOR;
418 if (!pte_none(*pte)) 469 if (!pte_none(*ptep))
419 goto backout; 470 goto backout;
420 471
421 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 472 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
422 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); 473 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
474 && (vma->vm_flags & VM_SHARED)));
475 set_huge_pte_at(mm, address, ptep, new_pte);
476
477 if (write_access && !(vma->vm_flags & VM_SHARED)) {
478 /* Optimization, do the COW without a second fault */
479 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
480 }
481
423 spin_unlock(&mm->page_table_lock); 482 spin_unlock(&mm->page_table_lock);
424 unlock_page(page); 483 unlock_page(page);
425out: 484out:
@@ -433,6 +492,33 @@ backout:
433 goto out; 492 goto out;
434} 493}
435 494
495int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
496 unsigned long address, int write_access)
497{
498 pte_t *ptep;
499 pte_t entry;
500 int ret;
501
502 ptep = huge_pte_alloc(mm, address);
503 if (!ptep)
504 return VM_FAULT_OOM;
505
506 entry = *ptep;
507 if (pte_none(entry))
508 return hugetlb_no_page(mm, vma, address, ptep, write_access);
509
510 ret = VM_FAULT_MINOR;
511
512 spin_lock(&mm->page_table_lock);
513 /* Check for a racing update before calling hugetlb_cow */
514 if (likely(pte_same(entry, *ptep)))
515 if (write_access && !pte_write(entry))
516 ret = hugetlb_cow(mm, vma, address, ptep, entry);
517 spin_unlock(&mm->page_table_lock);
518
519 return ret;
520}
521
436int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 522int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
437 struct page **pages, struct vm_area_struct **vmas, 523 struct page **pages, struct vm_area_struct **vmas,
438 unsigned long *position, int *length, int i) 524 unsigned long *position, int *length, int i)
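
The hugetlb hunks above do two things: alloc_huge_page() becomes NUMA-aware (it takes the vma and address so dequeue_huge_page() can honour the memory policy; see the mempolicy.c hunks below), and MAP_PRIVATE hugetlb mappings gain real copy-on-write through hugetlb_cow(). A minimal user-space check of the COW behaviour follows; the /mnt/huge mount point, a pre-reserved huge page pool (vm.nr_hugepages), and the 2 MiB huge page size are assumptions of this sketch, not part of the patch:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define LEN (2UL * 1024 * 1024)        /* assume one 2 MiB huge page */

int main(void)
{
        int fd = open("/mnt/huge/cow-test", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0) { perror("open hugetlbfs file"); return 1; }
        /* MAP_PRIVATE: parent and child share the page read-only after
         * fork(); the first store faults through hugetlb_cow(). */
        p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        p[0] = 'A';
        if (fork() == 0) {
                p[0] = 'B';     /* child: breaks COW, gets a private copy */
                _exit(0);
        }
        wait(NULL);
        printf("parent sees '%c' (expect 'A')\n", p[0]);
        unlink("/mnt/huge/cow-test");
        return 0;
}

Before this change a private hugetlb mapping effectively shared its pages writably, so a child's store could be visible to the parent; with the patch the parent should keep seeing 'A'.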
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3d..17256bb2f4ef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12/* page_alloc.c */ 12static inline void set_page_refs(struct page *page, int order)
13extern void set_page_refs(struct page *page, int order); 13{
14#ifdef CONFIG_MMU
15 set_page_count(page, 1);
16#else
17 int i;
18
19 /*
20 * We need to reference all the pages for this order, otherwise if
21 * anyone accesses one of the pages with (get/put) it will be freed.
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27}
28
29extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order);
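
Moving set_page_refs() into mm/internal.h as an inline also documents the nommu quirk: every one of the 1 << order constituent pages gets its own reference, because helpers such as access_process_vm() may get/put an individual sub-page. A free-standing illustration of that loop, with an int array standing in for struct page reference counts (the array and indices are assumptions of the demo):

#include <stdio.h>

static int page_count[16];      /* stand-in for per-page _count fields */

/* nommu flavour of set_page_refs(): reference all 2^order sub-pages so
 * an isolated put on any one of them cannot free the whole block. */
static void set_page_refs(int first, int order)
{
        for (int i = 0; i < (1 << order); i++)
                page_count[first + i] = 1;
}

int main(void)
{
        set_page_refs(4, 2);    /* an order-2 block: pages 4..7 */
        for (int i = 0; i < 16; i++)
                printf("page %2d: count %d\n", i, page_count[i]);
        return 0;
}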
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a21..ae0ae3ea299a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
140 return 0; 140 return 0;
141} 141}
142 142
143/*
144 * Application wants to free up the pages and associated backing store.
145 * This is effectively punching a hole into the middle of a file.
146 *
147 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
148 * Other filesystems return -ENOSYS.
149 */
150static long madvise_remove(struct vm_area_struct *vma,
151 unsigned long start, unsigned long end)
152{
153 struct address_space *mapping;
154 loff_t offset, endoff;
155
156 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
157 return -EINVAL;
158
159 if (!vma->vm_file || !vma->vm_file->f_mapping
160 || !vma->vm_file->f_mapping->host) {
161 return -EINVAL;
162 }
163
164 mapping = vma->vm_file->f_mapping;
165
166 offset = (loff_t)(start - vma->vm_start)
167 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
168 endoff = (loff_t)(end - vma->vm_start - 1)
169 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
170 return vmtruncate_range(mapping->host, offset, endoff);
171}
172
143static long 173static long
144madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 174madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
145 unsigned long start, unsigned long end, int behavior) 175 unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
152 case MADV_RANDOM: 182 case MADV_RANDOM:
153 error = madvise_behavior(vma, prev, start, end, behavior); 183 error = madvise_behavior(vma, prev, start, end, behavior);
154 break; 184 break;
185 case MADV_REMOVE:
186 error = madvise_remove(vma, start, end);
187 break;
155 188
156 case MADV_WILLNEED: 189 case MADV_WILLNEED:
157 error = madvise_willneed(vma, prev, start, end); 190 error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
190 * some pages ahead. 223 * some pages ahead.
191 * MADV_DONTNEED - the application is finished with the given range, 224 * MADV_DONTNEED - the application is finished with the given range,
192 * so the kernel can free resources associated with it. 225 * so the kernel can free resources associated with it.
226 * MADV_REMOVE - the application wants to free up the given range of
227 * pages and associated backing store.
193 * 228 *
194 * return values: 229 * return values:
195 * zero - success 230 * zero - success
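
MADV_REMOVE is the user-visible piece of this series: it frees both the pages and the backing store for the range, i.e. punches a hole in the file, and per the comment only tmpfs/shmem supports it so far. A small sketch of the call; the /dev/shm tmpfs path and 4 KiB page size are assumptions, and MADV_REMOVE is defined by hand in case the installed headers predate it (9 is the x86 value; other architectures may differ):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#ifndef MADV_REMOVE
#define MADV_REMOVE 9           /* assumption: x86 value, for old headers */
#endif

#define PG 4096
#define SZ (16 * PG)

int main(void)
{
        int fd = open("/dev/shm/punch-test", O_CREAT | O_RDWR, 0600);
        struct stat st;
        char *p;

        if (fd < 0 || ftruncate(fd, SZ) < 0) { perror("setup"); return 1; }
        p = mmap(NULL, SZ, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        memset(p, 0xaa, SZ);                    /* instantiate all 16 pages */
        fstat(fd, &st);
        printf("blocks before: %ld\n", (long)st.st_blocks);

        /* Punch out pages 4..11: unlike MADV_DONTNEED, the backing
         * store is released and the range reads back as zeroes. */
        if (madvise(p + 4 * PG, 8 * PG, MADV_REMOVE) < 0)
                perror("madvise(MADV_REMOVE)");

        fstat(fd, &st);
        printf("blocks after:  %ld\n", (long)st.st_blocks);
        printf("hole byte: %#x (expect 0)\n", p[5 * PG] & 0xff);

        unlink("/dev/shm/punch-test");
        return 0;
}

st_blocks shrinking after the call is the observable difference from MADV_DONTNEED, which only drops the page cache copy.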
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a3656..7197f9bcd384 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
1498 update_mmu_cache(vma, address, entry); 1498 update_mmu_cache(vma, address, entry);
1499 lazy_mmu_prot_update(entry); 1499 lazy_mmu_prot_update(entry);
1500 lru_cache_add_active(new_page); 1500 lru_cache_add_active(new_page);
1501 page_add_anon_rmap(new_page, vma, address); 1501 page_add_new_anon_rmap(new_page, vma, address);
1502 1502
1503 /* Free the old page.. */ 1503 /* Free the old page.. */
1504 new_page = old_page; 1504 new_page = old_page;
@@ -1770,9 +1770,32 @@ out_big:
1770out_busy: 1770out_busy:
1771 return -ETXTBSY; 1771 return -ETXTBSY;
1772} 1772}
1773
1774EXPORT_SYMBOL(vmtruncate); 1773EXPORT_SYMBOL(vmtruncate);
1775 1774
1775int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1776{
1777 struct address_space *mapping = inode->i_mapping;
1778
1779 /*
1780 * If the underlying filesystem is not going to provide
1781 * a way to truncate a range of blocks (punch a hole) -
1782 * we should return failure right now.
1783 */
1784 if (!inode->i_op || !inode->i_op->truncate_range)
1785 return -ENOSYS;
1786
1787 down(&inode->i_sem);
1788 down_write(&inode->i_alloc_sem);
1789 unmap_mapping_range(mapping, offset, (end - offset), 1);
1790 truncate_inode_pages_range(mapping, offset, end);
1791 inode->i_op->truncate_range(inode, offset, end);
1792 up_write(&inode->i_alloc_sem);
1793 up(&inode->i_sem);
1794
1795 return 0;
1796}
1797EXPORT_SYMBOL(vmtruncate_range);
1798
1776/* 1799/*
1777 * Primitive swap readahead code. We simply read an aligned block of 1800 * Primitive swap readahead code. We simply read an aligned block of
1778 * (1 << page_cluster) entries in the swap area. This method is chosen 1801 * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1954 goto release; 1977 goto release;
1955 inc_mm_counter(mm, anon_rss); 1978 inc_mm_counter(mm, anon_rss);
1956 lru_cache_add_active(page); 1979 lru_cache_add_active(page);
1957 SetPageReferenced(page); 1980 page_add_new_anon_rmap(page, vma, address);
1958 page_add_anon_rmap(page, vma, address);
1959 } else { 1981 } else {
1960 /* Map the ZERO_PAGE - vm_page_prot is readonly */ 1982 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1961 page = ZERO_PAGE(address); 1983 page = ZERO_PAGE(address);
@@ -2086,7 +2108,7 @@ retry:
2086 if (anon) { 2108 if (anon) {
2087 inc_mm_counter(mm, anon_rss); 2109 inc_mm_counter(mm, anon_rss);
2088 lru_cache_add_active(new_page); 2110 lru_cache_add_active(new_page);
2089 page_add_anon_rmap(new_page, vma, address); 2111 page_add_new_anon_rmap(new_page, vma, address);
2090 } else { 2112 } else {
2091 inc_mm_counter(mm, file_rss); 2113 inc_mm_counter(mm, file_rss);
2092 page_add_file_rmap(new_page); 2114 page_add_file_rmap(new_page);
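
vmtruncate_range() is the glue between madvise_remove() and the filesystem's truncate_range op, and the offset arithmetic in madvise_remove() above is easy to get wrong: the hole's file offsets are the vma-relative byte offsets plus vm_pgoff shifted up to bytes, with endoff made inclusive by the -1. A worked example with made-up numbers (all values are assumptions for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long vm_start = 0x60000000; /* map base (assumed) */
        unsigned long vm_pgoff = 3;          /* mapping starts at file page 3 */
        unsigned long start    = 0x60004000; /* hole start */
        unsigned long end      = 0x60008000; /* hole end, exclusive */

        long long offset = (long long)(start - vm_start)
                         + ((long long)vm_pgoff << PAGE_SHIFT);
        long long endoff = (long long)(end - vm_start - 1)
                         + ((long long)vm_pgoff << PAGE_SHIFT);

        /* The mapping begins at file byte 0x3000, so the punched range
         * is file bytes 0x7000..0xafff inclusive. */
        printf("truncate_range(%#llx, %#llx)\n", offset, endoff);
        return 0;
}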
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8af8a8..a918f77f02f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages); 42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{ 44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION; 45 int nr_pages = PAGES_PER_SECTION;
47 int ret; 46 int ret;
48 47
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9a..0f1d2b8a952b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,7 +93,7 @@ static kmem_cache_t *sn_cache;
93 93
94/* Highest zone. An specific allocation for a zone below that is not 94/* Highest zone. An specific allocation for a zone below that is not
95 policied. */ 95 policied. */
96static int policy_zone; 96int policy_zone = ZONE_DMA;
97 97
98struct mempolicy default_policy = { 98struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */ 99 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +131,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 if (!zl) 131 if (!zl)
132 return NULL; 132 return NULL;
133 num = 0; 133 num = 0;
134 for_each_node_mask(nd, *nodes) { 134 for_each_node_mask(nd, *nodes)
135 int k; 135 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL; 136 zl->zones[num] = NULL;
146 return zl; 137 return zl;
147} 138}
@@ -785,6 +776,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
785 return nid; 776 return nid;
786} 777}
787 778
779/* Determine a node number for interleave */
780static inline unsigned interleave_nid(struct mempolicy *pol,
781 struct vm_area_struct *vma, unsigned long addr, int shift)
782{
783 if (vma) {
784 unsigned long off;
785
786 off = vma->vm_pgoff;
787 off += (addr - vma->vm_start) >> shift;
788 return offset_il_node(pol, vma, off);
789 } else
790 return interleave_nodes(pol);
791}
792
793/* Return a zonelist suitable for a huge page allocation. */
794struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
795{
796 struct mempolicy *pol = get_vma_policy(current, vma, addr);
797
798 if (pol->policy == MPOL_INTERLEAVE) {
799 unsigned nid;
800
801 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
802 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
803 }
804 return zonelist_policy(GFP_HIGHUSER, pol);
805}
806
788/* Allocate a page in interleaved policy. 807/* Allocate a page in interleaved policy.
789 Own path because it needs to do special accounting. */ 808 Own path because it needs to do special accounting. */
790static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 809static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +852,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
833 852
834 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 853 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835 unsigned nid; 854 unsigned nid;
836 if (vma) { 855
837 unsigned long off; 856 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
838 off = vma->vm_pgoff;
839 off += (addr - vma->vm_start) >> PAGE_SHIFT;
840 nid = offset_il_node(pol, vma, off);
841 } else {
842 /* fall back to process interleaving */
843 nid = interleave_nodes(pol);
844 }
845 return alloc_page_interleave(gfp, 0, nid); 857 return alloc_page_interleave(gfp, 0, nid);
846 } 858 }
847 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 859 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -940,54 +952,6 @@ void __mpol_free(struct mempolicy *p)
940} 952}
941 953
942/* 954/*
943 * Hugetlb policy. Same as above, just works with node numbers instead of
944 * zonelists.
945 */
946
947/* Find first node suitable for an allocation */
948int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
949{
950 struct mempolicy *pol = get_vma_policy(current, vma, addr);
951
952 switch (pol->policy) {
953 case MPOL_DEFAULT:
954 return numa_node_id();
955 case MPOL_BIND:
956 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
957 case MPOL_INTERLEAVE:
958 return interleave_nodes(pol);
959 case MPOL_PREFERRED:
960 return pol->v.preferred_node >= 0 ?
961 pol->v.preferred_node : numa_node_id();
962 }
963 BUG();
964 return 0;
965}
966
967/* Find secondary valid nodes for an allocation */
968int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
969{
970 struct mempolicy *pol = get_vma_policy(current, vma, addr);
971
972 switch (pol->policy) {
973 case MPOL_PREFERRED:
974 case MPOL_DEFAULT:
975 case MPOL_INTERLEAVE:
976 return 1;
977 case MPOL_BIND: {
978 struct zone **z;
979 for (z = pol->v.zonelist->zones; *z; z++)
980 if ((*z)->zone_pgdat->node_id == nid)
981 return 1;
982 return 0;
983 }
984 default:
985 BUG();
986 return 0;
987 }
988}
989
990/*
991 * Shared memory backing store policy support. 955 * Shared memory backing store policy support.
992 * 956 *
993 * Remember policies even when nobody has shared memory mapped. 957 * Remember policies even when nobody has shared memory mapped.
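
huge_zonelist() replaces the removed mpol_first_node()/mpol_node_valid() pair: for MPOL_INTERLEAVE the node is derived from the mapping offset at huge-page granularity, so repeated faults on the same address land on the same node, and the hugetlb allocator simply walks the returned zonelist. With every node allowed, offset_il_node() reduces to a modulo; a simplified stand-alone version (HPAGE_SHIFT and the node count are assumptions, and the real code walks a nodemask rather than assuming nodes 0..N-1):

#include <stdio.h>

#define HPAGE_SHIFT 21          /* assume 2 MiB huge pages */

/* Simplified interleave_nid(): offset within the mapping, in huge
 * pages, plus the file offset, modulo the number of allowed nodes. */
static unsigned interleave_nid(unsigned long vm_pgoff, unsigned long vm_start,
                               unsigned long addr, unsigned nr_nodes)
{
        unsigned long off = vm_pgoff + ((addr - vm_start) >> HPAGE_SHIFT);
        return off % nr_nodes;
}

int main(void)
{
        unsigned long vm_start = 0x40000000;
        unsigned long a;

        for (a = vm_start; a < vm_start + (4UL << HPAGE_SHIFT);
             a += 1UL << HPAGE_SHIFT)
                printf("addr %#lx -> node %u\n", a,
                       interleave_nid(0, vm_start, a, 4));
        return 0;
}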
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876b..c10262d68232 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
1177{ 1177{
1178 return 0; 1178 return 0;
1179} 1179}
1180
1181struct page *filemap_nopage(struct vm_area_struct *area,
1182 unsigned long address, int *type)
1183{
1184 BUG();
1185 return NULL;
1186}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc2..fd47494cb989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -53,6 +54,8 @@ unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
55 56
57static void fastcall free_hot_cold_page(struct page *page, int cold);
58
56/* 59/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 60 * results with 256, 32 in the lowmem_reserve sysctl:
58 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 61 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -81,6 +84,7 @@ int min_free_kbytes = 1024;
81unsigned long __initdata nr_kernel_pages; 84unsigned long __initdata nr_kernel_pages;
82unsigned long __initdata nr_all_pages; 85unsigned long __initdata nr_all_pages;
83 86
87#ifdef CONFIG_DEBUG_VM
84static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 88static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
85{ 89{
86 int ret = 0; 90 int ret = 0;
@@ -122,16 +126,23 @@ static int bad_range(struct zone *zone, struct page *page)
122 return 0; 126 return 0;
123} 127}
124 128
125static void bad_page(const char *function, struct page *page) 129#else
130static inline int bad_range(struct zone *zone, struct page *page)
131{
132 return 0;
133}
134#endif
135
136static void bad_page(struct page *page)
126{ 137{
127 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 138 printk(KERN_EMERG "Bad page state in process '%s'\n"
128 function, current->comm, page); 139 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
129 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 140 "Trying to fix it up, but a reboot is needed\n"
130 (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, 141 "Backtrace:\n",
131 page->mapping, page_mapcount(page), page_count(page)); 142 current->comm, page, (int)(2*sizeof(unsigned long)),
132 printk(KERN_EMERG "Backtrace:\n"); 143 (unsigned long)page->flags, page->mapping,
144 page_mapcount(page), page_count(page));
133 dump_stack(); 145 dump_stack();
134 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
135 page->flags &= ~(1 << PG_lru | 146 page->flags &= ~(1 << PG_lru |
136 1 << PG_private | 147 1 << PG_private |
137 1 << PG_locked | 148 1 << PG_locked |
@@ -184,19 +195,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
184 int i; 195 int i;
185 int nr_pages = 1 << order; 196 int nr_pages = 1 << order;
186 197
187 if (!PageCompound(page)) 198 if (unlikely(page[1].index != order))
188 return; 199 bad_page(page);
189
190 if (page[1].index != order)
191 bad_page(__FUNCTION__, page);
192 200
193 for (i = 0; i < nr_pages; i++) { 201 for (i = 0; i < nr_pages; i++) {
194 struct page *p = page + i; 202 struct page *p = page + i;
195 203
196 if (!PageCompound(p)) 204 if (unlikely(!PageCompound(p) |
197 bad_page(__FUNCTION__, page); 205 (page_private(p) != (unsigned long)page)))
198 if (page_private(p) != (unsigned long)page) 206 bad_page(page);
199 bad_page(__FUNCTION__, page);
200 ClearPageCompound(p); 207 ClearPageCompound(p);
201 } 208 }
202} 209}
@@ -255,14 +262,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
255/* 262/*
256 * This function checks whether a page is free && is the buddy 263 * This function checks whether a page is free && is the buddy
257 * we can do coalesce a page and its buddy if 264 * we can do coalesce a page and its buddy if
258 * (a) the buddy is free && 265 * (a) the buddy is not in a hole &&
259 * (b) the buddy is on the buddy system && 266 * (b) the buddy is free &&
260 * (c) a page and its buddy have the same order. 267 * (c) the buddy is on the buddy system &&
268 * (d) a page and its buddy have the same order.
261 * for recording page's order, we use page_private(page) and PG_private. 269 * for recording page's order, we use page_private(page) and PG_private.
262 * 270 *
263 */ 271 */
264static inline int page_is_buddy(struct page *page, int order) 272static inline int page_is_buddy(struct page *page, int order)
265{ 273{
274#ifdef CONFIG_HOLES_IN_ZONE
275 if (!pfn_valid(page_to_pfn(page)))
276 return 0;
277#endif
278
266 if (PagePrivate(page) && 279 if (PagePrivate(page) &&
267 (page_order(page) == order) && 280 (page_order(page) == order) &&
268 page_count(page) == 0) 281 page_count(page) == 0)
@@ -300,7 +313,7 @@ static inline void __free_pages_bulk (struct page *page,
300 unsigned long page_idx; 313 unsigned long page_idx;
301 int order_size = 1 << order; 314 int order_size = 1 << order;
302 315
303 if (unlikely(order)) 316 if (unlikely(PageCompound(page)))
304 destroy_compound_page(page, order); 317 destroy_compound_page(page, order);
305 318
306 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 319 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +327,15 @@ static inline void __free_pages_bulk (struct page *page,
314 struct free_area *area; 327 struct free_area *area;
315 struct page *buddy; 328 struct page *buddy;
316 329
317 combined_idx = __find_combined_index(page_idx, order);
318 buddy = __page_find_buddy(page, page_idx, order); 330 buddy = __page_find_buddy(page, page_idx, order);
319
320 if (bad_range(zone, buddy))
321 break;
322 if (!page_is_buddy(buddy, order)) 331 if (!page_is_buddy(buddy, order))
323 break; /* Move the buddy up one level. */ 332 break; /* Move the buddy up one level. */
333
324 list_del(&buddy->lru); 334 list_del(&buddy->lru);
325 area = zone->free_area + order; 335 area = zone->free_area + order;
326 area->nr_free--; 336 area->nr_free--;
327 rmv_page_order(buddy); 337 rmv_page_order(buddy);
338 combined_idx = __find_combined_index(page_idx, order);
328 page = page + (combined_idx - page_idx); 339 page = page + (combined_idx - page_idx);
329 page_idx = combined_idx; 340 page_idx = combined_idx;
330 order++; 341 order++;
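
The reordering above (compute combined_idx only after the buddy passes page_is_buddy()) works because the whole buddy scheme is index arithmetic: a 2^order block's buddy differs from it only in bit 'order' of the page index, and the merged block starts at the index with that bit cleared. The two helpers in isolation (the starting index is an arbitrary assumption):

#include <stdio.h>

/* Same arithmetic as __page_find_buddy() / __find_combined_index(). */
static unsigned long buddy_idx(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

static unsigned long combined_idx(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        unsigned long idx = 12; /* assume a free order-2 block at index 12 */

        for (unsigned int order = 2; order < 5; order++) {
                printf("order %u: block %lu, buddy %lu, merged block %lu\n",
                       order, idx, buddy_idx(idx, order),
                       combined_idx(idx, order));
                idx = combined_idx(idx, order);
        }
        return 0;
}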
@@ -334,11 +345,11 @@ static inline void __free_pages_bulk (struct page *page,
334 zone->free_area[order].nr_free++; 345 zone->free_area[order].nr_free++;
335} 346}
336 347
337static inline int free_pages_check(const char *function, struct page *page) 348static inline int free_pages_check(struct page *page)
338{ 349{
339 if ( page_mapcount(page) || 350 if (unlikely(page_mapcount(page) |
340 page->mapping != NULL || 351 (page->mapping != NULL) |
341 page_count(page) != 0 || 352 (page_count(page) != 0) |
342 (page->flags & ( 353 (page->flags & (
343 1 << PG_lru | 354 1 << PG_lru |
344 1 << PG_private | 355 1 << PG_private |
@@ -348,8 +359,8 @@ static inline int free_pages_check(const char *function, struct page *page)
348 1 << PG_slab | 359 1 << PG_slab |
349 1 << PG_swapcache | 360 1 << PG_swapcache |
350 1 << PG_writeback | 361 1 << PG_writeback |
351 1 << PG_reserved ))) 362 1 << PG_reserved ))))
352 bad_page(function, page); 363 bad_page(page);
353 if (PageDirty(page)) 364 if (PageDirty(page))
354 __ClearPageDirty(page); 365 __ClearPageDirty(page);
355 /* 366 /*
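
free_pages_check() here, and prep_new_page() below, also switch from || to a single bitwise |: each operand is already a 0/1 comparison or a small integer, so the compiler can evaluate them all branch-free and branch once on the combined value, which pairs naturally with the added unlikely(). The trick in isolation (the struct layout and flag mask are assumptions of the sketch):

#include <stdio.h>

struct pg { int mapcount; void *mapping; int count; unsigned long flags; };

#define BAD_FLAGS 0x3fUL        /* stand-in for the 1 << PG_* mask */

/* One fused test as in free_pages_check(): bitwise OR of the individual
 * conditions, then a single branch on the result. */
static int page_looks_bad(struct pg *p)
{
        return p->mapcount | (p->mapping != NULL) |
               (p->count != 0) | ((p->flags & BAD_FLAGS) != 0);
}

int main(void)
{
        struct pg ok  = { 0, NULL, 0, 0 };
        struct pg bad = { 0, NULL, 1, 0 };

        printf("ok=%d bad=%d\n", page_looks_bad(&ok), page_looks_bad(&bad));
        return 0;
}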
@@ -375,11 +386,10 @@ static int
375free_pages_bulk(struct zone *zone, int count, 386free_pages_bulk(struct zone *zone, int count,
376 struct list_head *list, unsigned int order) 387 struct list_head *list, unsigned int order)
377{ 388{
378 unsigned long flags;
379 struct page *page = NULL; 389 struct page *page = NULL;
380 int ret = 0; 390 int ret = 0;
381 391
382 spin_lock_irqsave(&zone->lock, flags); 392 spin_lock(&zone->lock);
383 zone->all_unreclaimable = 0; 393 zone->all_unreclaimable = 0;
384 zone->pages_scanned = 0; 394 zone->pages_scanned = 0;
385 while (!list_empty(list) && count--) { 395 while (!list_empty(list) && count--) {
@@ -389,12 +399,13 @@ free_pages_bulk(struct zone *zone, int count,
389 __free_pages_bulk(page, zone, order); 399 __free_pages_bulk(page, zone, order);
390 ret++; 400 ret++;
391 } 401 }
392 spin_unlock_irqrestore(&zone->lock, flags); 402 spin_unlock(&zone->lock);
393 return ret; 403 return ret;
394} 404}
395 405
396void __free_pages_ok(struct page *page, unsigned int order) 406void __free_pages_ok(struct page *page, unsigned int order)
397{ 407{
408 unsigned long flags;
398 LIST_HEAD(list); 409 LIST_HEAD(list);
399 int i; 410 int i;
400 int reserved = 0; 411 int reserved = 0;
@@ -408,14 +419,49 @@ void __free_pages_ok(struct page *page, unsigned int order)
408#endif 419#endif
409 420
410 for (i = 0 ; i < (1 << order) ; ++i) 421 for (i = 0 ; i < (1 << order) ; ++i)
411 reserved += free_pages_check(__FUNCTION__, page + i); 422 reserved += free_pages_check(page + i);
412 if (reserved) 423 if (reserved)
413 return; 424 return;
414 425
415 list_add(&page->lru, &list); 426 list_add(&page->lru, &list);
416 mod_page_state(pgfree, 1 << order);
417 kernel_map_pages(page, 1<<order, 0); 427 kernel_map_pages(page, 1<<order, 0);
428 local_irq_save(flags);
429 __mod_page_state(pgfree, 1 << order);
418 free_pages_bulk(page_zone(page), 1, &list, order); 430 free_pages_bulk(page_zone(page), 1, &list, order);
431 local_irq_restore(flags);
432}
433
434/*
435 * permit the bootmem allocator to evade page validation on high-order frees
436 */
437void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
438{
439 if (order == 0) {
440 __ClearPageReserved(page);
441 set_page_count(page, 0);
442
443 free_hot_cold_page(page, 0);
444 } else {
445 LIST_HEAD(list);
446 int loop;
447
448 for (loop = 0; loop < BITS_PER_LONG; loop++) {
449 struct page *p = &page[loop];
450
451 if (loop + 16 < BITS_PER_LONG)
452 prefetchw(p + 16);
453 __ClearPageReserved(p);
454 set_page_count(p, 0);
455 }
456
457 arch_free_page(page, order);
458
459 mod_page_state(pgfree, 1 << order);
460
461 list_add(&page->lru, &list);
462 kernel_map_pages(page, 1 << order, 0);
463 free_pages_bulk(page_zone(page), 1, &list, order);
464 }
419} 465}
420 466
421 467
@@ -433,8 +479,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
433 * 479 *
434 * -- wli 480 * -- wli
435 */ 481 */
436static inline struct page * 482static inline void expand(struct zone *zone, struct page *page,
437expand(struct zone *zone, struct page *page,
438 int low, int high, struct free_area *area) 483 int low, int high, struct free_area *area)
439{ 484{
440 unsigned long size = 1 << high; 485 unsigned long size = 1 << high;
@@ -448,24 +493,6 @@ expand(struct zone *zone, struct page *page,
448 area->nr_free++; 493 area->nr_free++;
449 set_page_order(&page[size], high); 494 set_page_order(&page[size], high);
450 } 495 }
451 return page;
452}
453
454void set_page_refs(struct page *page, int order)
455{
456#ifdef CONFIG_MMU
457 set_page_count(page, 1);
458#else
459 int i;
460
461 /*
462 * We need to reference all the pages for this order, otherwise if
463 * anyone accesses one of the pages with (get/put) it will be freed.
464 * - eg: access_process_vm()
465 */
466 for (i = 0; i < (1 << order); i++)
467 set_page_count(page + i, 1);
468#endif /* CONFIG_MMU */
469} 496}
470 497
471/* 498/*
@@ -473,9 +500,9 @@ void set_page_refs(struct page *page, int order)
473 */ 500 */
474static int prep_new_page(struct page *page, int order) 501static int prep_new_page(struct page *page, int order)
475{ 502{
476 if ( page_mapcount(page) || 503 if (unlikely(page_mapcount(page) |
477 page->mapping != NULL || 504 (page->mapping != NULL) |
478 page_count(page) != 0 || 505 (page_count(page) != 0) |
479 (page->flags & ( 506 (page->flags & (
480 1 << PG_lru | 507 1 << PG_lru |
481 1 << PG_private | 508 1 << PG_private |
@@ -486,8 +513,8 @@ static int prep_new_page(struct page *page, int order)
486 1 << PG_slab | 513 1 << PG_slab |
487 1 << PG_swapcache | 514 1 << PG_swapcache |
488 1 << PG_writeback | 515 1 << PG_writeback |
489 1 << PG_reserved ))) 516 1 << PG_reserved ))))
490 bad_page(__FUNCTION__, page); 517 bad_page(page);
491 518
492 /* 519 /*
493 * For now, we report if PG_reserved was found set, but do not 520 * For now, we report if PG_reserved was found set, but do not
@@ -525,7 +552,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525 rmv_page_order(page); 552 rmv_page_order(page);
526 area->nr_free--; 553 area->nr_free--;
527 zone->free_pages -= 1UL << order; 554 zone->free_pages -= 1UL << order;
528 return expand(zone, page, order, current_order, area); 555 expand(zone, page, order, current_order, area);
556 return page;
529 } 557 }
530 558
531 return NULL; 559 return NULL;
@@ -539,21 +567,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
539static int rmqueue_bulk(struct zone *zone, unsigned int order, 567static int rmqueue_bulk(struct zone *zone, unsigned int order,
540 unsigned long count, struct list_head *list) 568 unsigned long count, struct list_head *list)
541{ 569{
542 unsigned long flags;
543 int i; 570 int i;
544 int allocated = 0;
545 struct page *page;
546 571
547 spin_lock_irqsave(&zone->lock, flags); 572 spin_lock(&zone->lock);
548 for (i = 0; i < count; ++i) { 573 for (i = 0; i < count; ++i) {
549 page = __rmqueue(zone, order); 574 struct page *page = __rmqueue(zone, order);
550 if (page == NULL) 575 if (unlikely(page == NULL))
551 break; 576 break;
552 allocated++;
553 list_add_tail(&page->lru, list); 577 list_add_tail(&page->lru, list);
554 } 578 }
555 spin_unlock_irqrestore(&zone->lock, flags); 579 spin_unlock(&zone->lock);
556 return allocated; 580 return i;
557} 581}
558 582
559#ifdef CONFIG_NUMA 583#ifdef CONFIG_NUMA
@@ -589,6 +613,7 @@ void drain_remote_pages(void)
589#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 613#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
590static void __drain_pages(unsigned int cpu) 614static void __drain_pages(unsigned int cpu)
591{ 615{
616 unsigned long flags;
592 struct zone *zone; 617 struct zone *zone;
593 int i; 618 int i;
594 619
@@ -600,8 +625,10 @@ static void __drain_pages(unsigned int cpu)
600 struct per_cpu_pages *pcp; 625 struct per_cpu_pages *pcp;
601 626
602 pcp = &pset->pcp[i]; 627 pcp = &pset->pcp[i];
628 local_irq_save(flags);
603 pcp->count -= free_pages_bulk(zone, pcp->count, 629 pcp->count -= free_pages_bulk(zone, pcp->count,
604 &pcp->list, 0); 630 &pcp->list, 0);
631 local_irq_restore(flags);
605 } 632 }
606 } 633 }
607} 634}
@@ -647,18 +674,14 @@ void drain_local_pages(void)
647} 674}
648#endif /* CONFIG_PM */ 675#endif /* CONFIG_PM */
649 676
650static void zone_statistics(struct zonelist *zonelist, struct zone *z) 677static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
651{ 678{
652#ifdef CONFIG_NUMA 679#ifdef CONFIG_NUMA
653 unsigned long flags;
654 int cpu;
655 pg_data_t *pg = z->zone_pgdat; 680 pg_data_t *pg = z->zone_pgdat;
656 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 681 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
657 struct per_cpu_pageset *p; 682 struct per_cpu_pageset *p;
658 683
659 local_irq_save(flags); 684 p = zone_pcp(z, cpu);
660 cpu = smp_processor_id();
661 p = zone_pcp(z,cpu);
662 if (pg == orig) { 685 if (pg == orig) {
663 p->numa_hit++; 686 p->numa_hit++;
664 } else { 687 } else {
@@ -669,14 +692,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
669 p->local_node++; 692 p->local_node++;
670 else 693 else
671 p->other_node++; 694 p->other_node++;
672 local_irq_restore(flags);
673#endif 695#endif
674} 696}
675 697
676/* 698/*
677 * Free a 0-order page 699 * Free a 0-order page
678 */ 700 */
679static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
680static void fastcall free_hot_cold_page(struct page *page, int cold) 701static void fastcall free_hot_cold_page(struct page *page, int cold)
681{ 702{
682 struct zone *zone = page_zone(page); 703 struct zone *zone = page_zone(page);
@@ -687,14 +708,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
687 708
688 if (PageAnon(page)) 709 if (PageAnon(page))
689 page->mapping = NULL; 710 page->mapping = NULL;
690 if (free_pages_check(__FUNCTION__, page)) 711 if (free_pages_check(page))
691 return; 712 return;
692 713
693 inc_page_state(pgfree);
694 kernel_map_pages(page, 1, 0); 714 kernel_map_pages(page, 1, 0);
695 715
696 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
697 local_irq_save(flags); 717 local_irq_save(flags);
718 __inc_page_state(pgfree);
698 list_add(&page->lru, &pcp->list); 719 list_add(&page->lru, &pcp->list);
699 pcp->count++; 720 pcp->count++;
700 if (pcp->count >= pcp->high) 721 if (pcp->count >= pcp->high)
@@ -727,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
727 * we cheat by calling it from here, in the order > 0 path. Saves a branch 748 * we cheat by calling it from here, in the order > 0 path. Saves a branch
728 * or two. 749 * or two.
729 */ 750 */
730static struct page * 751static struct page *buffered_rmqueue(struct zonelist *zonelist,
731buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 752 struct zone *zone, int order, gfp_t gfp_flags)
732{ 753{
733 unsigned long flags; 754 unsigned long flags;
734 struct page *page; 755 struct page *page;
735 int cold = !!(gfp_flags & __GFP_COLD); 756 int cold = !!(gfp_flags & __GFP_COLD);
757 int cpu;
736 758
737again: 759again:
760 cpu = get_cpu();
738 if (order == 0) { 761 if (order == 0) {
739 struct per_cpu_pages *pcp; 762 struct per_cpu_pages *pcp;
740 763
741 page = NULL; 764 pcp = &zone_pcp(zone, cpu)->pcp[cold];
742 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
743 local_irq_save(flags); 765 local_irq_save(flags);
744 if (pcp->count <= pcp->low) 766 if (!pcp->count) {
745 pcp->count += rmqueue_bulk(zone, 0, 767 pcp->count += rmqueue_bulk(zone, 0,
746 pcp->batch, &pcp->list); 768 pcp->batch, &pcp->list);
747 if (pcp->count) { 769 if (unlikely(!pcp->count))
748 page = list_entry(pcp->list.next, struct page, lru); 770 goto failed;
749 list_del(&page->lru);
750 pcp->count--;
751 } 771 }
752 local_irq_restore(flags); 772 page = list_entry(pcp->list.next, struct page, lru);
753 put_cpu(); 773 list_del(&page->lru);
774 pcp->count--;
754 } else { 775 } else {
755 spin_lock_irqsave(&zone->lock, flags); 776 spin_lock_irqsave(&zone->lock, flags);
756 page = __rmqueue(zone, order); 777 page = __rmqueue(zone, order);
757 spin_unlock_irqrestore(&zone->lock, flags); 778 spin_unlock(&zone->lock);
779 if (!page)
780 goto failed;
758 } 781 }
759 782
760 if (page != NULL) { 783 __mod_page_state_zone(zone, pgalloc, 1 << order);
761 BUG_ON(bad_range(zone, page)); 784 zone_statistics(zonelist, zone, cpu);
762 mod_page_state_zone(zone, pgalloc, 1 << order); 785 local_irq_restore(flags);
763 if (prep_new_page(page, order)) 786 put_cpu();
764 goto again; 787
788 BUG_ON(bad_range(zone, page));
789 if (prep_new_page(page, order))
790 goto again;
765 791
766 if (gfp_flags & __GFP_ZERO) 792 if (gfp_flags & __GFP_ZERO)
767 prep_zero_page(page, order, gfp_flags); 793 prep_zero_page(page, order, gfp_flags);
768 794
769 if (order && (gfp_flags & __GFP_COMP)) 795 if (order && (gfp_flags & __GFP_COMP))
770 prep_compound_page(page, order); 796 prep_compound_page(page, order);
771 }
772 return page; 797 return page;
798
799failed:
800 local_irq_restore(flags);
801 put_cpu();
802 return NULL;
773} 803}
774 804
775#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 805#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
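
buffered_rmqueue() now refills the per-cpu free list only when it is empty: the pcp->low watermark is gone (see the setup_pageset() and /proc output hunks below), and a failed bulk refill bails out through the common 'failed' path with interrupts restored. The refill policy in miniature, with the buddy allocator stubbed out (the batch size and allocation count are assumptions):

#include <stdio.h>

#define BATCH 8

static int pcp_count;                  /* pages in the per-cpu list */
static int refills;

static int rmqueue_bulk(int n)         /* stub for the buddy lists */
{
        refills++;
        return n;
}

/* New policy: refill only on empty (old code refilled at count <= low). */
static void alloc_one(void)
{
        if (!pcp_count)
                pcp_count += rmqueue_bulk(BATCH);
        pcp_count--;
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                alloc_one();
        /* ceil(100 / 8) = 13 trips to the buddy allocator, 4 left over */
        printf("refills=%d leftover=%d\n", refills, pcp_count);
        return 0;
}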
@@ -845,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
845 continue; 875 continue;
846 } 876 }
847 877
848 page = buffered_rmqueue(*z, order, gfp_mask); 878 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
849 if (page) { 879 if (page) {
850 zone_statistics(zonelist, *z);
851 break; 880 break;
852 } 881 }
853 } while (*(++z) != NULL); 882 } while (*(++z) != NULL);
@@ -903,8 +932,7 @@ restart:
903 alloc_flags |= ALLOC_HARDER; 932 alloc_flags |= ALLOC_HARDER;
904 if (gfp_mask & __GFP_HIGH) 933 if (gfp_mask & __GFP_HIGH)
905 alloc_flags |= ALLOC_HIGH; 934 alloc_flags |= ALLOC_HIGH;
906 if (wait) 935 alloc_flags |= ALLOC_CPUSET;
907 alloc_flags |= ALLOC_CPUSET;
908 936
909 /* 937 /*
910 * Go through the zonelist again. Let __GFP_HIGH and allocations 938 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +954,7 @@ restart:
926nofail_alloc: 954nofail_alloc:
927 /* go through the zonelist yet again, ignoring mins */ 955 /* go through the zonelist yet again, ignoring mins */
928 page = get_page_from_freelist(gfp_mask, order, 956 page = get_page_from_freelist(gfp_mask, order,
929 zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); 957 zonelist, ALLOC_NO_WATERMARKS);
930 if (page) 958 if (page)
931 goto got_pg; 959 goto got_pg;
932 if (gfp_mask & __GFP_NOFAIL) { 960 if (gfp_mask & __GFP_NOFAIL) {
@@ -1171,12 +1199,11 @@ EXPORT_SYMBOL(nr_pagecache);
1171DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1199DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1172#endif 1200#endif
1173 1201
1174void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1202static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1175{ 1203{
1176 int cpu = 0; 1204 int cpu = 0;
1177 1205
1178 memset(ret, 0, sizeof(*ret)); 1206 memset(ret, 0, sizeof(*ret));
1179 cpus_and(*cpumask, *cpumask, cpu_online_map);
1180 1207
1181 cpu = first_cpu(*cpumask); 1208 cpu = first_cpu(*cpumask);
1182 while (cpu < NR_CPUS) { 1209 while (cpu < NR_CPUS) {
@@ -1224,12 +1251,12 @@ void get_full_page_state(struct page_state *ret)
1224 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1251 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1225} 1252}
1226 1253
1227unsigned long __read_page_state(unsigned long offset) 1254unsigned long read_page_state_offset(unsigned long offset)
1228{ 1255{
1229 unsigned long ret = 0; 1256 unsigned long ret = 0;
1230 int cpu; 1257 int cpu;
1231 1258
1232 for_each_online_cpu(cpu) { 1259 for_each_cpu(cpu) {
1233 unsigned long in; 1260 unsigned long in;
1234 1261
1235 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1262 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
@@ -1238,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset)
1238 return ret; 1265 return ret;
1239} 1266}
1240 1267
1241void __mod_page_state(unsigned long offset, unsigned long delta) 1268void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1269{
1270 void *ptr;
1271
1272 ptr = &__get_cpu_var(page_states);
1273 *(unsigned long *)(ptr + offset) += delta;
1274}
1275EXPORT_SYMBOL(__mod_page_state_offset);
1276
1277void mod_page_state_offset(unsigned long offset, unsigned long delta)
1242{ 1278{
1243 unsigned long flags; 1279 unsigned long flags;
1244 void* ptr; 1280 void *ptr;
1245 1281
1246 local_irq_save(flags); 1282 local_irq_save(flags);
1247 ptr = &__get_cpu_var(page_states); 1283 ptr = &__get_cpu_var(page_states);
1248 *(unsigned long*)(ptr + offset) += delta; 1284 *(unsigned long *)(ptr + offset) += delta;
1249 local_irq_restore(flags); 1285 local_irq_restore(flags);
1250} 1286}
1251 1287EXPORT_SYMBOL(mod_page_state_offset);
1252EXPORT_SYMBOL(__mod_page_state);
1253 1288
1254void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1289void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1255 unsigned long *free, struct pglist_data *pgdat) 1290 unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1370,7 @@ void show_free_areas(void)
1335 show_node(zone); 1370 show_node(zone);
1336 printk("%s per-cpu:", zone->name); 1371 printk("%s per-cpu:", zone->name);
1337 1372
1338 if (!zone->present_pages) { 1373 if (!populated_zone(zone)) {
1339 printk(" empty\n"); 1374 printk(" empty\n");
1340 continue; 1375 continue;
1341 } else 1376 } else
@@ -1347,10 +1382,9 @@ void show_free_areas(void)
1347 pageset = zone_pcp(zone, cpu); 1382 pageset = zone_pcp(zone, cpu);
1348 1383
1349 for (temperature = 0; temperature < 2; temperature++) 1384 for (temperature = 0; temperature < 2; temperature++)
1350 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1385 printk("cpu %d %s: high %d, batch %d used:%d\n",
1351 cpu, 1386 cpu,
1352 temperature ? "cold" : "hot", 1387 temperature ? "cold" : "hot",
1353 pageset->pcp[temperature].low,
1354 pageset->pcp[temperature].high, 1388 pageset->pcp[temperature].high,
1355 pageset->pcp[temperature].batch, 1389 pageset->pcp[temperature].batch,
1356 pageset->pcp[temperature].count); 1390 pageset->pcp[temperature].count);
@@ -1413,7 +1447,7 @@ void show_free_areas(void)
1413 1447
1414 show_node(zone); 1448 show_node(zone);
1415 printk("%s: ", zone->name); 1449 printk("%s: ", zone->name);
1416 if (!zone->present_pages) { 1450 if (!populated_zone(zone)) {
1417 printk("empty\n"); 1451 printk("empty\n");
1418 continue; 1452 continue;
1419 } 1453 }
@@ -1433,36 +1467,29 @@ void show_free_areas(void)
1433 1467
1434/* 1468/*
1435 * Builds allocation fallback zone lists. 1469 * Builds allocation fallback zone lists.
1470 *
1471 * Add all populated zones of a node to the zonelist.
1436 */ 1472 */
1437static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1473static int __init build_zonelists_node(pg_data_t *pgdat,
1438{ 1474 struct zonelist *zonelist, int nr_zones, int zone_type)
1439 switch (k) { 1475{
1440 struct zone *zone; 1476 struct zone *zone;
1441 default: 1477
1442 BUG(); 1478 BUG_ON(zone_type > ZONE_HIGHMEM);
1443 case ZONE_HIGHMEM: 1479
1444 zone = pgdat->node_zones + ZONE_HIGHMEM; 1480 do {
1445 if (zone->present_pages) { 1481 zone = pgdat->node_zones + zone_type;
1482 if (populated_zone(zone)) {
1446#ifndef CONFIG_HIGHMEM 1483#ifndef CONFIG_HIGHMEM
1447 BUG(); 1484 BUG_ON(zone_type > ZONE_NORMAL);
1448#endif 1485#endif
1449 zonelist->zones[j++] = zone; 1486 zonelist->zones[nr_zones++] = zone;
1487 check_highest_zone(zone_type);
1450 } 1488 }
1451 case ZONE_NORMAL: 1489 zone_type--;
1452 zone = pgdat->node_zones + ZONE_NORMAL;
1453 if (zone->present_pages)
1454 zonelist->zones[j++] = zone;
1455 case ZONE_DMA32:
1456 zone = pgdat->node_zones + ZONE_DMA32;
1457 if (zone->present_pages)
1458 zonelist->zones[j++] = zone;
1459 case ZONE_DMA:
1460 zone = pgdat->node_zones + ZONE_DMA;
1461 if (zone->present_pages)
1462 zonelist->zones[j++] = zone;
1463 }
1464 1490
1465 return j; 1491 } while (zone_type >= 0);
1492 return nr_zones;
1466} 1493}
1467 1494
1468static inline int highest_zone(int zone_bits) 1495static inline int highest_zone(int zone_bits)
@@ -1709,8 +1736,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1709 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1736 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1710 if (!early_pfn_valid(pfn)) 1737 if (!early_pfn_valid(pfn))
1711 continue; 1738 continue;
1712 if (!early_pfn_in_nid(pfn, nid))
1713 continue;
1714 page = pfn_to_page(pfn); 1739 page = pfn_to_page(pfn);
1715 set_page_links(page, zone, nid, pfn); 1740 set_page_links(page, zone, nid, pfn);
1716 set_page_count(page, 1); 1741 set_page_count(page, 1);
@@ -1794,14 +1819,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1794 1819
1795 pcp = &p->pcp[0]; /* hot */ 1820 pcp = &p->pcp[0]; /* hot */
1796 pcp->count = 0; 1821 pcp->count = 0;
1797 pcp->low = 0;
1798 pcp->high = 6 * batch; 1822 pcp->high = 6 * batch;
1799 pcp->batch = max(1UL, 1 * batch); 1823 pcp->batch = max(1UL, 1 * batch);
1800 INIT_LIST_HEAD(&pcp->list); 1824 INIT_LIST_HEAD(&pcp->list);
1801 1825
1802 pcp = &p->pcp[1]; /* cold*/ 1826 pcp = &p->pcp[1]; /* cold*/
1803 pcp->count = 0; 1827 pcp->count = 0;
1804 pcp->low = 0;
1805 pcp->high = 2 * batch; 1828 pcp->high = 2 * batch;
1806 pcp->batch = max(1UL, batch/2); 1829 pcp->batch = max(1UL, batch/2);
1807 INIT_LIST_HEAD(&pcp->list); 1830 INIT_LIST_HEAD(&pcp->list);
@@ -2116,7 +2139,7 @@ static int frag_show(struct seq_file *m, void *arg)
2116 int order; 2139 int order;
2117 2140
2118 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2141 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2119 if (!zone->present_pages) 2142 if (!populated_zone(zone))
2120 continue; 2143 continue;
2121 2144
2122 spin_lock_irqsave(&zone->lock, flags); 2145 spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2172,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2149 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2172 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2150 int i; 2173 int i;
2151 2174
2152 if (!zone->present_pages) 2175 if (!populated_zone(zone))
2153 continue; 2176 continue;
2154 2177
2155 spin_lock_irqsave(&zone->lock, flags); 2178 spin_lock_irqsave(&zone->lock, flags);
@@ -2197,12 +2220,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2197 seq_printf(m, 2220 seq_printf(m,
2198 "\n cpu: %i pcp: %i" 2221 "\n cpu: %i pcp: %i"
2199 "\n count: %i" 2222 "\n count: %i"
2200 "\n low: %i"
2201 "\n high: %i" 2223 "\n high: %i"
2202 "\n batch: %i", 2224 "\n batch: %i",
2203 i, j, 2225 i, j,
2204 pageset->pcp[j].count, 2226 pageset->pcp[j].count,
2205 pageset->pcp[j].low,
2206 pageset->pcp[j].high, 2227 pageset->pcp[j].high,
2207 pageset->pcp[j].batch); 2228 pageset->pcp[j].batch);
2208 } 2229 }
@@ -2257,32 +2278,40 @@ static char *vmstat_text[] = {
2257 "pgpgout", 2278 "pgpgout",
2258 "pswpin", 2279 "pswpin",
2259 "pswpout", 2280 "pswpout",
2260 "pgalloc_high",
2261 2281
2282 "pgalloc_high",
2262 "pgalloc_normal", 2283 "pgalloc_normal",
2284 "pgalloc_dma32",
2263 "pgalloc_dma", 2285 "pgalloc_dma",
2286
2264 "pgfree", 2287 "pgfree",
2265 "pgactivate", 2288 "pgactivate",
2266 "pgdeactivate", 2289 "pgdeactivate",
2267 2290
2268 "pgfault", 2291 "pgfault",
2269 "pgmajfault", 2292 "pgmajfault",
2293
2270 "pgrefill_high", 2294 "pgrefill_high",
2271 "pgrefill_normal", 2295 "pgrefill_normal",
2296 "pgrefill_dma32",
2272 "pgrefill_dma", 2297 "pgrefill_dma",
2273 2298
2274 "pgsteal_high", 2299 "pgsteal_high",
2275 "pgsteal_normal", 2300 "pgsteal_normal",
2301 "pgsteal_dma32",
2276 "pgsteal_dma", 2302 "pgsteal_dma",
2303
2277 "pgscan_kswapd_high", 2304 "pgscan_kswapd_high",
2278 "pgscan_kswapd_normal", 2305 "pgscan_kswapd_normal",
2279 2306 "pgscan_kswapd_dma32",
2280 "pgscan_kswapd_dma", 2307 "pgscan_kswapd_dma",
2308
2281 "pgscan_direct_high", 2309 "pgscan_direct_high",
2282 "pgscan_direct_normal", 2310 "pgscan_direct_normal",
2311 "pgscan_direct_dma32",
2283 "pgscan_direct_dma", 2312 "pgscan_direct_dma",
2284 "pginodesteal",
2285 2313
2314 "pginodesteal",
2286 "slabs_scanned", 2315 "slabs_scanned",
2287 "kswapd_steal", 2316 "kswapd_steal",
2288 "kswapd_inodesteal", 2317 "kswapd_inodesteal",
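
The vmstat_text reshuffle adds the new *_dma32 counters and regroups the names; the order matters because the strings are printed positionally against struct page_state. Reading the result from user space is just a text scan of /proc/vmstat; a small sketch that picks out the allocation counters:

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/vmstat", "r");
        char name[64];
        unsigned long long val;

        if (!f) { perror("/proc/vmstat"); return 1; }
        /* Each line is "name value"; the pgalloc_* set now includes
         * a dma32 entry on kernels with ZONE_DMA32. */
        while (fscanf(f, "%63s %llu", name, &val) == 2)
                if (!strncmp(name, "pgalloc_", 8) || !strcmp(name, "pgfree"))
                        printf("%-16s %llu\n", name, val);
        fclose(f);
        return 0;
}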
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c7..8d6eeaaa6296 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
158{ 158{
159 unsigned page_idx; 159 unsigned page_idx;
160 struct pagevec lru_pvec; 160 struct pagevec lru_pvec;
161 int ret = 0; 161 int ret;
162 162
163 if (mapping->a_ops->readpages) { 163 if (mapping->a_ops->readpages) {
164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
171 list_del(&page->lru); 171 list_del(&page->lru);
172 if (!add_to_page_cache(page, mapping, 172 if (!add_to_page_cache(page, mapping,
173 page->index, GFP_KERNEL)) { 173 page->index, GFP_KERNEL)) {
174 mapping->a_ops->readpage(filp, page); 174 ret = mapping->a_ops->readpage(filp, page);
175 if (!pagevec_add(&lru_pvec, page)) 175 if (ret != AOP_TRUNCATED_PAGE) {
176 __pagevec_lru_add(&lru_pvec); 176 if (!pagevec_add(&lru_pvec, page))
177 } else { 177 __pagevec_lru_add(&lru_pvec);
178 page_cache_release(page); 178 continue;
179 } /* else fall through to release */
179 } 180 }
181 page_cache_release(page);
180 } 182 }
181 pagevec_lru_add(&lru_pvec); 183 pagevec_lru_add(&lru_pvec);
184 ret = 0;
182out: 185out:
183 return ret; 186 return ret;
184} 187}
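
The read_pages() fix matters because ->readpage() can now return AOP_TRUNCATED_PAGE, in which case the page must be released rather than added to the LRU; the fall-through above does exactly that. The function is driven by ordinary readahead, which user space can prime explicitly, for instance with posix_fadvise (a sketch; the default file path is an assumption):

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "/etc/hosts"; /* assumed */
        char buf[4096];
        int fd, err;

        fd = open(path, O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        /* Ask for the first 1 MiB ahead of time; the request is batched
         * into read_pages() via the fs's readpages()/readpage() ops. */
        err = posix_fadvise(fd, 0, 1 << 20, POSIX_FADV_WILLNEED);
        if (err)
                fprintf(stderr, "posix_fadvise: %s\n", strerror(err));

        if (read(fd, buf, sizeof(buf)) < 0) /* likely from page cache now */
                perror("read");
        close(fd);
        return 0;
}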
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def159..6f3f7db27128 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked)
435} 435}
436 436
437/** 437/**
438 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped
442 */
443static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address)
445{
446 struct anon_vma *anon_vma = vma->anon_vma;
447
448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma;
451
452 page->index = linear_page_index(vma, address);
453
454 /*
455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt.
457 */
458 __inc_page_state(nr_mapped);
459}
460
461/**
438 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
439 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked)
445void page_add_anon_rmap(struct page *page, 469void page_add_anon_rmap(struct page *page,
446 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
447{ 471{
448 if (atomic_inc_and_test(&page->_mapcount)) { 472 if (atomic_inc_and_test(&page->_mapcount))
449 struct anon_vma *anon_vma = vma->anon_vma; 473 __page_set_anon_rmap(page, vma, address);
450
451 BUG_ON(!anon_vma);
452 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
453 page->mapping = (struct address_space *) anon_vma;
454
455 page->index = linear_page_index(vma, address);
456
457 inc_page_state(nr_mapped);
458 }
459 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
460} 475}
461 476
477/*
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped
482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed.
485 */
486void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address)
488{
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address);
491}
492
462/** 493/**
463 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
464 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page)
471 BUG_ON(!pfn_valid(page_to_pfn(page))); 502 BUG_ON(!pfn_valid(page_to_pfn(page)));
472 503
473 if (atomic_inc_and_test(&page->_mapcount)) 504 if (atomic_inc_and_test(&page->_mapcount))
474 inc_page_state(nr_mapped); 505 __inc_page_state(nr_mapped);
475} 506}
476 507
477/** 508/**
@@ -495,7 +526,7 @@ void page_remove_rmap(struct page *page)
495 */ 526 */
496 if (page_test_and_clear_dirty(page)) 527 if (page_test_and_clear_dirty(page))
497 set_page_dirty(page); 528 set_page_dirty(page);
498 dec_page_state(nr_mapped); 529 __dec_page_state(nr_mapped);
499 } 530 }
500} 531}
501 532
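
page_add_new_anon_rmap() is safe because _mapcount is biased to -1 for an unmapped page: atomic_inc_and_test() returns true exactly on the -1 -> 0 transition, i.e. for the first mapper, which is when nr_mapped is bumped and the anon fields are set. For a page the caller has just allocated there can be no other mapper, so the count is set to 0 directly and the setup runs unconditionally. The bias in miniature (a GCC builtin stands in for the kernel atomic; that substitution is an assumption of the sketch):

#include <stdio.h>

static int mapcount = -1;       /* biased: -1 means "no mappings" */

/* atomic_inc_and_test(): increment, report whether the result is 0,
 * which is true only for the first mapper (-1 -> 0). */
static int inc_and_test(int *v)
{
        return __sync_add_and_fetch(v, 1) == 0;
}

int main(void)
{
        for (int i = 1; i <= 3; i++)
                printf("mapper %d: first? %s (mapcount %d)\n",
                       i, inc_and_test(&mapcount) ? "yes" : "no", mapcount);

        /* page_add_new_anon_rmap() shortcut for a brand-new page:
         * atomic_set(&page->_mapcount, 0), no inc-and-test round trip. */
        int fresh = 0;
        printf("new page mapcount: %d\n", fresh);
        return 0;
}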
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e9..a1f2f02af724 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
457 } while (next); 457 } while (next);
458} 458}
459 459
460static void shmem_truncate(struct inode *inode) 460static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
461{ 461{
462 struct shmem_inode_info *info = SHMEM_I(inode); 462 struct shmem_inode_info *info = SHMEM_I(inode);
463 unsigned long idx; 463 unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
475 long nr_swaps_freed = 0; 475 long nr_swaps_freed = 0;
476 int offset; 476 int offset;
477 int freed; 477 int freed;
478 int punch_hole = 0;
478 479
479 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 480 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
480 idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 481 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
481 if (idx >= info->next_index) 482 if (idx >= info->next_index)
482 return; 483 return;
483 484
484 spin_lock(&info->lock); 485 spin_lock(&info->lock);
485 info->flags |= SHMEM_TRUNCATE; 486 info->flags |= SHMEM_TRUNCATE;
486 limit = info->next_index; 487 if (likely(end == (loff_t) -1)) {
487 info->next_index = idx; 488 limit = info->next_index;
489 info->next_index = idx;
490 } else {
491 limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
492 if (limit > info->next_index)
493 limit = info->next_index;
494 punch_hole = 1;
495 }
496
488 topdir = info->i_indirect; 497 topdir = info->i_indirect;
489 if (topdir && idx <= SHMEM_NR_DIRECT) { 498 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
490 info->i_indirect = NULL; 499 info->i_indirect = NULL;
491 nr_pages_to_free++; 500 nr_pages_to_free++;
492 list_add(&topdir->lru, &pages_to_free); 501 list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
573 set_page_private(subdir, page_private(subdir) - freed); 582 set_page_private(subdir, page_private(subdir) - freed);
574 if (offset) 583 if (offset)
575 spin_unlock(&info->lock); 584 spin_unlock(&info->lock);
576 BUG_ON(page_private(subdir) > offset); 585 if (!punch_hole)
586 BUG_ON(page_private(subdir) > offset);
577 } 587 }
578 if (offset) 588 if (offset)
579 offset = 0; 589 offset = 0;
580 else if (subdir) { 590 else if (subdir && !page_private(subdir)) {
581 dir[diroff] = NULL; 591 dir[diroff] = NULL;
582 nr_pages_to_free++; 592 nr_pages_to_free++;
583 list_add(&subdir->lru, &pages_to_free); 593 list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
594 * Also, though shmem_getpage checks i_size before adding to 604 * Also, though shmem_getpage checks i_size before adding to
595 * cache, no recheck after: so fix the narrow window there too. 605 * cache, no recheck after: so fix the narrow window there too.
596 */ 606 */
597 truncate_inode_pages(inode->i_mapping, inode->i_size); 607 truncate_inode_pages_range(inode->i_mapping, start, end);
598 } 608 }
599 609
600 spin_lock(&info->lock); 610 spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
614 } 624 }
615} 625}
616 626
627static void shmem_truncate(struct inode *inode)
628{
629 shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
630}
631
617static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 632static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
618{ 633{
619 struct inode *inode = dentry->d_inode; 634 struct inode *inode = dentry->d_inode;
@@ -855,7 +870,7 @@ unlock:
855 swap_free(swap); 870 swap_free(swap);
856redirty: 871redirty:
857 set_page_dirty(page); 872 set_page_dirty(page);
858 return WRITEPAGE_ACTIVATE; /* Return with the page locked */ 873 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
859} 874}
860 875
861#ifdef CONFIG_NUMA 876#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
1255 return retval; 1270 return retval;
1256} 1271}
1257 1272
1258static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1273int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259{ 1274{
1260 file_accessed(file); 1275 file_accessed(file);
1261 vma->vm_ops = &shmem_vm_ops; 1276 vma->vm_ops = &shmem_vm_ops;
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = {
2083static struct inode_operations shmem_inode_operations = { 2098static struct inode_operations shmem_inode_operations = {
2084 .truncate = shmem_truncate, 2099 .truncate = shmem_truncate,
2085 .setattr = shmem_notify_change, 2100 .setattr = shmem_notify_change,
2101 .truncate_range = shmem_truncate_range,
2086}; 2102};
2087 2103
2088static struct inode_operations shmem_dir_inode_operations = { 2104static struct inode_operations shmem_dir_inode_operations = {
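
shmem_truncate_range() converts the byte range into page indexes: idx is the first page starting at or after start, limit is one past the page containing end, and end == (loff_t)-1 preserves the old truncate-to-EOF behaviour while anything else sets punch_hole. A stand-alone sketch of that arithmetic, assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12); the byte range is a hypothetical example:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                     /* assumed: 4 KiB pages */
#define PAGE_CACHE_SIZE  (1ULL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long long start = 5000, end = 20000;   /* hypothetical byte range */

	/* first page starting at or after start */
	unsigned long long idx   = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	/* one past the page containing end */
	unsigned long long limit = (end   + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	printf("punch page indexes [%llu, %llu)\n", idx, limit);    /* [2, 5) */
	return 0;
}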
diff --git a/mm/swap.c b/mm/swap.c
index 73d351439ef6..ee6d71ccfa56 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page)
156 put_cpu_var(lru_add_active_pvecs); 156 put_cpu_var(lru_add_active_pvecs);
157} 157}
158 158
159void lru_add_drain(void) 159static void __lru_add_drain(int cpu)
160{ 160{
161 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 161 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
162 162
 163 /* Runs with preemption off on the local CPU, or on a dead CPU: no locking needed. */
163 if (pagevec_count(pvec)) 164 if (pagevec_count(pvec))
164 __pagevec_lru_add(pvec); 165 __pagevec_lru_add(pvec);
165 pvec = &__get_cpu_var(lru_add_active_pvecs); 166 pvec = &per_cpu(lru_add_active_pvecs, cpu);
166 if (pagevec_count(pvec)) 167 if (pagevec_count(pvec))
167 __pagevec_lru_add_active(pvec); 168 __pagevec_lru_add_active(pvec);
168 put_cpu_var(lru_add_pvecs); 169}
170
171void lru_add_drain(void)
172{
173 __lru_add_drain(get_cpu());
174 put_cpu();
169} 175}
170 176
171/* 177/*
@@ -412,17 +418,6 @@ void vm_acct_memory(long pages)
412} 418}
413 419
414#ifdef CONFIG_HOTPLUG_CPU 420#ifdef CONFIG_HOTPLUG_CPU
415static void lru_drain_cache(unsigned int cpu)
416{
417 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
418
419 /* CPU is dead, so no locking needed. */
420 if (pagevec_count(pvec))
421 __pagevec_lru_add(pvec);
422 pvec = &per_cpu(lru_add_active_pvecs, cpu);
423 if (pagevec_count(pvec))
424 __pagevec_lru_add_active(pvec);
425}
426 421
427/* Drop the CPU's cached committed space back into the central pool. */ 422/* Drop the CPU's cached committed space back into the central pool. */
428static int cpu_swap_callback(struct notifier_block *nfb, 423static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +430,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
435 if (action == CPU_DEAD) { 430 if (action == CPU_DEAD) {
436 atomic_add(*committed, &vm_committed_space); 431 atomic_add(*committed, &vm_committed_space);
437 *committed = 0; 432 *committed = 0;
438 lru_drain_cache((long)hcpu); 433 __lru_add_drain((long)hcpu);
439 } 434 }
440 return NOTIFY_OK; 435 return NOTIFY_OK;
441} 436}
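
The swap.c change folds the fast path and the CPU-hotplug path into one helper: lru_add_drain() pins the current CPU via get_cpu()/put_cpu(), while the hotplug callback passes a dead CPU's id straight to __lru_add_drain(). A rough user-space model of that shape; NCPUS and the per-CPU counters are stand-ins, not kernel structures:

#include <stdio.h>

#define NCPUS 4
static int lru_add_pvec[NCPUS];          /* pending pages per CPU (model only) */

static void __lru_add_drain(int cpu)     /* shared helper: works for any cpu id */
{
	if (lru_add_pvec[cpu]) {
		printf("cpu %d: flushing %d pages\n", cpu, lru_add_pvec[cpu]);
		lru_add_pvec[cpu] = 0;
	}
}

static void lru_add_drain(void)          /* fast path: current CPU only */
{
	int cpu = 0;                     /* stands in for get_cpu() */
	__lru_add_drain(cpu);
	/* put_cpu() would re-enable preemption here */
}

int main(void)
{
	lru_add_pvec[0] = 3;
	lru_add_pvec[2] = 7;
	lru_add_drain();                 /* local drain */
	__lru_add_drain(2);              /* hotplug path: drain a dead CPU */
	return 0;
}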
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1de8..fc2aecb70a95 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
272 */ 273 */
273void free_pages_and_swap_cache(struct page **pages, int nr) 274void free_pages_and_swap_cache(struct page **pages, int nr)
274{ 275{
275 int chunk = 16;
276 struct page **pagep = pages; 276 struct page **pagep = pages;
277 277
278 lru_add_drain(); 278 lru_add_drain();
279 while (nr) { 279 while (nr) {
280 int todo = min(chunk, nr); 280 int todo = min(nr, PAGEVEC_SIZE);
281 int i; 281 int i;
282 282
283 for (i = 0; i < todo; i++) 283 for (i = 0; i < todo; i++)
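
free_pages_and_swap_cache() now chunks by PAGEVEC_SIZE rather than a private constant 16, so the batch size tracks the pagevec it feeds. A sketch of the loop shape, assuming PAGEVEC_SIZE is 14 as in kernels of this vintage:

#include <stdio.h>

#define PAGEVEC_SIZE 14                  /* assumed value for this era */

int main(void)
{
	int nr = 30;                     /* pages left to release (example) */

	while (nr) {
		int todo = nr < PAGEVEC_SIZE ? nr : PAGEVEC_SIZE;
		printf("release batch of %d pages\n", todo);
		nr -= todo;              /* batches of 14, 14, 2 */
	}
	return 0;
}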
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301f..6da4b28b896b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -211,6 +211,26 @@ noswap:
211 return (swp_entry_t) {0}; 211 return (swp_entry_t) {0};
212} 212}
213 213
214swp_entry_t get_swap_page_of_type(int type)
215{
216 struct swap_info_struct *si;
217 pgoff_t offset;
218
219 spin_lock(&swap_lock);
220 si = swap_info + type;
221 if (si->flags & SWP_WRITEOK) {
222 nr_swap_pages--;
223 offset = scan_swap_map(si);
224 if (offset) {
225 spin_unlock(&swap_lock);
226 return swp_entry(type, offset);
227 }
228 nr_swap_pages++;
229 }
230 spin_unlock(&swap_lock);
231 return (swp_entry_t) {0};
232}
233
214static struct swap_info_struct * swap_info_get(swp_entry_t entry) 234static struct swap_info_struct * swap_info_get(swp_entry_t entry)
215{ 235{
216 struct swap_info_struct * p; 236 struct swap_info_struct * p;
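
get_swap_page_of_type() restricts allocation to a single swap device instead of scanning them all, which callers such as the suspend-to-disk image writer can use to place pages on a chosen device. A simplified user-space model; the structures and values are illustrative only:

#include <stdio.h>

struct swap_info { int writeok; int next_free; };   /* toy swap device */
static struct swap_info swap_info[2] = { { 0, 0 }, { 1, 100 } };

struct swp_entry { int type; int offset; };

static struct swp_entry get_swap_page_of_type(int type)
{
	struct swap_info *si = &swap_info[type];

	if (si->writeok && si->next_free)           /* only this device is tried */
		return (struct swp_entry){ type, si->next_free++ };
	return (struct swp_entry){ 0, 0 };          /* {0}: allocation failed */
}

int main(void)
{
	struct swp_entry e = get_swap_page_of_type(1);
	printf("allocated type=%d offset=%d\n", e.type, e.offset);
	return 0;
}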
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed6..cdc6d431972b 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
81 goto close_file; 81 goto close_file;
82 82
83 d_instantiate(dentry, inode); 83 d_instantiate(dentry, inode);
84 inode->i_size = size;
85 inode->i_nlink = 0; /* It is unlinked */ 84 inode->i_nlink = 0; /* It is unlinked */
85
86 file->f_vfsmnt = mntget(shm_mnt); 86 file->f_vfsmnt = mntget(shm_mnt);
87 file->f_dentry = dentry; 87 file->f_dentry = dentry;
88 file->f_mapping = inode->i_mapping; 88 file->f_mapping = inode->i_mapping;
89 file->f_op = &ramfs_file_operations; 89 file->f_op = &ramfs_file_operations;
90 file->f_mode = FMODE_WRITE | FMODE_READ; 90 file->f_mode = FMODE_WRITE | FMODE_READ;
91
 92 /* notify everyone of the file size change */
93 error = do_truncate(dentry, size, file);
94 if (error < 0)
95 goto close_file;
96
91 return file; 97 return file;
92 98
93close_file: 99close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
123{ 129{
124 return 0; 130 return 0;
125} 131}
132
133int shmem_mmap(struct file *file, struct vm_area_struct *vma)
134{
135 file_accessed(file);
136#ifndef CONFIG_MMU
137 return ramfs_nommu_mmap(file, vma);
138#else
139 return 0;
140#endif
141}
142
143#ifndef CONFIG_MMU
144unsigned long shmem_get_unmapped_area(struct file *file,
145 unsigned long addr,
146 unsigned long len,
147 unsigned long pgoff,
148 unsigned long flags)
149{
150 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
151}
152#endif
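
The tiny-shmem hunk replaces a bare i_size assignment with do_truncate(), so the size change goes through the common attribute-change path and its notifications instead of being set silently. A toy model of the difference; struct inode here is a stand-in, not the kernel's:

#include <stdio.h>

struct inode { long long i_size; };                /* toy inode */

static void notify_size_change(const struct inode *in)
{
	/* stands in for the setattr/truncate notification machinery */
	printf("inode size is now %lld\n", in->i_size);
}

static int do_truncate(struct inode *in, long long size)
{
	in->i_size = size;                         /* what the old code did... */
	notify_size_change(in);                    /* ...plus the step it skipped */
	return 0;
}

int main(void)
{
	struct inode in = { 0 };
	return do_truncate(&in, 4096) != 0;
}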
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab500604..7dee32745901 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
82} 82}
83 83
84/** 84/**
 85 * truncate_inode_pages - truncate *all* the pages from an offset 85 * truncate_inode_pages_range - truncate range of pages specified by start and
86 * end byte offsets
86 * @mapping: mapping to truncate 87 * @mapping: mapping to truncate
87 * @lstart: offset from which to truncate 88 * @lstart: offset from which to truncate
89 * @lend: offset to which to truncate
88 * 90 *
 89 * Truncate the page cache at a set offset, removing the pages that are beyond 91 * Truncate the page cache, removing the pages that are between
 90 * that offset (and zeroing out partial pages). 92 * the specified offsets (and zeroing out the partial page
 93 * if lstart is not page aligned).
91 * 94 *
92 * Truncate takes two passes - the first pass is nonblocking. It will not 95 * Truncate takes two passes - the first pass is nonblocking. It will not
93 * block on page locks and it will not block on writeback. The second pass 96 * block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
101 * We pass down the cache-hot hint to the page freeing code. Even if the 104 * We pass down the cache-hot hint to the page freeing code. Even if the
102 * mapping is large, it is probably the case that the final pages are the most 105 * mapping is large, it is probably the case that the final pages are the most
103 * recently touched, and freeing happens in ascending file offset order. 106 * recently touched, and freeing happens in ascending file offset order.
104 *
105 * Called under (and serialised by) inode->i_sem.
106 */ 107 */
107void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 108void truncate_inode_pages_range(struct address_space *mapping,
109 loff_t lstart, loff_t lend)
108{ 110{
109 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 111 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
112 pgoff_t end;
110 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 113 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
111 struct pagevec pvec; 114 struct pagevec pvec;
112 pgoff_t next; 115 pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
115 if (mapping->nrpages == 0) 118 if (mapping->nrpages == 0)
116 return; 119 return;
117 120
121 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
122 end = (lend >> PAGE_CACHE_SHIFT);
123
118 pagevec_init(&pvec, 0); 124 pagevec_init(&pvec, 0);
119 next = start; 125 next = start;
120 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 126 while (next <= end &&
127 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
121 for (i = 0; i < pagevec_count(&pvec); i++) { 128 for (i = 0; i < pagevec_count(&pvec); i++) {
122 struct page *page = pvec.pages[i]; 129 struct page *page = pvec.pages[i];
123 pgoff_t page_index = page->index; 130 pgoff_t page_index = page->index;
124 131
132 if (page_index > end) {
133 next = page_index;
134 break;
135 }
136
125 if (page_index > next) 137 if (page_index > next)
126 next = page_index; 138 next = page_index;
127 next++; 139 next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
157 next = start; 169 next = start;
158 continue; 170 continue;
159 } 171 }
172 if (pvec.pages[0]->index > end) {
173 pagevec_release(&pvec);
174 break;
175 }
160 for (i = 0; i < pagevec_count(&pvec); i++) { 176 for (i = 0; i < pagevec_count(&pvec); i++) {
161 struct page *page = pvec.pages[i]; 177 struct page *page = pvec.pages[i];
162 178
179 if (page->index > end)
180 break;
163 lock_page(page); 181 lock_page(page);
164 wait_on_page_writeback(page); 182 wait_on_page_writeback(page);
165 if (page->index > next) 183 if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
171 pagevec_release(&pvec); 189 pagevec_release(&pvec);
172 } 190 }
173} 191}
192EXPORT_SYMBOL(truncate_inode_pages_range);
174 193
194/**
195 * truncate_inode_pages - truncate *all* the pages from an offset
196 * @mapping: mapping to truncate
197 * @lstart: offset from which to truncate
198 *
199 * Called under (and serialised by) inode->i_sem.
200 */
201void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
202{
203 truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
204}
175EXPORT_SYMBOL(truncate_inode_pages); 205EXPORT_SYMBOL(truncate_inode_pages);
176 206
177/** 207/**
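
truncate_inode_pages_range() derives page indexes from byte offsets: lstart rounds up to the first whole page, and lend must name the last byte of a page, which (loff_t)-1 satisfies naturally for the truncate-everything wrapper. A stand-alone sketch of the arithmetic and the alignment check, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                    /* assumed: 4 KiB pages */
#define PAGE_CACHE_SIZE  (1ULL << PAGE_CACHE_SHIFT)

int main(void)
{
	long long lstart = 8192, lend = 16384 - 1;   /* drop page indexes 2 and 3 */

	if ((lend & (PAGE_CACHE_SIZE - 1)) != PAGE_CACHE_SIZE - 1)
		return 1;                            /* mirrors the BUG_ON above */

	unsigned long long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long long end   = lend >> PAGE_CACHE_SHIFT;
	printf("truncating page indexes %llu..%llu\n", start, end);  /* 2..3 */
	return 0;
}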
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de6..be8235fb1939 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
63 63
64 unsigned long nr_mapped; /* From page_state */ 64 unsigned long nr_mapped; /* From page_state */
65 65
66 /* How many pages shrink_cache() should reclaim */
67 int nr_to_reclaim;
68
69 /* Ask shrink_caches, or shrink_zone to scan at this priority */ 66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
70 unsigned int priority; 67 unsigned int priority;
71 68
@@ -74,9 +71,6 @@ struct scan_control {
74 71
75 int may_writepage; 72 int may_writepage;
76 73
77 /* Can pages be swapped as part of reclaim? */
78 int may_swap;
79
80 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 74 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
81 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 75 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
82 * In this context, it doesn't matter that we scan the 76 * In this context, it doesn't matter that we scan the
@@ -367,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
367 res = mapping->a_ops->writepage(page, &wbc); 361 res = mapping->a_ops->writepage(page, &wbc);
368 if (res < 0) 362 if (res < 0)
369 handle_write_error(mapping, page, res); 363 handle_write_error(mapping, page, res);
370 if (res == WRITEPAGE_ACTIVATE) { 364 if (res == AOP_WRITEPAGE_ACTIVATE) {
371 ClearPageReclaim(page); 365 ClearPageReclaim(page);
372 return PAGE_ACTIVATE; 366 return PAGE_ACTIVATE;
373 } 367 }
@@ -430,8 +424,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
430 * Try to allocate it some swap space here. 424 * Try to allocate it some swap space here.
431 */ 425 */
432 if (PageAnon(page) && !PageSwapCache(page)) { 426 if (PageAnon(page) && !PageSwapCache(page)) {
433 if (!sc->may_swap)
434 goto keep_locked;
435 if (!add_to_swap(page)) 427 if (!add_to_swap(page))
436 goto activate_locked; 428 goto activate_locked;
437 } 429 }
@@ -653,17 +645,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
653 goto done; 645 goto done;
654 646
655 max_scan -= nr_scan; 647 max_scan -= nr_scan;
656 if (current_is_kswapd())
657 mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
658 else
659 mod_page_state_zone(zone, pgscan_direct, nr_scan);
660 nr_freed = shrink_list(&page_list, sc); 648 nr_freed = shrink_list(&page_list, sc);
661 if (current_is_kswapd())
662 mod_page_state(kswapd_steal, nr_freed);
663 mod_page_state_zone(zone, pgsteal, nr_freed);
664 sc->nr_to_reclaim -= nr_freed;
665 649
666 spin_lock_irq(&zone->lru_lock); 650 local_irq_disable();
651 if (current_is_kswapd()) {
652 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
653 __mod_page_state(kswapd_steal, nr_freed);
654 } else
655 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
656 __mod_page_state_zone(zone, pgsteal, nr_freed);
657
658 spin_lock(&zone->lru_lock);
667 /* 659 /*
668 * Put back any unfreeable pages. 660 * Put back any unfreeable pages.
669 */ 661 */
@@ -825,11 +817,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
825 } 817 }
826 } 818 }
827 zone->nr_active += pgmoved; 819 zone->nr_active += pgmoved;
828 spin_unlock_irq(&zone->lru_lock); 820 spin_unlock(&zone->lru_lock);
829 pagevec_release(&pvec); 821
822 __mod_page_state_zone(zone, pgrefill, pgscanned);
823 __mod_page_state(pgdeactivate, pgdeactivate);
824 local_irq_enable();
830 825
831 mod_page_state_zone(zone, pgrefill, pgscanned); 826 pagevec_release(&pvec);
832 mod_page_state(pgdeactivate, pgdeactivate);
833} 827}
834 828
835/* 829/*
@@ -861,8 +855,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
861 else 855 else
862 nr_inactive = 0; 856 nr_inactive = 0;
863 857
864 sc->nr_to_reclaim = sc->swap_cluster_max;
865
866 while (nr_active || nr_inactive) { 858 while (nr_active || nr_inactive) {
867 if (nr_active) { 859 if (nr_active) {
868 sc->nr_to_scan = min(nr_active, 860 sc->nr_to_scan = min(nr_active,
@@ -876,8 +868,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
876 (unsigned long)sc->swap_cluster_max); 868 (unsigned long)sc->swap_cluster_max);
877 nr_inactive -= sc->nr_to_scan; 869 nr_inactive -= sc->nr_to_scan;
878 shrink_cache(zone, sc); 870 shrink_cache(zone, sc);
879 if (sc->nr_to_reclaim <= 0)
880 break;
881 } 871 }
882 } 872 }
883 873
@@ -910,7 +900,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
910 for (i = 0; zones[i] != NULL; i++) { 900 for (i = 0; zones[i] != NULL; i++) {
911 struct zone *zone = zones[i]; 901 struct zone *zone = zones[i];
912 902
913 if (zone->present_pages == 0) 903 if (!populated_zone(zone))
914 continue; 904 continue;
915 905
916 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 906 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
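
populated_zone() replaces the open-coded zone->present_pages == 0 tests in this hunk and the ones below. In include/linux/mmzone.h of this era it is roughly the one-liner sketched here; the surrounding scaffolding is added only to make the example self-contained:

#include <stdio.h>

struct zone { unsigned long present_pages; };      /* trimmed to one field */

static inline int populated_zone(struct zone *zone)
{
	return !!zone->present_pages;              /* replaces "== 0" open-coding */
}

int main(void)
{
	struct zone empty = { 0 }, busy = { 4096 };
	printf("%d %d\n", populated_zone(&empty), populated_zone(&busy)); /* 0 1 */
	return 0;
}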
@@ -952,7 +942,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
952 942
953 sc.gfp_mask = gfp_mask; 943 sc.gfp_mask = gfp_mask;
954 sc.may_writepage = 0; 944 sc.may_writepage = 0;
955 sc.may_swap = 1;
956 945
957 inc_page_state(allocstall); 946 inc_page_state(allocstall);
958 947
@@ -1055,7 +1044,6 @@ loop_again:
1055 total_reclaimed = 0; 1044 total_reclaimed = 0;
1056 sc.gfp_mask = GFP_KERNEL; 1045 sc.gfp_mask = GFP_KERNEL;
1057 sc.may_writepage = 0; 1046 sc.may_writepage = 0;
1058 sc.may_swap = 1;
1059 sc.nr_mapped = read_page_state(nr_mapped); 1047 sc.nr_mapped = read_page_state(nr_mapped);
1060 1048
1061 inc_page_state(pageoutrun); 1049 inc_page_state(pageoutrun);
@@ -1084,7 +1072,7 @@ loop_again:
1084 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1072 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1085 struct zone *zone = pgdat->node_zones + i; 1073 struct zone *zone = pgdat->node_zones + i;
1086 1074
1087 if (zone->present_pages == 0) 1075 if (!populated_zone(zone))
1088 continue; 1076 continue;
1089 1077
1090 if (zone->all_unreclaimable && 1078 if (zone->all_unreclaimable &&
@@ -1121,7 +1109,7 @@ scan:
1121 struct zone *zone = pgdat->node_zones + i; 1109 struct zone *zone = pgdat->node_zones + i;
1122 int nr_slab; 1110 int nr_slab;
1123 1111
1124 if (zone->present_pages == 0) 1112 if (!populated_zone(zone))
1125 continue; 1113 continue;
1126 1114
1127 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1115 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1273,7 +1261,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1273{ 1261{
1274 pg_data_t *pgdat; 1262 pg_data_t *pgdat;
1275 1263
1276 if (zone->present_pages == 0) 1264 if (!populated_zone(zone))
1277 return; 1265 return;
1278 1266
1279 pgdat = zone->zone_pgdat; 1267 pgdat = zone->zone_pgdat;
@@ -1353,76 +1341,3 @@ static int __init kswapd_init(void)
1353} 1341}
1354 1342
1355module_init(kswapd_init) 1343module_init(kswapd_init)
1356
1357
1358/*
1359 * Try to free up some pages from this zone through reclaim.
1360 */
1361int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1362{
1363 struct scan_control sc;
1364 int nr_pages = 1 << order;
1365 int total_reclaimed = 0;
1366
1367 /* The reclaim may sleep, so don't do it if sleep isn't allowed */
1368 if (!(gfp_mask & __GFP_WAIT))
1369 return 0;
1370 if (zone->all_unreclaimable)
1371 return 0;
1372
1373 sc.gfp_mask = gfp_mask;
1374 sc.may_writepage = 0;
1375 sc.may_swap = 0;
1376 sc.nr_mapped = read_page_state(nr_mapped);
1377 sc.nr_scanned = 0;
1378 sc.nr_reclaimed = 0;
1379 /* scan at the highest priority */
1380 sc.priority = 0;
1381 disable_swap_token();
1382
1383 if (nr_pages > SWAP_CLUSTER_MAX)
1384 sc.swap_cluster_max = nr_pages;
1385 else
1386 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1387
1388 /* Don't reclaim the zone if there are other reclaimers active */
1389 if (atomic_read(&zone->reclaim_in_progress) > 0)
1390 goto out;
1391
1392 shrink_zone(zone, &sc);
1393 total_reclaimed = sc.nr_reclaimed;
1394
1395 out:
1396 return total_reclaimed;
1397}
1398
1399asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1400 unsigned int state)
1401{
1402 struct zone *z;
1403 int i;
1404
1405 if (!capable(CAP_SYS_ADMIN))
1406 return -EACCES;
1407
1408 if (node >= MAX_NUMNODES || !node_online(node))
1409 return -EINVAL;
1410
1411 /* This will break if we ever add more zones */
1412 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1413 return -EINVAL;
1414
1415 for (i = 0; i < MAX_NR_ZONES; i++) {
1416 if (!(zone & 1<<i))
1417 continue;
1418
1419 z = &NODE_DATA(node)->node_zones[i];
1420
1421 if (state)
1422 z->reclaim_pages = 1;
1423 else
1424 z->reclaim_pages = 0;
1425 }
1426
1427 return 0;
1428}
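
The statistics hunks in shrink_cache() and refill_inactive_zone() trade several interrupt-safe mod_page_state() calls for a single local_irq_disable() section using the cheaper double-underscore variants, then take zone->lru_lock with plain spin_lock() inside it. A toy model of that batching pattern; the flag is a stand-in for the IRQ-disable state:

#include <stdio.h>

static int irqs_off;                       /* models local IRQ-disable state */
static long pgscan_direct, pgsteal;

static void __mod_page_state(long *stat, long n)
{
	if (irqs_off)                      /* "__" variants assume IRQs are off */
		*stat += n;
}

int main(void)
{
	long nr_scan = 32, nr_freed = 9;   /* example batch from shrink_list() */

	irqs_off = 1;                      /* local_irq_disable() */
	__mod_page_state(&pgscan_direct, nr_scan);
	__mod_page_state(&pgsteal, nr_freed);
	/* zone->lru_lock is then taken with spin_lock(), not spin_lock_irq() */
	irqs_off = 0;                      /* local_irq_enable() */

	printf("pgscan_direct=%ld pgsteal=%ld\n", pgscan_direct, pgsteal);
	return 0;
}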
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index cac2e774dd81..3e6c694bbad1 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -101,10 +101,22 @@ static void ip_map_put(struct cache_head *item, struct cache_detail *cd)
101 } 101 }
102} 102}
103 103
104#if IP_HASHBITS == 8
105/* hash_long on a 64 bit machine is currently REALLY BAD for
106 * IP addresses in reverse-endian (i.e. on a little-endian machine).
107 * So use a trivial but reliable hash instead
108 */
109static inline int hash_ip(unsigned long ip)
110{
111 int hash = ip ^ (ip>>16);
112 return (hash ^ (hash>>8)) & 0xff;
113}
114#endif
115
104static inline int ip_map_hash(struct ip_map *item) 116static inline int ip_map_hash(struct ip_map *item)
105{ 117{
106 return hash_str(item->m_class, IP_HASHBITS) ^ 118 return hash_str(item->m_class, IP_HASHBITS) ^
107 hash_long((unsigned long)item->m_addr.s_addr, IP_HASHBITS); 119 hash_ip((unsigned long)item->m_addr.s_addr);
108} 120}
109static inline int ip_map_match(struct ip_map *item, struct ip_map *tmp) 121static inline int ip_map_match(struct ip_map *item, struct ip_map *tmp)
110{ 122{
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index d68eba481291..e67613e4eb18 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1026,7 +1026,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
1026 } else { 1026 } else {
1027 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", 1027 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
1028 svsk->sk_server->sv_name, -len); 1028 svsk->sk_server->sv_name, -len);
1029 svc_sock_received(svsk); 1029 goto err_delete;
1030 } 1030 }
1031 1031
1032 return len; 1032 return len;
diff --git a/security/keys/internal.h b/security/keys/internal.h
index db99ed434f3a..39cba97c5eb9 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -25,7 +25,6 @@
25#define kdebug(FMT, a...) do {} while(0) 25#define kdebug(FMT, a...) do {} while(0)
26#endif 26#endif
27 27
28extern struct key_type key_type_dead;
29extern struct key_type key_type_user; 28extern struct key_type key_type_user;
30 29
31/*****************************************************************************/ 30/*****************************************************************************/
diff --git a/security/keys/key.c b/security/keys/key.c
index 01bcfecb7eae..99781b798312 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -36,7 +36,7 @@ static DECLARE_WORK(key_cleanup_task, key_cleanup, NULL);
36DECLARE_RWSEM(key_construction_sem); 36DECLARE_RWSEM(key_construction_sem);
37 37
 38/* any key whose type gets unregistered will be re-typed to this */ 38/* any key whose type gets unregistered will be re-typed to this */
39struct key_type key_type_dead = { 39static struct key_type key_type_dead = {
40 .name = "dead", 40 .name = "dead",
41}; 41};
42 42
@@ -240,9 +240,9 @@ static inline void key_alloc_serial(struct key *key)
240/* 240/*
241 * allocate a key of the specified type 241 * allocate a key of the specified type
242 * - update the user's quota to reflect the existence of the key 242 * - update the user's quota to reflect the existence of the key
243 * - called from a key-type operation with key_types_sem read-locked by either 243 * - called from a key-type operation with key_types_sem read-locked by
244 * key_create_or_update() or by key_duplicate(); this prevents unregistration 244 * key_create_or_update()
245 * of the key type 245 * - this prevents unregistration of the key type
246 * - upon return the key is as yet uninstantiated; the caller needs to either 246 * - upon return the key is as yet uninstantiated; the caller needs to either
247 * instantiate the key or discard it before returning 247 * instantiate the key or discard it before returning
248 */ 248 */
@@ -889,56 +889,6 @@ EXPORT_SYMBOL(key_update);
889 889
890/*****************************************************************************/ 890/*****************************************************************************/
891/* 891/*
892 * duplicate a key, potentially with a revised description
893 * - must be supported by the keytype (keyrings for instance can be duplicated)
894 */
895struct key *key_duplicate(struct key *source, const char *desc)
896{
897 struct key *key;
898 int ret;
899
900 key_check(source);
901
902 if (!desc)
903 desc = source->description;
904
905 down_read(&key_types_sem);
906
907 ret = -EINVAL;
908 if (!source->type->duplicate)
909 goto error;
910
911 /* allocate and instantiate a key */
912 key = key_alloc(source->type, desc, current->fsuid, current->fsgid,
913 source->perm, 0);
914 if (IS_ERR(key))
915 goto error_k;
916
917 down_read(&source->sem);
918 ret = key->type->duplicate(key, source);
919 up_read(&source->sem);
920 if (ret < 0)
921 goto error2;
922
923 atomic_inc(&key->user->nikeys);
924 set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
925
926 error_k:
927 up_read(&key_types_sem);
928 out:
929 return key;
930
931 error2:
932 key_put(key);
933 error:
934 up_read(&key_types_sem);
935 key = ERR_PTR(ret);
936 goto out;
937
938} /* end key_duplicate() */
939
940/*****************************************************************************/
941/*
942 * revoke a key 892 * revoke a key
943 */ 893 */
944void key_revoke(struct key *key) 894void key_revoke(struct key *key)
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 4e9fa8be44b8..5d22c0388b32 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -48,7 +48,6 @@ static inline unsigned keyring_hash(const char *desc)
48 */ 48 */
49static int keyring_instantiate(struct key *keyring, 49static int keyring_instantiate(struct key *keyring,
50 const void *data, size_t datalen); 50 const void *data, size_t datalen);
51static int keyring_duplicate(struct key *keyring, const struct key *source);
52static int keyring_match(const struct key *keyring, const void *criterion); 51static int keyring_match(const struct key *keyring, const void *criterion);
53static void keyring_destroy(struct key *keyring); 52static void keyring_destroy(struct key *keyring);
54static void keyring_describe(const struct key *keyring, struct seq_file *m); 53static void keyring_describe(const struct key *keyring, struct seq_file *m);
@@ -59,7 +58,6 @@ struct key_type key_type_keyring = {
59 .name = "keyring", 58 .name = "keyring",
60 .def_datalen = sizeof(struct keyring_list), 59 .def_datalen = sizeof(struct keyring_list),
61 .instantiate = keyring_instantiate, 60 .instantiate = keyring_instantiate,
62 .duplicate = keyring_duplicate,
63 .match = keyring_match, 61 .match = keyring_match,
64 .destroy = keyring_destroy, 62 .destroy = keyring_destroy,
65 .describe = keyring_describe, 63 .describe = keyring_describe,
@@ -70,7 +68,7 @@ struct key_type key_type_keyring = {
70 * semaphore to serialise link/link calls to prevent two link calls in parallel 68 * semaphore to serialise link/link calls to prevent two link calls in parallel
71 * introducing a cycle 69 * introducing a cycle
72 */ 70 */
73DECLARE_RWSEM(keyring_serialise_link_sem); 71static DECLARE_RWSEM(keyring_serialise_link_sem);
74 72
75/*****************************************************************************/ 73/*****************************************************************************/
76/* 74/*
@@ -120,68 +118,6 @@ static int keyring_instantiate(struct key *keyring,
120 118
121/*****************************************************************************/ 119/*****************************************************************************/
122/* 120/*
123 * duplicate the list of subscribed keys from a source keyring into this one
124 */
125static int keyring_duplicate(struct key *keyring, const struct key *source)
126{
127 struct keyring_list *sklist, *klist;
128 unsigned max;
129 size_t size;
130 int loop, ret;
131
132 const unsigned limit =
133 (PAGE_SIZE - sizeof(*klist)) / sizeof(struct key *);
134
135 ret = 0;
136
137 /* find out how many keys are currently linked */
138 rcu_read_lock();
139 sklist = rcu_dereference(source->payload.subscriptions);
140 max = 0;
141 if (sklist)
142 max = sklist->nkeys;
143 rcu_read_unlock();
144
145 /* allocate a new payload and stuff load with key links */
146 if (max > 0) {
147 BUG_ON(max > limit);
148
149 max = (max + 3) & ~3;
150 if (max > limit)
151 max = limit;
152
153 ret = -ENOMEM;
154 size = sizeof(*klist) + sizeof(struct key *) * max;
155 klist = kmalloc(size, GFP_KERNEL);
156 if (!klist)
157 goto error;
158
159 /* set links */
160 rcu_read_lock();
161 sklist = rcu_dereference(source->payload.subscriptions);
162
163 klist->maxkeys = max;
164 klist->nkeys = sklist->nkeys;
165 memcpy(klist->keys,
166 sklist->keys,
167 sklist->nkeys * sizeof(struct key *));
168
169 for (loop = klist->nkeys - 1; loop >= 0; loop--)
170 atomic_inc(&klist->keys[loop]->usage);
171
172 rcu_read_unlock();
173
174 rcu_assign_pointer(keyring->payload.subscriptions, klist);
175 ret = 0;
176 }
177
178 error:
179 return ret;
180
181} /* end keyring_duplicate() */
182
183/*****************************************************************************/
184/*
185 * match keyrings on their name 121 * match keyrings on their name
186 */ 122 */
187static int keyring_match(const struct key *keyring, const void *description) 123static int keyring_match(const struct key *keyring, const void *description)
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index cbda3b2780a1..8e71895b97a7 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -26,7 +26,6 @@
26struct key_type key_type_user = { 26struct key_type key_type_user = {
27 .name = "user", 27 .name = "user",
28 .instantiate = user_instantiate, 28 .instantiate = user_instantiate,
29 .duplicate = user_duplicate,
30 .update = user_update, 29 .update = user_update,
31 .match = user_match, 30 .match = user_match,
32 .destroy = user_destroy, 31 .destroy = user_destroy,
@@ -68,42 +67,10 @@ error:
68 return ret; 67 return ret;
69 68
70} /* end user_instantiate() */ 69} /* end user_instantiate() */
71
72EXPORT_SYMBOL_GPL(user_instantiate); 70EXPORT_SYMBOL_GPL(user_instantiate);
73 71
74/*****************************************************************************/ 72/*****************************************************************************/
75/* 73/*
76 * duplicate a user defined key
77 * - both keys' semaphores are locked against further modification
78 * - the new key cannot yet be accessed
79 */
80int user_duplicate(struct key *key, const struct key *source)
81{
82 struct user_key_payload *upayload, *spayload;
83 int ret;
84
85 /* just copy the payload */
86 ret = -ENOMEM;
87 upayload = kmalloc(sizeof(*upayload) + source->datalen, GFP_KERNEL);
88 if (upayload) {
89 spayload = rcu_dereference(source->payload.data);
90 BUG_ON(source->datalen != spayload->datalen);
91
92 upayload->datalen = key->datalen = spayload->datalen;
93 memcpy(upayload->data, spayload->data, key->datalen);
94
95 key->payload.data = upayload;
96 ret = 0;
97 }
98
99 return ret;
100
101} /* end user_duplicate() */
102
103EXPORT_SYMBOL_GPL(user_duplicate);
104
105/*****************************************************************************/
106/*
107 * dispose of the old data from an updated user defined key 74 * dispose of the old data from an updated user defined key
108 */ 75 */
109static void user_update_rcu_disposal(struct rcu_head *rcu) 76static void user_update_rcu_disposal(struct rcu_head *rcu)
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 0e1352a555c8..e59da6398d44 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -376,7 +376,7 @@ static ssize_t selinux_transaction_write(struct file *file, const char __user *b
376 char *data; 376 char *data;
377 ssize_t rv; 377 ssize_t rv;
378 378
379 if (ino >= sizeof(write_op)/sizeof(write_op[0]) || !write_op[ino]) 379 if (ino >= ARRAY_SIZE(write_op) || !write_op[ino])
380 return -EINVAL; 380 return -EINVAL;
381 381
382 data = simple_transaction_get(file, buf, size); 382 data = simple_transaction_get(file, buf, size);
@@ -1161,7 +1161,7 @@ static int sel_make_avc_files(struct dentry *dir)
1161#endif 1161#endif
1162 }; 1162 };
1163 1163
1164 for (i = 0; i < sizeof (files) / sizeof (files[0]); i++) { 1164 for (i = 0; i < ARRAY_SIZE(files); i++) {
1165 struct inode *inode; 1165 struct inode *inode;
1166 struct dentry *dentry; 1166 struct dentry *dentry;
1167 1167
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index dde094feb20d..d049c7acbc8b 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -359,7 +359,7 @@ int avtab_read_item(void *fp, u32 vers, struct avtab *a,
359 return -1; 359 return -1;
360 } 360 }
361 361
362 for (i = 0; i < sizeof(spec_order)/sizeof(u16); i++) { 362 for (i = 0; i < ARRAY_SIZE(spec_order); i++) {
363 if (val & spec_order[i]) { 363 if (val & spec_order[i]) {
364 key.specified = spec_order[i] | enabled; 364 key.specified = spec_order[i] | enabled;
365 datum.data = le32_to_cpu(buf32[items++]); 365 datum.data = le32_to_cpu(buf32[items++]);
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index 0ac311dc8371..0111990ba837 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -103,7 +103,7 @@ static struct policydb_compat_info *policydb_lookup_compat(int version)
103 int i; 103 int i;
104 struct policydb_compat_info *info = NULL; 104 struct policydb_compat_info *info = NULL;
105 105
106 for (i = 0; i < sizeof(policydb_compat)/sizeof(*info); i++) { 106 for (i = 0; i < ARRAY_SIZE(policydb_compat); i++) {
107 if (policydb_compat[i].version == version) { 107 if (policydb_compat[i].version == version) {
108 info = &policydb_compat[i]; 108 info = &policydb_compat[i];
109 break; 109 break;
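
These three SELinux hunks switch open-coded sizeof divisions to ARRAY_SIZE(), the kernel.h macro sketched below; the sample array is illustrative:

#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))   /* as in kernel.h */

int main(void)
{
	unsigned short spec_order[] = { 1, 2, 4, 8, 16, 32 };  /* sample table */
	printf("%zu entries\n", ARRAY_SIZE(spec_order));       /* 6 */
	return 0;
}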
diff --git a/sound/oss/ad1848.c b/sound/oss/ad1848.c
index 3f30c57676c1..49796be955f3 100644
--- a/sound/oss/ad1848.c
+++ b/sound/oss/ad1848.c
@@ -46,8 +46,6 @@
46#include <linux/interrupt.h> 46#include <linux/interrupt.h>
47#include <linux/module.h> 47#include <linux/module.h>
48#include <linux/stddef.h> 48#include <linux/stddef.h>
49#include <linux/pm.h>
50#include <linux/pm_legacy.h>
51#include <linux/isapnp.h> 49#include <linux/isapnp.h>
52#include <linux/pnp.h> 50#include <linux/pnp.h>
53#include <linux/spinlock.h> 51#include <linux/spinlock.h>
@@ -105,9 +103,6 @@ typedef struct
105 int irq_ok; 103 int irq_ok;
106 mixer_ents *mix_devices; 104 mixer_ents *mix_devices;
107 int mixer_output_port; 105 int mixer_output_port;
108
109 /* Power management */
110 struct pm_dev *pmdev;
111} ad1848_info; 106} ad1848_info;
112 107
113typedef struct ad1848_port_info 108typedef struct ad1848_port_info
@@ -201,7 +196,6 @@ static void ad1848_halt(int dev);
201static void ad1848_halt_input(int dev); 196static void ad1848_halt_input(int dev);
202static void ad1848_halt_output(int dev); 197static void ad1848_halt_output(int dev);
203static void ad1848_trigger(int dev, int bits); 198static void ad1848_trigger(int dev, int bits);
204static int ad1848_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data);
205 199
206#ifndef EXCLUDE_TIMERS 200#ifndef EXCLUDE_TIMERS
207static int ad1848_tmr_install(int dev); 201static int ad1848_tmr_install(int dev);
@@ -2027,10 +2021,6 @@ int ad1848_init (char *name, struct resource *ports, int irq, int dma_playback,
2027 2021
2028 nr_ad1848_devs++; 2022 nr_ad1848_devs++;
2029 2023
2030 devc->pmdev = pm_register(PM_ISA_DEV, my_dev, ad1848_pm_callback);
2031 if (devc->pmdev)
2032 devc->pmdev->data = devc;
2033
2034 ad1848_init_hw(devc); 2024 ad1848_init_hw(devc);
2035 2025
2036 if (irq > 0) 2026 if (irq > 0)
@@ -2197,9 +2187,6 @@ void ad1848_unload(int io_base, int irq, int dma_playback, int dma_capture, int
2197 if(mixer>=0) 2187 if(mixer>=0)
2198 sound_unload_mixerdev(mixer); 2188 sound_unload_mixerdev(mixer);
2199 2189
2200 if (devc->pmdev)
2201 pm_unregister(devc->pmdev);
2202
2203 nr_ad1848_devs--; 2190 nr_ad1848_devs--;
2204 for ( ; i < nr_ad1848_devs ; i++) 2191 for ( ; i < nr_ad1848_devs ; i++)
2205 adev_info[i] = adev_info[i+1]; 2192 adev_info[i] = adev_info[i+1];
@@ -2811,85 +2798,6 @@ static int ad1848_tmr_install(int dev)
2811} 2798}
2812#endif /* EXCLUDE_TIMERS */ 2799#endif /* EXCLUDE_TIMERS */
2813 2800
2814static int ad1848_suspend(ad1848_info *devc)
2815{
2816 unsigned long flags;
2817
2818 spin_lock_irqsave(&devc->lock,flags);
2819
2820 ad_mute(devc);
2821
2822 spin_unlock_irqrestore(&devc->lock,flags);
2823 return 0;
2824}
2825
2826static int ad1848_resume(ad1848_info *devc)
2827{
2828 int mixer_levels[32], i;
2829
2830 /* Thinkpad is a bit more of PITA than normal. The BIOS tends to
2831 restore it in a different config to the one we use. Need to
2832 fix this somehow */
2833
2834 /* store old mixer levels */
2835 memcpy(mixer_levels, devc->levels, sizeof (mixer_levels));
2836 ad1848_init_hw(devc);
2837
2838 /* restore mixer levels */
2839 for (i = 0; i < 32; i++)
2840 ad1848_mixer_set(devc, devc->dev_no, mixer_levels[i]);
2841
2842 if (!devc->subtype) {
2843 static signed char interrupt_bits[12] = { -1, -1, -1, -1, -1, 0x00, -1, 0x08, -1, 0x10, 0x18, 0x20 };
2844 static char dma_bits[4] = { 1, 2, 0, 3 };
2845 unsigned long flags;
2846 signed char bits;
2847 char dma2_bit = 0;
2848
2849 int config_port = devc->base + 0;
2850
2851 bits = interrupt_bits[devc->irq];
2852 if (bits == -1) {
2853 printk(KERN_ERR "MSS: Bad IRQ %d\n", devc->irq);
2854 return -1;
2855 }
2856
2857 spin_lock_irqsave(&devc->lock,flags);
2858
2859 outb((bits | 0x40), config_port);
2860
2861 if (devc->dma2 != -1 && devc->dma2 != devc->dma1)
2862 if ( (devc->dma1 == 0 && devc->dma2 == 1) ||
2863 (devc->dma1 == 1 && devc->dma2 == 0) ||
2864 (devc->dma1 == 3 && devc->dma2 == 0))
2865 dma2_bit = 0x04;
2866
2867 outb((bits | dma_bits[devc->dma1] | dma2_bit), config_port);
2868 spin_unlock_irqrestore(&devc->lock,flags);
2869 }
2870
2871 return 0;
2872}
2873
2874static int ad1848_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
2875{
2876 ad1848_info *devc = dev->data;
2877 if (devc) {
2878 DEB(printk("ad1848: pm event received: 0x%x\n", rqst));
2879
2880 switch (rqst) {
2881 case PM_SUSPEND:
2882 ad1848_suspend(devc);
2883 break;
2884 case PM_RESUME:
2885 ad1848_resume(devc);
2886 break;
2887 }
2888 }
2889 return 0;
2890}
2891
2892
2893EXPORT_SYMBOL(ad1848_detect); 2801EXPORT_SYMBOL(ad1848_detect);
2894EXPORT_SYMBOL(ad1848_init); 2802EXPORT_SYMBOL(ad1848_init);
2895EXPORT_SYMBOL(ad1848_unload); 2803EXPORT_SYMBOL(ad1848_unload);
diff --git a/sound/oss/cs4281/cs4281m.c b/sound/oss/cs4281/cs4281m.c
index adc689649fe1..46dd41dc2a34 100644
--- a/sound/oss/cs4281/cs4281m.c
+++ b/sound/oss/cs4281/cs4281m.c
@@ -298,7 +298,6 @@ struct cs4281_state {
298 struct cs4281_pipeline pl[CS4281_NUMBER_OF_PIPELINES]; 298 struct cs4281_pipeline pl[CS4281_NUMBER_OF_PIPELINES];
299}; 299};
300 300
301#include <linux/pm_legacy.h>
302#include "cs4281pm-24.c" 301#include "cs4281pm-24.c"
303 302
304#if CSDEBUG 303#if CSDEBUG
@@ -4256,9 +4255,6 @@ static void __devinit cs4281_InitPM(struct cs4281_state *s)
4256static int __devinit cs4281_probe(struct pci_dev *pcidev, 4255static int __devinit cs4281_probe(struct pci_dev *pcidev,
4257 const struct pci_device_id *pciid) 4256 const struct pci_device_id *pciid)
4258{ 4257{
4259#ifndef NOT_CS4281_PM
4260 struct pm_dev *pmdev;
4261#endif
4262 struct cs4281_state *s; 4258 struct cs4281_state *s;
4263 dma_addr_t dma_mask; 4259 dma_addr_t dma_mask;
4264 mm_segment_t fs; 4260 mm_segment_t fs;
@@ -4374,19 +4370,7 @@ static int __devinit cs4281_probe(struct pci_dev *pcidev,
4374 } 4370 }
4375#ifndef NOT_CS4281_PM 4371#ifndef NOT_CS4281_PM
4376 cs4281_InitPM(s); 4372 cs4281_InitPM(s);
4377 pmdev = cs_pm_register(PM_PCI_DEV, PM_PCI_ID(pcidev), cs4281_pm_callback); 4373 s->pm.flags |= CS4281_PM_NOT_REGISTERED;
4378 if (pmdev)
4379 {
4380 CS_DBGOUT(CS_INIT | CS_PM, 4, printk(KERN_INFO
4381 "cs4281: probe() pm_register() succeeded (%p).\n", pmdev));
4382 pmdev->data = s;
4383 }
4384 else
4385 {
4386 CS_DBGOUT(CS_INIT | CS_PM | CS_ERROR, 0, printk(KERN_INFO
4387 "cs4281: probe() pm_register() failed (%p).\n", pmdev));
4388 s->pm.flags |= CS4281_PM_NOT_REGISTERED;
4389 }
4390#endif 4374#endif
4391 4375
4392 pci_set_master(pcidev); // enable bus mastering 4376 pci_set_master(pcidev); // enable bus mastering
@@ -4487,9 +4471,6 @@ static int __init cs4281_init_module(void)
4487static void __exit cs4281_cleanup_module(void) 4471static void __exit cs4281_cleanup_module(void)
4488{ 4472{
4489 pci_unregister_driver(&cs4281_pci_driver); 4473 pci_unregister_driver(&cs4281_pci_driver);
4490#ifndef NOT_CS4281_PM
4491 cs_pm_unregister_all(cs4281_pm_callback);
4492#endif
4493 CS_DBGOUT(CS_INIT | CS_FUNCTION, 2, 4474 CS_DBGOUT(CS_INIT | CS_FUNCTION, 2,
4494 printk(KERN_INFO "cs4281: cleanup_cs4281() finished\n")); 4475 printk(KERN_INFO "cs4281: cleanup_cs4281() finished\n"));
4495} 4476}
diff --git a/sound/oss/cs4281/cs4281pm-24.c b/sound/oss/cs4281/cs4281pm-24.c
index d2a453aff0aa..90cbd7679534 100644
--- a/sound/oss/cs4281/cs4281pm-24.c
+++ b/sound/oss/cs4281/cs4281pm-24.c
@@ -27,9 +27,6 @@
27#ifndef NOT_CS4281_PM 27#ifndef NOT_CS4281_PM
28#include <linux/pm.h> 28#include <linux/pm.h>
29 29
30#define cs_pm_register(a, b, c) pm_register((a), (b), (c));
31#define cs_pm_unregister_all(a) pm_unregister_all((a));
32
33static int cs4281_suspend(struct cs4281_state *s); 30static int cs4281_suspend(struct cs4281_state *s);
34static int cs4281_resume(struct cs4281_state *s); 31static int cs4281_resume(struct cs4281_state *s);
35/* 32/*
@@ -41,42 +38,6 @@ static int cs4281_resume(struct cs4281_state *s);
41#define CS4281_SUSPEND_TBL cs4281_suspend_null 38#define CS4281_SUSPEND_TBL cs4281_suspend_null
42#define CS4281_RESUME_TBL cs4281_resume_null 39#define CS4281_RESUME_TBL cs4281_resume_null
43 40
44static int cs4281_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
45{
46 struct cs4281_state *state;
47
48 CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
49 "cs4281: cs4281_pm_callback dev=%p rqst=0x%x state=%p\n",
50 dev,(unsigned)rqst,data));
51 state = (struct cs4281_state *) dev->data;
52 if (state) {
53 switch(rqst) {
54 case PM_SUSPEND:
55 CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
56 "cs4281: PM suspend request\n"));
57 if(cs4281_suspend(state))
58 {
59 CS_DBGOUT(CS_ERROR, 2, printk(KERN_INFO
60 "cs4281: PM suspend request refused\n"));
61 return 1;
62 }
63 break;
64 case PM_RESUME:
65 CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
66 "cs4281: PM resume request\n"));
67 if(cs4281_resume(state))
68 {
69 CS_DBGOUT(CS_ERROR, 2, printk(KERN_INFO
70 "cs4281: PM resume request refused\n"));
71 return 1;
72 }
73 break;
74 }
75 }
76
77 return 0;
78}
79
80#else /* CS4281_PM */ 41#else /* CS4281_PM */
81#define CS4281_SUSPEND_TBL cs4281_suspend_null 42#define CS4281_SUSPEND_TBL cs4281_suspend_null
82#define CS4281_RESUME_TBL cs4281_resume_null 43#define CS4281_RESUME_TBL cs4281_resume_null
diff --git a/sound/oss/cs46xx.c b/sound/oss/cs46xx.c
index cb998e8c0fdd..0da4d93f04a6 100644
--- a/sound/oss/cs46xx.c
+++ b/sound/oss/cs46xx.c
@@ -391,10 +391,6 @@ static void cs461x_clear_serial_FIFOs(struct cs_card *card, int type);
391static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state); 391static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state);
392static int cs46xx_resume_tbl(struct pci_dev *pcidev); 392static int cs46xx_resume_tbl(struct pci_dev *pcidev);
393 393
394#ifndef CS46XX_ACPI_SUPPORT
395static int cs46xx_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data);
396#endif
397
398#if CSDEBUG 394#if CSDEBUG
399 395
400/* DEBUG ROUTINES */ 396/* DEBUG ROUTINES */
@@ -5320,7 +5316,6 @@ static const char fndmsg[] = KERN_INFO "cs46xx: Found %d audio device(s).\n";
5320static int __devinit cs46xx_probe(struct pci_dev *pci_dev, 5316static int __devinit cs46xx_probe(struct pci_dev *pci_dev,
5321 const struct pci_device_id *pciid) 5317 const struct pci_device_id *pciid)
5322{ 5318{
5323 struct pm_dev *pmdev;
5324 int i,j; 5319 int i,j;
5325 u16 ss_card, ss_vendor; 5320 u16 ss_card, ss_vendor;
5326 struct cs_card *card; 5321 struct cs_card *card;
@@ -5530,22 +5525,6 @@ static int __devinit cs46xx_probe(struct pci_dev *pci_dev,
5530 PCI_SET_DMA_MASK(pci_dev, dma_mask); 5525 PCI_SET_DMA_MASK(pci_dev, dma_mask);
5531 list_add(&card->list, &cs46xx_devs); 5526 list_add(&card->list, &cs46xx_devs);
5532 5527
5533 pmdev = cs_pm_register(PM_PCI_DEV, PM_PCI_ID(pci_dev), cs46xx_pm_callback);
5534 if (pmdev)
5535 {
5536 CS_DBGOUT(CS_INIT | CS_PM, 4, printk(KERN_INFO
5537 "cs46xx: probe() pm_register() succeeded (%p).\n",
5538 pmdev));
5539 pmdev->data = card;
5540 }
5541 else
5542 {
5543 CS_DBGOUT(CS_INIT | CS_PM | CS_ERROR, 2, printk(KERN_INFO
5544 "cs46xx: probe() pm_register() failed (%p).\n",
5545 pmdev));
5546 card->pm.flags |= CS46XX_PM_NOT_REGISTERED;
5547 }
5548
5549 CS_DBGOUT(CS_PM, 9, printk(KERN_INFO "cs46xx: pm.flags=0x%x card=%p\n", 5528 CS_DBGOUT(CS_PM, 9, printk(KERN_INFO "cs46xx: pm.flags=0x%x card=%p\n",
5550 (unsigned)card->pm.flags,card)); 5529 (unsigned)card->pm.flags,card));
5551 5530
@@ -5727,7 +5706,6 @@ static int __init cs46xx_init_module(void)
5727static void __exit cs46xx_cleanup_module(void) 5706static void __exit cs46xx_cleanup_module(void)
5728{ 5707{
5729 pci_unregister_driver(&cs46xx_pci_driver); 5708 pci_unregister_driver(&cs46xx_pci_driver);
5730 cs_pm_unregister_all(cs46xx_pm_callback);
5731 CS_DBGOUT(CS_INIT | CS_FUNCTION, 2, 5709 CS_DBGOUT(CS_INIT | CS_FUNCTION, 2,
5732 printk(KERN_INFO "cs46xx: cleanup_cs46xx() finished\n")); 5710 printk(KERN_INFO "cs46xx: cleanup_cs46xx() finished\n"));
5733} 5711}
@@ -5735,44 +5713,6 @@ static void __exit cs46xx_cleanup_module(void)
5735module_init(cs46xx_init_module); 5713module_init(cs46xx_init_module);
5736module_exit(cs46xx_cleanup_module); 5714module_exit(cs46xx_cleanup_module);
5737 5715
5738#ifndef CS46XX_ACPI_SUPPORT
5739static int cs46xx_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
5740{
5741 struct cs_card *card;
5742
5743 CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
5744 "cs46xx: cs46xx_pm_callback dev=%p rqst=0x%x card=%p\n",
5745 dev,(unsigned)rqst,data));
5746 card = (struct cs_card *) dev->data;
5747 if (card) {
5748 switch(rqst) {
5749 case PM_SUSPEND:
5750 CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
5751 "cs46xx: PM suspend request\n"));
5752 if(cs46xx_suspend(card, PMSG_SUSPEND))
5753 {
5754 CS_DBGOUT(CS_ERROR, 2, printk(KERN_INFO
5755 "cs46xx: PM suspend request refused\n"));
5756 return 1;
5757 }
5758 break;
5759 case PM_RESUME:
5760 CS_DBGOUT(CS_PM, 2, printk(KERN_INFO
5761 "cs46xx: PM resume request\n"));
5762 if(cs46xx_resume(card))
5763 {
5764 CS_DBGOUT(CS_ERROR, 2, printk(KERN_INFO
5765 "cs46xx: PM resume request refused\n"));
5766 return 1;
5767 }
5768 break;
5769 }
5770 }
5771
5772 return 0;
5773}
5774#endif
5775
5776#if CS46XX_ACPI_SUPPORT 5716#if CS46XX_ACPI_SUPPORT
5777static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state) 5717static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state)
5778{ 5718{
diff --git a/sound/oss/cs46xxpm-24.h b/sound/oss/cs46xxpm-24.h
index e220bd7240f1..ad82db84d013 100644
--- a/sound/oss/cs46xxpm-24.h
+++ b/sound/oss/cs46xxpm-24.h
@@ -38,13 +38,9 @@
38*/ 38*/
39static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state); 39static int cs46xx_suspend_tbl(struct pci_dev *pcidev, pm_message_t state);
40static int cs46xx_resume_tbl(struct pci_dev *pcidev); 40static int cs46xx_resume_tbl(struct pci_dev *pcidev);
41#define cs_pm_register(a, b, c) NULL
42#define cs_pm_unregister_all(a)
43#define CS46XX_SUSPEND_TBL cs46xx_suspend_tbl 41#define CS46XX_SUSPEND_TBL cs46xx_suspend_tbl
44#define CS46XX_RESUME_TBL cs46xx_resume_tbl 42#define CS46XX_RESUME_TBL cs46xx_resume_tbl
45#else 43#else
46#define cs_pm_register(a, b, c) pm_register((a), (b), (c));
47#define cs_pm_unregister_all(a) pm_unregister_all((a));
48#define CS46XX_SUSPEND_TBL cs46xx_null 44#define CS46XX_SUSPEND_TBL cs46xx_null
49#define CS46XX_RESUME_TBL cs46xx_null 45#define CS46XX_RESUME_TBL cs46xx_null
50#endif 46#endif
diff --git a/sound/oss/maestro.c b/sound/oss/maestro.c
index 3abd3541cbc7..f9ac5b16f61a 100644
--- a/sound/oss/maestro.c
+++ b/sound/oss/maestro.c
@@ -230,10 +230,6 @@
230#include <asm/page.h> 230#include <asm/page.h>
231#include <asm/uaccess.h> 231#include <asm/uaccess.h>
232 232
233#include <linux/pm.h>
234#include <linux/pm_legacy.h>
235static int maestro_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *d);
236
237#include "maestro.h" 233#include "maestro.h"
238 234
239static struct pci_driver maestro_pci_driver; 235static struct pci_driver maestro_pci_driver;
@@ -3404,7 +3400,6 @@ maestro_probe(struct pci_dev *pcidev,const struct pci_device_id *pdid)
3404 int i, ret; 3400 int i, ret;
3405 struct ess_card *card; 3401 struct ess_card *card;
3406 struct ess_state *ess; 3402 struct ess_state *ess;
3407 struct pm_dev *pmdev;
3408 int num = 0; 3403 int num = 0;
3409 3404
3410/* when built into the kernel, we only print version if device is found */ 3405/* when built into the kernel, we only print version if device is found */
@@ -3450,11 +3445,6 @@ maestro_probe(struct pci_dev *pcidev,const struct pci_device_id *pdid)
3450 memset(card, 0, sizeof(*card)); 3445 memset(card, 0, sizeof(*card));
3451 card->pcidev = pcidev; 3446 card->pcidev = pcidev;
3452 3447
3453 pmdev = pm_register(PM_PCI_DEV, PM_PCI_ID(pcidev),
3454 maestro_pm_callback);
3455 if (pmdev)
3456 pmdev->data = card;
3457
3458 card->iobase = iobase; 3448 card->iobase = iobase;
3459 card->card_type = card_type; 3449 card->card_type = card_type;
3460 card->irq = pcidev->irq; 3450 card->irq = pcidev->irq;
@@ -3670,7 +3660,6 @@ static int maestro_notifier(struct notifier_block *nb, unsigned long event, void
3670static void cleanup_maestro(void) { 3660static void cleanup_maestro(void) {
3671 M_printk("maestro: unloading\n"); 3661 M_printk("maestro: unloading\n");
3672 pci_unregister_driver(&maestro_pci_driver); 3662 pci_unregister_driver(&maestro_pci_driver);
3673 pm_unregister_all(maestro_pm_callback);
3674 unregister_reboot_notifier(&maestro_nb); 3663 unregister_reboot_notifier(&maestro_nb);
3675} 3664}
3676 3665
@@ -3691,143 +3680,5 @@ check_suspend(struct ess_card *card)
3691 current->state = TASK_RUNNING; 3680 current->state = TASK_RUNNING;
3692} 3681}
3693 3682
3694static int
3695maestro_suspend(struct ess_card *card)
3696{
3697 unsigned long flags;
3698 int i,j;
3699
3700 spin_lock_irqsave(&card->lock,flags); /* over-kill */
3701
3702 M_printk("maestro: apm in dev %p\n",card);
3703
3704 /* we have to read from the apu regs, need
3705 to power it up */
3706 maestro_power(card,ACPI_D0);
3707
3708 for(i=0;i<NR_DSPS;i++) {
3709 struct ess_state *s = &card->channels[i];
3710
3711 if(s->dev_audio == -1)
3712 continue;
3713
3714 M_printk("maestro: stopping apus for device %d\n",i);
3715 stop_dac(s);
3716 stop_adc(s);
3717 for(j=0;j<6;j++)
3718 card->apu_map[s->apu[j]][5]=apu_get_register(s,j,5);
3719
3720 }
3721
3722 /* get rid of interrupts? */
3723 if( card->dsps_open > 0)
3724 stop_bob(&card->channels[0]);
3725
3726 card->in_suspend++;
3727
3728 spin_unlock_irqrestore(&card->lock,flags);
3729
3730 /* we trust in the bios to power down the chip on suspend.
3731 * XXX I'm also not sure that in_suspend will protect
3732 * against all reg accesses from here on out.
3733 */
3734 return 0;
3735}
3736static int
3737maestro_resume(struct ess_card *card)
3738{
3739 unsigned long flags;
3740 int i;
3741
3742 spin_lock_irqsave(&card->lock,flags); /* over-kill */
3743
3744 card->in_suspend = 0;
3745
3746 M_printk("maestro: resuming card at %p\n",card);
3747
3748 /* restore all our config */
3749 maestro_config(card);
3750 /* need to restore the base pointers.. */
3751 if(card->dmapages)
3752 set_base_registers(&card->channels[0],card->dmapages);
3753
3754 mixer_push_state(card);
3755
3756 /* set each channels' apu control registers before
3757 * restoring audio
3758 */
3759 for(i=0;i<NR_DSPS;i++) {
3760 struct ess_state *s = &card->channels[i];
3761 int chan,reg;
3762
3763 if(s->dev_audio == -1)
3764 continue;
3765
3766 for(chan = 0 ; chan < 6 ; chan++) {
3767 wave_set_register(s,s->apu[chan]<<3,s->apu_base[chan]);
3768 for(reg = 1 ; reg < NR_APU_REGS ; reg++)
3769 apu_set_register(s,chan,reg,s->card->apu_map[s->apu[chan]][reg]);
3770 }
3771 for(chan = 0 ; chan < 6 ; chan++)
3772 apu_set_register(s,chan,0,s->card->apu_map[s->apu[chan]][0] & 0xFF0F);
3773 }
3774
3775 /* now we flip on the music */
3776
3777 if( card->dsps_open <= 0) {
3778 /* this card's idle */
3779 maestro_power(card,ACPI_D2);
3780 } else {
3781 /* ok, we're actually playing things on
3782 this card */
3783 maestro_power(card,ACPI_D0);
3784 start_bob(&card->channels[0]);
3785 for(i=0;i<NR_DSPS;i++) {
3786 struct ess_state *s = &card->channels[i];
3787
3788 /* these use the apu_mode, and can handle
3789 spurious calls */
3790 start_dac(s);
3791 start_adc(s);
3792 }
3793 }
3794
3795 spin_unlock_irqrestore(&card->lock,flags);
3796
3797 /* all right, we think things are ready,
3798 wake up people who were using the device
3799 when we suspended */
3800 wake_up(&(card->suspend_queue));
3801
3802 return 0;
3803}
3804
3805int
3806maestro_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
3807{
3808 struct ess_card *card = (struct ess_card*) dev->data;
3809
3810 if ( ! card ) goto out;
3811
3812 M_printk("maestro: pm event 0x%x received for card %p\n", rqst, card);
3813
3814 switch (rqst) {
3815 case PM_SUSPEND:
3816 maestro_suspend(card);
3817 break;
3818 case PM_RESUME:
3819 maestro_resume(card);
3820 break;
3821 /*
3822 * we'd also like to find out about
3823 * power level changes because some biosen
3824 * do mean things to the maestro when they
3825 * change their power state.
3826 */
3827 }
3828out:
3829 return 0;
3830}
3831
3832module_init(init_maestro); 3683module_init(init_maestro);
3833module_exit(cleanup_maestro); 3684module_exit(cleanup_maestro);
diff --git a/sound/oss/nm256_audio.c b/sound/oss/nm256_audio.c
index 0ce2c404a730..42d8f05689c2 100644
--- a/sound/oss/nm256_audio.c
+++ b/sound/oss/nm256_audio.c
@@ -24,8 +24,6 @@
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/pm.h>
28#include <linux/pm_legacy.h>
29#include <linux/delay.h> 27#include <linux/delay.h>
30#include <linux/spinlock.h> 28#include <linux/spinlock.h>
31#include "sound_config.h" 29#include "sound_config.h"
@@ -49,7 +47,6 @@ static int nm256_grabInterrupt (struct nm256_info *card);
49static int nm256_releaseInterrupt (struct nm256_info *card); 47static int nm256_releaseInterrupt (struct nm256_info *card);
50static irqreturn_t nm256_interrupt (int irq, void *dev_id, struct pt_regs *dummy); 48static irqreturn_t nm256_interrupt (int irq, void *dev_id, struct pt_regs *dummy);
51static irqreturn_t nm256_interrupt_zx (int irq, void *dev_id, struct pt_regs *dummy); 49static irqreturn_t nm256_interrupt_zx (int irq, void *dev_id, struct pt_regs *dummy);
52static int handle_pm_event (struct pm_dev *dev, pm_request_t rqst, void *data);
53 50
54/* These belong in linux/pci.h. */ 51/* These belong in linux/pci.h. */
55#define PCI_DEVICE_ID_NEOMAGIC_NM256AV_AUDIO 0x8005 52#define PCI_DEVICE_ID_NEOMAGIC_NM256AV_AUDIO 0x8005
@@ -992,15 +989,6 @@ nm256_install_mixer (struct nm256_info *card)
     return 0;
 }
 
-/* Perform a full reset on the hardware; this is invoked when an APM
-   resume event occurs. */
-static void
-nm256_full_reset (struct nm256_info *card)
-{
-    nm256_initHw (card);
-    ac97_reset (&(card->mdev));
-}
-
 /*
  * See if the signature left by the NM256 BIOS is intact; if so, we use
  * the associated address as the end of our audio buffer in the video
@@ -1053,7 +1041,6 @@ static int __devinit
 nm256_install(struct pci_dev *pcidev, enum nm256rev rev, char *verstr)
 {
     struct nm256_info *card;
-    struct pm_dev *pmdev;
     int x;
 
     if (pci_enable_device(pcidev))
@@ -1234,43 +1221,10 @@ nm256_install(struct pci_dev *pcidev, enum nm256rev rev, char *verstr)
 
     nm256_install_mixer (card);
 
-    pmdev = pm_register(PM_PCI_DEV, PM_PCI_ID(pcidev), handle_pm_event);
-    if (pmdev)
-        pmdev->data = card;
-
     return 1;
 }
 
 
-/*
- * PM event handler, so the card is properly reinitialized after a power
- * event.
- */
-static int
-handle_pm_event (struct pm_dev *dev, pm_request_t rqst, void *data)
-{
-    struct nm256_info *crd = (struct nm256_info*) dev->data;
-    if (crd) {
-        switch (rqst) {
-        case PM_SUSPEND:
-            break;
-        case PM_RESUME:
-            {
-                int playing = crd->playing;
-                nm256_full_reset (crd);
-                /*
-                 * A little ugly, but that's ok; pretend the
-                 * block we were playing is done.
-                 */
-                if (playing)
-                    DMAbuf_outputintr (crd->dev_for_play, 1);
-            }
-            break;
-        }
-    }
-    return 0;
-}
-
 static int __devinit
 nm256_probe(struct pci_dev *pcidev,const struct pci_device_id *pciid)
 {
@@ -1696,7 +1650,6 @@ static int __init do_init_nm256(void)
 static void __exit cleanup_nm256 (void)
 {
     pci_unregister_driver(&nm256_pci_driver);
-    pm_unregister_all (&handle_pm_event);
 }
 
 module_init(do_init_nm256);
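With handle_pm_event() gone, a PCI driver that still wants power management would use the driver-model hooks in struct pci_driver instead. A rough sketch under stated assumptions: the .suspend/.resume fields are real pci_driver members in this kernel generation, but the nm256_pci_* function names are hypothetical, and the reinitialize-on-resume step stands in for what nm256_full_reset() used to do:

	static int nm256_pci_suspend(struct pci_dev *pcidev, pm_message_t state)
	{
		/* quiesce the device; hardware state is rebuilt on resume */
		pci_save_state(pcidev);
		pci_set_power_state(pcidev, PCI_D3hot);
		return 0;
	}

	static int nm256_pci_resume(struct pci_dev *pcidev)
	{
		pci_set_power_state(pcidev, PCI_D0);
		pci_restore_state(pcidev);
		/* reinitialize the chip and AC'97 codec here */
		return 0;
	}

	static struct pci_driver nm256_pci_driver = {
		.name		= "nm256_audio",
		.probe		= nm256_probe,
		.suspend	= nm256_pci_suspend,
		.resume		= nm256_pci_resume,
		/* .id_table, .remove etc. as in the existing driver */
	};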
diff --git a/sound/oss/opl3sa2.c b/sound/oss/opl3sa2.c
index cd41d0e4706a..5cecdbcbea9d 100644
--- a/sound/oss/opl3sa2.c
+++ b/sound/oss/opl3sa2.c
@@ -69,8 +69,6 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/pm.h>
-#include <linux/pm_legacy.h>
 #include "sound_config.h"
 
 #include "ad1848.h"
@@ -139,10 +137,6 @@ typedef struct {
 	struct pnp_dev* pdev;
 	int activated;		/* Whether said devices have been activated */
 #endif
-#ifdef CONFIG_PM_LEGACY
-	unsigned int in_suspend;
-	struct pm_dev *pmdev;
-#endif
 	unsigned int card;
 	int chipset;		/* What's my version(s)? */
 	char *chipset_name;
@@ -341,22 +335,6 @@ static void opl3sa2_mixer_reset(opl3sa2_state_t* devc)
 	}
 }
 
-/* Currently only used for power management */
-#ifdef CONFIG_PM_LEGACY
-static void opl3sa2_mixer_restore(opl3sa2_state_t* devc)
-{
-	if (devc) {
-		opl3sa2_set_volume(devc, devc->volume_l, devc->volume_r);
-		opl3sa2_set_mic(devc, devc->mic);
-
-		if (devc->chipset == CHIPSET_OPL3SA3) {
-			opl3sa3_set_bass(devc, devc->bass_l, devc->bass_r);
-			opl3sa3_set_treble(devc, devc->treble_l, devc->treble_r);
-		}
-	}
-}
-#endif /* CONFIG_PM_LEGACY */
-
 static inline void arg_to_vol_mono(unsigned int vol, int* value)
 {
 	int left;
@@ -832,84 +810,6 @@ static struct pnp_driver opl3sa2_driver = {
 
 /* End of component functions */
 
-#ifdef CONFIG_PM_LEGACY
-
-static DEFINE_SPINLOCK(opl3sa2_lock);
-
-/* Power Management support functions */
-static int opl3sa2_suspend(struct pm_dev *pdev, unsigned int pm_mode)
-{
-	unsigned long flags;
-	opl3sa2_state_t *p;
-
-	if (!pdev)
-		return -EINVAL;
-
-	spin_lock_irqsave(&opl3sa2_lock,flags);
-
-	p = (opl3sa2_state_t *) pdev->data;
-	switch (pm_mode) {
-	case 1:
-		pm_mode = OPL3SA2_PM_MODE1;
-		break;
-	case 2:
-		pm_mode = OPL3SA2_PM_MODE2;
-		break;
-	case 3:
-		pm_mode = OPL3SA2_PM_MODE3;
-		break;
-	default:
-		/* we don't know howto handle this... */
-		spin_unlock_irqrestore(&opl3sa2_lock, flags);
-		return -EBUSY;
-	}
-
-	p->in_suspend = 1;
-
-	/* its supposed to automute before suspending, so we won't bother */
-	opl3sa2_write(p->cfg_port, OPL3SA2_PM, pm_mode);
-	/* wait a while for the clock oscillator to stabilise */
-	mdelay(10);
-
-	spin_unlock_irqrestore(&opl3sa2_lock,flags);
-	return 0;
-}
-
-static int opl3sa2_resume(struct pm_dev *pdev)
-{
-	unsigned long flags;
-	opl3sa2_state_t *p;
-
-	if (!pdev)
-		return -EINVAL;
-
-	p = (opl3sa2_state_t *) pdev->data;
-	spin_lock_irqsave(&opl3sa2_lock,flags);
-
-	/* I don't think this is necessary */
-	opl3sa2_write(p->cfg_port, OPL3SA2_PM, OPL3SA2_PM_MODE0);
-	opl3sa2_mixer_restore(p);
-	p->in_suspend = 0;
-
-	spin_unlock_irqrestore(&opl3sa2_lock,flags);
-	return 0;
-}
-
-static int opl3sa2_pm_callback(struct pm_dev *pdev, pm_request_t rqst, void *data)
-{
-	unsigned long mode = (unsigned long)data;
-
-	switch (rqst) {
-	case PM_SUSPEND:
-		return opl3sa2_suspend(pdev, mode);
-
-	case PM_RESUME:
-		return opl3sa2_resume(pdev);
-	}
-	return 0;
-}
-#endif /* CONFIG_PM_LEGACY */
-
 /*
  * Install OPL3-SA2 based card(s).
  *
@@ -1021,12 +921,6 @@ static int __init init_opl3sa2(void)
 
 	/* ewww =) */
 	opl3sa2_state[card].card = card;
-#ifdef CONFIG_PM_LEGACY
-	/* register our power management capabilities */
-	opl3sa2_state[card].pmdev = pm_register(PM_ISA_DEV, card, opl3sa2_pm_callback);
-	if (opl3sa2_state[card].pmdev)
-		opl3sa2_state[card].pmdev->data = &opl3sa2_state[card];
-#endif /* CONFIG_PM_LEGACY */
 
 	/*
 	 * Set the Yamaha 3D enhancement mode (aka Ymersion) if asked to and
@@ -1083,10 +977,6 @@ static void __exit cleanup_opl3sa2(void)
 	int card;
 
 	for(card = 0; card < opl3sa2_cards_num; card++) {
-#ifdef CONFIG_PM_LEGACY
-		if (opl3sa2_state[card].pmdev)
-			pm_unregister(opl3sa2_state[card].pmdev);
-#endif
 		if (opl3sa2_state[card].cfg_mpu.slots[1] != -1) {
 			unload_opl3sa2_mpu(&opl3sa2_state[card].cfg_mpu);
 		}