From 0dee3c28af2fbe22ca62739a7f57da5435d35793 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:06 -0700
Subject: ceph: on-wire types

These headers describe the types used to exchange messages between the
Ceph client and various servers.  All types are little-endian and
packed.  These headers are shared between the kernel and userspace, so
all types are in terms of e.g. __u32.

Additionally, we define a few magic values to identify the current
version of the protocol(s) in use, so that discrepancies to be
detected on mount.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.c      |  80 +++++++
 fs/ceph/ceph_fs.h      | 629 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/ceph_strings.c | 163 +++++++++++++
 fs/ceph/msgr.h         | 157 ++++++++++++
 fs/ceph/rados.h        | 372 +++++++++++++++++++++++++++++
 5 files changed, 1401 insertions(+)
 create mode 100644 fs/ceph/ceph_fs.c
 create mode 100644 fs/ceph/ceph_fs.h
 create mode 100644 fs/ceph/ceph_strings.c
 create mode 100644 fs/ceph/msgr.h
 create mode 100644 fs/ceph/rados.h

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..9371ff1c0002
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,80 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include "types.h"
+
+int ceph_flags_to_mode(int flags)
+{
+#ifdef O_DIRECTORY  /* fixme */
+	if ((flags & O_DIRECTORY) == O_DIRECTORY)
+		return CEPH_FILE_MODE_PIN;
+#endif
+#ifdef O_LAZY
+	if (flags & O_LAZY)
+		return CEPH_FILE_MODE_LAZY;
+#endif
+	if ((flags & O_APPEND) == O_APPEND)
+		flags |= O_WRONLY;
+
+	flags &= O_ACCMODE;
+	if ((flags & O_RDWR) == O_RDWR)
+		return CEPH_FILE_MODE_RDWR;
+	if ((flags & O_WRONLY) == O_WRONLY)
+		return CEPH_FILE_MODE_WR;
+	return CEPH_FILE_MODE_RD;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+	switch (mode) {
+	case CEPH_FILE_MODE_PIN:
+		return CEPH_CAP_PIN;
+	case CEPH_FILE_MODE_RD:
+		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+	case CEPH_FILE_MODE_RDWR:
+		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_EXCL |
+			CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
+			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	case CEPH_FILE_MODE_WR:
+		return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+			CEPH_CAP_FILE_EXCL |
+			CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+			CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+			CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+	}
+	return 0;
+}
+
+/* Name hashing routines. Initial hash value */
+/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
+#define ceph_init_name_hash()		0
+
+/* partial hash update function. Assume roughly 4 bits per character */
+static unsigned long ceph_partial_name_hash(unsigned long c,
+					    unsigned long prevhash)
+{
+	return (prevhash + (c << 4) + (c >> 4)) * 11;
+}
+
+/*
+ * Finally: cut down the number of bits to a int value (and try to avoid
+ * losing bits)
+ */
+static unsigned long ceph_end_name_hash(unsigned long hash)
+{
+	return hash & 0xffffffff;
+}
+
+/* Compute the hash for a name string. */
+unsigned int ceph_full_name_hash(const char *name, unsigned int len)
+{
+	unsigned long hash = ceph_init_name_hash();
+	while (len--)
+		hash = ceph_partial_name_hash(*name++, hash);
+	return ceph_end_name_hash(hash);
+}
+
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..21ed51b127f2
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,629 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef _FS_CEPH_CEPH_FS_H
+#define _FS_CEPH_CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * Ceph release version
+ */
+#define CEPH_VERSION_MAJOR 0
+#define CEPH_VERSION_MINOR 16
+#define CEPH_VERSION_PATCH 1
+
+#define _CEPH_STRINGIFY(x) #x
+#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
+#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
+	"." CEPH_STRINGIFY(z)
+#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
+				       CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
+
+/*
+ * subprotocol versions.  when specific messages types or high-level
+ * protocols change, bump the affected components.  we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSD_PROTOCOL     7 /* cluster internal */
+#define CEPH_MDS_PROTOCOL     9 /* cluster internal */
+#define CEPH_MON_PROTOCOL     4 /* cluster internal */
+#define CEPH_OSDC_PROTOCOL   20 /* server/client */
+#define CEPH_MDSC_PROTOCOL   29 /* server/client */
+#define CEPH_MONC_PROTOCOL   14 /* server/client */
+
+
+#define CEPH_INO_ROOT  1
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON   31
+
+
+unsigned int ceph_full_name_hash(const char *name, unsigned int len);
+
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+	/* file -> object mapping */
+	__le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
+				      of page size. */
+	__le32 fl_stripe_count;    /* over this many objects */
+	__le32 fl_object_size;     /* until objects are this big, then move to
+				      new objects */
+	__le32 fl_cas_hash;        /* 0 = none; 1 = sha256 */
+
+	/* pg -> disk layout */
+	__le32 fl_object_stripe_unit;  /* for per-object parity, if any */
+
+	/* object -> pg layout */
+	__le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN               1
+#define CEPH_MSG_PING                   2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP                4
+#define CEPH_MSG_MON_GET_MAP            5
+#define CEPH_MSG_CLIENT_MOUNT           10
+#define CEPH_MSG_CLIENT_MOUNT_ACK       11
+#define CEPH_MSG_STATFS                 13
+#define CEPH_MSG_STATFS_REPLY           14
+#define CEPH_MSG_MON_SUBSCRIBE          15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK      16
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_GETMAP             20
+#define CEPH_MSG_MDS_MAP                21
+
+#define CEPH_MSG_CLIENT_SESSION         22
+#define CEPH_MSG_CLIENT_RECONNECT       23
+
+#define CEPH_MSG_CLIENT_REQUEST         24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_CAPS            0x310
+#define CEPH_MSG_CLIENT_LEASE           0x311
+#define CEPH_MSG_CLIENT_SNAP            0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE      0x313
+
+/* osd */
+#define CEPH_MSG_OSD_GETMAP       40
+#define CEPH_MSG_OSD_MAP          41
+#define CEPH_MSG_OSD_OP           42
+#define CEPH_MSG_OSD_OPREPLY      43
+
+
+struct ceph_mon_statfs {
+	__le64 have_version;
+	struct ceph_fsid fsid;
+	__le64 tid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+	__le64 kb, kb_used, kb_avail;
+	__le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+	struct ceph_fsid fsid;
+	__le64 tid;
+	__le64 version;
+	struct ceph_statfs st;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+	__le64 have_version;
+	struct ceph_fsid fsid;
+	__le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+	__le64 have_version;
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+	__le64 have_version;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_item {
+	__le64 have;
+	__u8 onetime;
+} __attribute__ ((packed));
+
+/*
+ * mds states
+ *   > 0 -> in
+ *  <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE          0  /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED     -1  /* down, once existed, but no subtrees.
+					  empty log. */
+#define CEPH_MDS_STATE_BOOT        -4  /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY     -5  /* up, idle.  waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING    -6  /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING    -7  /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY       8  /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE      9  /* up, disambiguating distributed
+					  operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT    10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN       11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE       13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING     14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ *  - these are bitmasks.. we can compose them
+ *  - they also define the lock ordering by the MDS
+ *  - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DN          1
+#define CEPH_LOCK_ISNAP       2
+#define CEPH_LOCK_IVERSION    4     /* mds internal */
+#define CEPH_LOCK_IFILE       8     /* mds internal */
+#define CEPH_LOCK_IAUTH       32
+#define CEPH_LOCK_ILINK       64
+#define CEPH_LOCK_IDFT        128   /* dir frag tree */
+#define CEPH_LOCK_INEST       256   /* mds internal */
+#define CEPH_LOCK_IXATTR      512
+#define CEPH_LOCK_INO         2048  /* immutable inode bits; not a lock */
+
+/* client_session ops */
+enum {
+	CEPH_SESSION_REQUEST_OPEN,
+	CEPH_SESSION_OPEN,
+	CEPH_SESSION_REQUEST_CLOSE,
+	CEPH_SESSION_CLOSE,
+	CEPH_SESSION_REQUEST_RENEWCAPS,
+	CEPH_SESSION_RENEWCAPS,
+	CEPH_SESSION_STALE,
+	CEPH_SESSION_RECALL_STATE,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+	__le32 op;
+	__le64 seq;
+	struct ceph_timespec stamp;
+	__le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ *  & 0x001000 -> write op
+ *  & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ &  & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE        0x001000
+enum {
+	CEPH_MDS_OP_LOOKUP     = 0x00100,
+	CEPH_MDS_OP_GETATTR    = 0x00101,
+	CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+	CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+
+	CEPH_MDS_OP_SETXATTR   = 0x01105,
+	CEPH_MDS_OP_RMXATTR    = 0x01106,
+	CEPH_MDS_OP_SETLAYOUT  = 0x01107,
+	CEPH_MDS_OP_SETATTR    = 0x01108,
+
+	CEPH_MDS_OP_MKNOD      = 0x01201,
+	CEPH_MDS_OP_LINK       = 0x01202,
+	CEPH_MDS_OP_UNLINK     = 0x01203,
+	CEPH_MDS_OP_RENAME     = 0x01204,
+	CEPH_MDS_OP_MKDIR      = 0x01220,
+	CEPH_MDS_OP_RMDIR      = 0x01221,
+	CEPH_MDS_OP_SYMLINK    = 0x01222,
+
+	CEPH_MDS_OP_CREATE     = 0x00301,
+	CEPH_MDS_OP_OPEN       = 0x00302,
+	CEPH_MDS_OP_READDIR    = 0x00305,
+
+	CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+	CEPH_MDS_OP_MKSNAP     = 0x01400,
+	CEPH_MDS_OP_RMSNAP     = 0x01401,
+	CEPH_MDS_OP_LSSNAP     = 0x00402,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE   1
+#define CEPH_SETATTR_UID    2
+#define CEPH_SETATTR_GID    4
+#define CEPH_SETATTR_MTIME  8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE  32
+#define CEPH_SETATTR_CTIME 64
+
+union ceph_mds_request_args {
+	struct {
+		__le32 mask;                 /* CEPH_CAP_* */
+	} __attribute__ ((packed)) getattr;
+	struct {
+		__le32 mode;
+		__le32 uid;
+		__le32 gid;
+		struct ceph_timespec mtime;
+		struct ceph_timespec atime;
+		__le64 size, old_size;       /* old_size needed by truncate */
+		__le32 mask;                 /* CEPH_SETATTR_* */
+	} __attribute__ ((packed)) setattr;
+	struct {
+		__le32 frag;                 /* which dir fragment */
+		__le32 max_entries;          /* how many dentries to grab */
+	} __attribute__ ((packed)) readdir;
+	struct {
+		__le32 mode;
+		__le32 rdev;
+	} __attribute__ ((packed)) mknod;
+	struct {
+		__le32 mode;
+	} __attribute__ ((packed)) mkdir;
+	struct {
+		__le32 flags;
+		__le32 mode;
+		__le32 stripe_unit;          /* layout for newly created file */
+		__le32 stripe_count;         /* ... */
+		__le32 object_size;
+		__le32 file_replication;
+		__le32 preferred;
+	} __attribute__ ((packed)) open;
+	struct {
+		__le32 flags;
+	} __attribute__ ((packed)) setxattr;
+	struct {
+		struct ceph_file_layout layout;
+	} __attribute__ ((packed)) setlayout;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
+
+struct ceph_mds_request_head {
+	__le64 tid, oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+	__le64 ino, cap_id;            /* ino and unique cap id */
+	__le32 caps, wanted;           /* new issued, wanted */
+	__le32 seq, issue_seq, mseq;
+	__le32 dname_seq;              /* if releasing a dentry lease, a */
+	__le32 dname_len;              /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+	__le64 tid;
+	__le32 op;
+	__le32 result;
+	__le32 mdsmap_epoch;
+	__u8 safe;                     /* true if committed to disk */
+	__u8 is_dentry, is_target;     /* true if dentry, target inode records
+					  are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+	__le32 frag;                   /* this frag splits... */
+	__le32 by;                     /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+	__le32 nsplits;                /* num ceph_frag_tree_split records */
+	struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+	__le32 caps, wanted;           /* caps issued, wanted */
+	__le64 cap_id;
+	__le32 seq, mseq;
+	__le64 realm;                  /* snap realm */
+	__u8 flags;                    /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH  1          /* cap is issued by auth mds */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+	__le64 ino;
+	__le64 snapid;
+	__le32 rdev;
+	__le64 version;                /* inode version */
+	__le64 xattr_version;          /* version for xattr blob */
+	struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+	struct ceph_file_layout layout;
+	struct ceph_timespec ctime, mtime, atime;
+	__le32 time_warp_seq;
+	__le64 size, max_size, truncate_size;
+	__le32 truncate_seq;
+	__le32 mode, uid, gid;
+	__le32 nlink;
+	__le64 files, subdirs, rbytes, rfiles, rsubdirs;  /* dir stats */
+	struct ceph_timespec rctime;
+	struct ceph_frag_tree_head fragtree;  /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, then symlink string, then xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+	__le16 mask;            /* lease type(s) */
+	__le32 duration_ms;     /* lease duration */
+	__le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+	__le32 frag;            /* fragment */
+	__le32 auth;            /* auth mds, if this is a delegation point */
+	__le32 ndist;           /* number of mds' this is replicated on */
+	__le32 dist[];
+} __attribute__ ((packed));
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN        0
+#define CEPH_FILE_MODE_RD         1
+#define CEPH_FILE_MODE_WR         2
+#define CEPH_FILE_MODE_RDWR       3  /* RD | WR */
+#define CEPH_FILE_MODE_LAZY       4  /* lazy io */
+#define CEPH_FILE_MODE_NUM        8  /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+
+/* capability bits */
+#define CEPH_CAP_PIN         1  /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED     1  /* client can reads */
+#define CEPH_CAP_GEXCL       2  /* client can read and update */
+#define CEPH_CAP_GCACHE      4  /* (file) client can cache reads */
+#define CEPH_CAP_GRD         8  /* (file) client can read */
+#define CEPH_CAP_GWR        16  /* (file) client can write */
+#define CEPH_CAP_GBUFFER    32  /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND  64  /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO   128  /* (file) client can perform lazy io */
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH      2
+#define CEPH_CAP_SLINK      4
+#define CEPH_CAP_SXATTR     6
+#define CEPH_CAP_SFILE      8   /* goes at the end (uses >2 cap bits) */
+
+#define CEPH_CAP_BITS       16
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED  << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x)    (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED   (CEPH_CAP_GSHARED   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL     (CEPH_CAP_GEXCL     << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE    (CEPH_CAP_GCACHE    << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD       (CEPH_CAP_GRD       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR       (CEPH_CAP_GWR       << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE     CEPH_CAP_PIN  /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK  CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID      CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE     CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK    CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT   CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME    CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE     CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME    CEPH_CAP_FILE_SHARED  /* fixme */
+#define CEPH_STAT_CAP_XATTR    CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN |			\
+				 CEPH_CAP_AUTH_SHARED |	\
+				 CEPH_CAP_LINK_SHARED |	\
+				 CEPH_CAP_FILE_SHARED |	\
+				 CEPH_CAP_XATTR_SHARED)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED |			\
+			      CEPH_CAP_LINK_SHARED |			\
+			      CEPH_CAP_XATTR_SHARED |			\
+			      CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD   (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD |	\
+			   CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL |		\
+			   CEPH_CAP_LINK_EXCL |		\
+			   CEPH_CAP_XATTR_EXCL |	\
+			   CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |	\
+			      CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+			   CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+			CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+	CEPH_CAP_OP_GRANT,         /* mds->client grant */
+	CEPH_CAP_OP_REVOKE,        /* mds->client revoke */
+	CEPH_CAP_OP_TRUNC,         /* mds->client trunc notify */
+	CEPH_CAP_OP_EXPORT,        /* mds has exported the cap */
+	CEPH_CAP_OP_IMPORT,        /* mds has imported the cap */
+	CEPH_CAP_OP_UPDATE,        /* client->mds update */
+	CEPH_CAP_OP_DROP,          /* client->mds drop cap bits */
+	CEPH_CAP_OP_FLUSH,         /* client->mds cap writeback */
+	CEPH_CAP_OP_FLUSH_ACK,     /* mds->client flushed */
+	CEPH_CAP_OP_FLUSHSNAP,     /* client->mds flush snapped metadata */
+	CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+	CEPH_CAP_OP_RELEASE,       /* client->mds release (clean) cap */
+	CEPH_CAP_OP_RENEW,         /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+	__le32 op;                  /* CEPH_CAP_OP_* */
+	__le64 ino, realm;
+	__le64 cap_id;
+	__le32 seq, issue_seq;
+	__le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+	__le32 migrate_seq;
+	__le64 snap_follows;
+	__le32 snap_trace_len;
+	__le64 client_tid;          /* for FLUSH(SNAP) -> FLUSH(SNAP)_ACK */
+
+	/* authlock */
+	__le32 uid, gid, mode;
+
+	/* linklock */
+	__le32 nlink;
+
+	/* xattrlock */
+	__le32 xattr_len;
+	__le64 xattr_version;
+
+	/* filelock */
+	__le64 size, max_size, truncate_size;
+	__le32 truncate_seq;
+	struct ceph_timespec mtime, atime, ctime;
+	struct ceph_file_layout layout;
+	__le32 time_warp_seq;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+	__le32 num;                /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+	__le64 ino;
+	__le64 cap_id;
+	__le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE           1  /*    mds  -> client */
+#define CEPH_MDS_LEASE_RELEASE          2  /* client  -> mds    */
+#define CEPH_MDS_LEASE_RENEW            3  /* client <-> mds    */
+#define CEPH_MDS_LEASE_REVOKE_ACK       4  /* client  -> mds    */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+	__u8 action;            /* CEPH_MDS_LEASE_* */
+	__le16 mask;            /* which lease */
+	__le64 ino;
+	__le64 first, last;     /* snap range */
+	__le32 seq;
+	__le32 duration_ms;     /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+	__le64 cap_id;
+	__le32 wanted;
+	__le32 issued;
+	__le64 size;
+	struct ceph_timespec mtime, atime;
+	__le64 snaprealm;
+	__le64 pathbase;        /* base ino for our path to this ino */
+} __attribute__ ((packed));
+/* followed by encoded string */
+
+struct ceph_mds_snaprealm_reconnect {
+	__le64 ino;     /* snap realm base */
+	__le64 seq;     /* snap seq for this snap realm */
+	__le64 parent;  /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+	CEPH_SNAP_OP_UPDATE,  /* CREATE or DESTROY */
+	CEPH_SNAP_OP_CREATE,
+	CEPH_SNAP_OP_DESTROY,
+	CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+	__le32 op;                /* CEPH_SNAP_OP_* */
+	__le64 split;             /* ino to split off, if any */
+	__le32 num_split_inos;    /* # inos belonging to new child realm */
+	__le32 num_split_realms;  /* # child realms udner new child realm */
+	__le32 trace_len;         /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+	__le64 ino;           /* ino */
+	__le64 created;       /* snap: when created */
+	__le64 parent;        /* ino: parent realm */
+	__le64 parent_since;  /* snap: same parent since */
+	__le64 seq;           /* snap: version */
+	__le32 num_snaps;
+	__le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..90d19d9d8d8f
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,163 @@
+/*
+ * Ceph string constants
+ */
+#include "types.h"
+
+const char *ceph_osd_op_name(int op)
+{
+	switch (op) {
+	case CEPH_OSD_OP_READ: return "read";
+	case CEPH_OSD_OP_STAT: return "stat";
+
+	case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+	case CEPH_OSD_OP_WRITE: return "write";
+	case CEPH_OSD_OP_DELETE: return "delete";
+	case CEPH_OSD_OP_TRUNCATE: return "truncate";
+	case CEPH_OSD_OP_ZERO: return "zero";
+	case CEPH_OSD_OP_WRITEFULL: return "writefull";
+
+	case CEPH_OSD_OP_APPEND: return "append";
+	case CEPH_OSD_OP_STARTSYNC: return "startsync";
+	case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+	case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+	case CEPH_OSD_OP_TMAPUP: return "tmapup";
+	case CEPH_OSD_OP_TMAPGET: return "tmapget";
+	case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+	case CEPH_OSD_OP_GETXATTR: return "getxattr";
+	case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+	case CEPH_OSD_OP_SETXATTR: return "setxattr";
+	case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+	case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+	case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+
+	case CEPH_OSD_OP_PULL: return "pull";
+	case CEPH_OSD_OP_PUSH: return "push";
+	case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+	case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+	case CEPH_OSD_OP_SCRUB: return "scrub";
+
+	case CEPH_OSD_OP_WRLOCK: return "wrlock";
+	case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+	case CEPH_OSD_OP_RDLOCK: return "rdlock";
+	case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+	case CEPH_OSD_OP_UPLOCK: return "uplock";
+	case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+	case CEPH_OSD_OP_CALL: return "call";
+
+	case CEPH_OSD_OP_PGLS: return "pgls";
+	}
+	return "???";
+}
+
+const char *ceph_mds_state_name(int s)
+{
+	switch (s) {
+		/* down and out */
+	case CEPH_MDS_STATE_DNE:        return "down:dne";
+	case CEPH_MDS_STATE_STOPPED:    return "down:stopped";
+		/* up and out */
+	case CEPH_MDS_STATE_BOOT:       return "up:boot";
+	case CEPH_MDS_STATE_STANDBY:    return "up:standby";
+	case CEPH_MDS_STATE_STANDBY_REPLAY:    return "up:standby-replay";
+	case CEPH_MDS_STATE_CREATING:   return "up:creating";
+	case CEPH_MDS_STATE_STARTING:   return "up:starting";
+		/* up and in */
+	case CEPH_MDS_STATE_REPLAY:     return "up:replay";
+	case CEPH_MDS_STATE_RESOLVE:    return "up:resolve";
+	case CEPH_MDS_STATE_RECONNECT:  return "up:reconnect";
+	case CEPH_MDS_STATE_REJOIN:     return "up:rejoin";
+	case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+	case CEPH_MDS_STATE_ACTIVE:     return "up:active";
+	case CEPH_MDS_STATE_STOPPING:   return "up:stopping";
+	}
+	return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+	switch (op) {
+	case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+	case CEPH_SESSION_OPEN: return "open";
+	case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+	case CEPH_SESSION_CLOSE: return "close";
+	case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+	case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+	case CEPH_SESSION_STALE: return "stale";
+	case CEPH_SESSION_RECALL_STATE: return "recall_state";
+	}
+	return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+	switch (op) {
+	case CEPH_MDS_OP_LOOKUP:  return "lookup";
+	case CEPH_MDS_OP_LOOKUPHASH:  return "lookuphash";
+	case CEPH_MDS_OP_LOOKUPPARENT:  return "lookupparent";
+	case CEPH_MDS_OP_GETATTR:  return "getattr";
+	case CEPH_MDS_OP_SETXATTR: return "setxattr";
+	case CEPH_MDS_OP_SETATTR: return "setattr";
+	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+	case CEPH_MDS_OP_READDIR: return "readdir";
+	case CEPH_MDS_OP_MKNOD: return "mknod";
+	case CEPH_MDS_OP_LINK: return "link";
+	case CEPH_MDS_OP_UNLINK: return "unlink";
+	case CEPH_MDS_OP_RENAME: return "rename";
+	case CEPH_MDS_OP_MKDIR: return "mkdir";
+	case CEPH_MDS_OP_RMDIR: return "rmdir";
+	case CEPH_MDS_OP_SYMLINK: return "symlink";
+	case CEPH_MDS_OP_CREATE: return "create";
+	case CEPH_MDS_OP_OPEN: return "open";
+	case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+	case CEPH_MDS_OP_LSSNAP: return "lssnap";
+	case CEPH_MDS_OP_MKSNAP: return "mksnap";
+	case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+	}
+	return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+	switch (op) {
+	case CEPH_CAP_OP_GRANT: return "grant";
+	case CEPH_CAP_OP_REVOKE: return "revoke";
+	case CEPH_CAP_OP_TRUNC: return "trunc";
+	case CEPH_CAP_OP_EXPORT: return "export";
+	case CEPH_CAP_OP_IMPORT: return "import";
+	case CEPH_CAP_OP_UPDATE: return "update";
+	case CEPH_CAP_OP_DROP: return "drop";
+	case CEPH_CAP_OP_FLUSH: return "flush";
+	case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+	case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+	case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+	case CEPH_CAP_OP_RELEASE: return "release";
+	case CEPH_CAP_OP_RENEW: return "renew";
+	}
+	return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+	switch (o) {
+	case CEPH_MDS_LEASE_REVOKE: return "revoke";
+	case CEPH_MDS_LEASE_RELEASE: return "release";
+	case CEPH_MDS_LEASE_RENEW: return "renew";
+	case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+	}
+	return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+	switch (o) {
+	case CEPH_SNAP_OP_UPDATE: return "update";
+	case CEPH_SNAP_OP_CREATE: return "create";
+	case CEPH_SNAP_OP_DESTROY: return "destroy";
+	case CEPH_SNAP_OP_SPLIT: return "split";
+	}
+	return "???";
+}
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..73921ae43faa
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,157 @@
+#ifndef __MSGR_H
+#define __MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT    6789  /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST  6789
+#define CEPH_PORT_START  6800  /* non-monitors start here */
+#define CEPH_PORT_LAST   6900
+
+/*
+ * tcp connection banner.  include a protocol version. and adjust
+ * whenever the wire protocol changes.  try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v021"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+       return (__s32)a - (__s32)b;
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+	__u8 type;      /* CEPH_ENTITY_TYPE_* */
+	__le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON    1
+#define CEPH_ENTITY_TYPE_MDS    2
+#define CEPH_ENTITY_TYPE_OSD    3
+#define CEPH_ENTITY_TYPE_CLIENT 4
+#define CEPH_ENTITY_TYPE_ADMIN  5
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+	__le32 erank;  /* entity's rank in process */
+	__le32 nonce;  /* unique id for process (e.g. pid) */
+	struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a,
+					     const struct ceph_entity_addr *b)
+{
+	return a->nonce == b->nonce &&
+		memcmp(&a->in_addr, &b->in_addr, sizeof(a->in_addr)) == 0;
+}
+
+static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a,
+					  const struct ceph_entity_addr *b)
+{
+	return memcmp(a, b, sizeof(*a)) == 0;
+}
+
+struct ceph_entity_inst {
+	struct ceph_entity_name name;
+	struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY         1  /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION  2  /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT          3  /* server->client: wait for racing
+					  incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4  /* server->client + cseq: try again
+					  with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL  5  /* server->client + gseq: try again
+					  with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE         6  /* closing pipe */
+#define CEPH_MSGR_TAG_MSG           7  /* message */
+#define CEPH_MSGR_TAG_ACK           8  /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
+	__le32 global_seq;   /* count connections initiated by this host */
+	__le32 connect_seq;  /* count connections initiated in this session */
+	__le32 protocol_version;
+	__u8  flags;         /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+	__u8 tag;
+	__le32 global_seq;
+	__le32 connect_seq;
+	__le32 protocol_version;
+	__u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY  1  /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header {
+	__le64 seq;       /* message seq# for this session */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+
+	__le32 front_len; /* bytes in main payload */
+	__le32 middle_len;/* bytes in middle payload */
+	__le32 data_len;  /* bytes of data payload */
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	struct ceph_entity_inst src, orig_src;
+	__le32 dst_erank;
+	__le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW     64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH    196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer {
+	__le32 front_crc, middle_crc, data_crc;
+	__u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE  (1<<0)   /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC     (1<<1)   /* no data crc */
+
+
+#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..a48cf4ae391e
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,372 @@
+#ifndef __RADOS_H
+#define __RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include "msgr.h"
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+	unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+				    const struct ceph_fsid *b)
+{
+	return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1))  /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP  ((__u64)(-2))  /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3))  /* largest valid snapid */
+
+struct ceph_timespec {
+	__le32 tv_sec;
+	__le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH     1
+#define CEPH_OBJECT_LAYOUT_LINEAR   2
+#define CEPH_OBJECT_LAYOUT_HASHINO  3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH  0
+#define CEPH_PG_LAYOUT_HASH   1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+union ceph_pg {
+	__u64 pg64;
+	struct {
+		__s16 preferred; /* preferred primary osd */
+		__u16 ps;        /* placement seed */
+		__u32 pool;      /* object pool */
+	} __attribute__ ((packed)) pg;
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ *  pg_num -- base number of pseudorandomly placed pgs
+ *
+ *  pgp_num -- effective number when calculating pg placement.  this
+ * is used for pg_num increases.  new pgs result in data being "split"
+ * into new pgs.  for this to proceed smoothly, new pgs are intiially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split.  only _then_ are the new
+ * pgs placed independently.
+ *
+ *  lpg_num -- localized pg count (per device).  replicas are randomly
+ * selected.
+ *
+ *  lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP     1
+#define CEPH_PG_TYPE_RAID4   2
+struct ceph_pg_pool {
+	__u8 type;                /* CEPH_PG_TYPE_* */
+	__u8 size;                /* number of osds in each pg */
+	__u8 crush_ruleset;       /* crush placement rule */
+	__le32 pg_num, pgp_num;   /* number of pg's */
+	__le32 lpg_num, lpgp_num; /* number of localized pg's */
+	__le32 last_change;       /* most recent epoch changed */
+	__le64 snap_seq;          /* seq for per-pool snapshot */
+	__le32 snap_epoch;        /* epoch of last snap */
+	__le32 num_snaps;
+	__le32 num_removed_snap_intervals;
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time.  b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+	if ((x & bmask) < b)
+		return x & bmask;
+	else
+		return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+	__le64 ol_pgid;           /* raw pg, with _full_ ps precision. */
+	__le32 ol_stripe_unit;    /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+	__le32 epoch;
+	__le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP     2
+
+/* osd weights.  fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN  0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0)  /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL     (1<<1)  /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD  (1<<2)  /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR  (1<<3)  /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4)  /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE       0xf000
+#define CEPH_OSD_OP_MODE_RD    0x1000
+#define CEPH_OSD_OP_MODE_WR    0x2000
+#define CEPH_OSD_OP_MODE_RMW   0x3000
+#define CEPH_OSD_OP_MODE_SUB   0x4000
+#define CEPH_OSD_OP_MODE_EXEC  0x8000
+
+#define CEPH_OSD_OP_TYPE       0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK  0x0100
+#define CEPH_OSD_OP_TYPE_DATA  0x0200
+#define CEPH_OSD_OP_TYPE_ATTR  0x0300
+#define CEPH_OSD_OP_TYPE_EXEC  0x0400
+#define CEPH_OSD_OP_TYPE_PG    0x0500
+
+enum {
+	/** data **/
+	/* read */
+	CEPH_OSD_OP_READ      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_STAT      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+	/* fancy read */
+	CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+	/* write */
+	CEPH_OSD_OP_WRITE     = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+	CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+	CEPH_OSD_OP_TRUNCATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+	CEPH_OSD_OP_ZERO      = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+	CEPH_OSD_OP_DELETE    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+	/* fancy write */
+	CEPH_OSD_OP_APPEND    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+	CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+	CEPH_OSD_OP_SETTRUNC  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+	CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+	CEPH_OSD_OP_TMAPUP  = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+	CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+	CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+	CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+
+	/** attrs **/
+	/* read */
+	CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+
+	/* write */
+	CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+	CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+	CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
+	CEPH_OSD_OP_RMXATTR   = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+	/** subop **/
+	CEPH_OSD_OP_PULL           = CEPH_OSD_OP_MODE_SUB | 1,
+	CEPH_OSD_OP_PUSH           = CEPH_OSD_OP_MODE_SUB | 2,
+	CEPH_OSD_OP_BALANCEREADS   = CEPH_OSD_OP_MODE_SUB | 3,
+	CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+	CEPH_OSD_OP_SCRUB          = CEPH_OSD_OP_MODE_SUB | 5,
+
+	/** lock **/
+	CEPH_OSD_OP_WRLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+	CEPH_OSD_OP_WRUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+	CEPH_OSD_OP_RDLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+	CEPH_OSD_OP_RDUNLOCK  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+	CEPH_OSD_OP_UPLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+	CEPH_OSD_OP_DNLOCK    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+	/** exec **/
+	CEPH_OSD_OP_CALL    = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+	/** pg **/
+	CEPH_OSD_OP_PGLS      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
+
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM  'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+	CEPH_OSD_FLAG_ACK = 1,          /* want (or is) "ack" ack */
+	CEPH_OSD_FLAG_ONNVRAM = 2,      /* want (or is) "onnvram" ack */
+	CEPH_OSD_FLAG_ONDISK = 4,       /* want (or is) "ondisk" ack */
+	CEPH_OSD_FLAG_RETRY = 8,        /* resend attempt */
+	CEPH_OSD_FLAG_READ = 16,        /* op may read */
+	CEPH_OSD_FLAG_WRITE = 32,       /* op may write */
+	CEPH_OSD_FLAG_ORDERSNAP = 64,   /* EOLDSNAP if snapc is out of order */
+	CEPH_OSD_FLAG_PEERSTAT = 128,   /* msg includes osd_peer_stat */
+	CEPH_OSD_FLAG_BALANCE_READS = 256,
+	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
+};
+
+enum {
+	CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
+};
+
+#define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/*
+ * an individual object operation.  each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+	__le16 op;           /* CEPH_OSD_OP_* */
+	__le32 flags;        /* CEPH_OSD_FLAG_* */
+	union {
+		struct {
+			__le64 offset, length;
+		} __attribute__ ((packed)) extent;
+		struct {
+			__le32 name_len;
+			__le32 value_len;
+		} __attribute__ ((packed)) xattr;
+		struct {
+			__le64 truncate_size;
+			__le32 truncate_seq;
+		} __attribute__ ((packed)) trunc;
+		struct {
+			__u8 class_len;
+			__u8 method_len;
+			__u8 argc;
+			__le32 indata_len;
+		} __attribute__ ((packed)) cls;
+		struct {
+			__le64 cookie, count;
+		} __attribute__ ((packed)) pgls;
+	};
+	__le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * osd request message header.  each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+	__le64 tid;                        /* transaction id */
+	__le32 client_inc;                 /* client incarnation */
+	struct ceph_object_layout layout;  /* pgid */
+	__le32 osdmap_epoch;               /* client's osdmap epoch */
+
+	__le32 flags;
+
+	struct ceph_timespec mtime;        /* for mutations only */
+	struct ceph_eversion reassert_version; /* if we are replaying op */
+
+	__le32 object_len;     /* length of object name */
+
+	__le64 snapid;         /* snapid to read */
+	__le64 snap_seq;       /* writer's snap context */
+	__le32 num_snaps;
+
+	__le16 num_ops;
+	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+	__le64 tid;                       /* transaction id */
+	__le32 client_inc;                /* client incarnation */
+	__le32 flags;
+	struct ceph_object_layout layout;
+	__le32 osdmap_epoch;
+	struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+	__le32 result;                    /* result code */
+
+	__le32 object_len;                /* length of object name */
+	__le32 num_ops;
+	struct ceph_osd_op ops[0];  /* ops[], object */
+} __attribute__ ((packed));
+
+
+#endif
-- 
cgit v1.2.2


From de57606c23afded22202825b3db8a5d61859f198 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:07 -0700
Subject: ceph: client types

We first define constants, types, and prototypes for the kernel client
proper.

A few subsystems are defined separately later: the MDS, OSD, and
monitor clients, and the messaging layer.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_debug.h |  37 +++
 fs/ceph/ceph_frag.c  |  21 ++
 fs/ceph/ceph_frag.h  | 109 +++++++
 fs/ceph/ceph_ver.h   |   6 +
 fs/ceph/super.h      | 890 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/types.h      |  28 ++
 6 files changed, 1091 insertions(+)
 create mode 100644 fs/ceph/ceph_debug.h
 create mode 100644 fs/ceph/ceph_frag.c
 create mode 100644 fs/ceph/ceph_frag.h
 create mode 100644 fs/ceph/ceph_ver.h
 create mode 100644 fs/ceph/super.h
 create mode 100644 fs/ceph/types.h

(limited to 'fs')

diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+#  define dout(fmt, ...)						\
+	pr_debug(" %12.12s:%-4d : " fmt,				\
+		 ceph_file_part(__FILE__, sizeof(__FILE__)),		\
+		 __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+#  define dout(fmt, ...)	do {				\
+		if (0)						\
+			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+	} while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
+/*
+ * Ceph 'frag' type
+ */
+#include "types.h"
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+	unsigned va = ceph_frag_value(a);
+	unsigned vb = ceph_frag_value(b);
+	if (va < vb)
+		return -1;
+	if (va > vb)
+		return 1;
+	va = ceph_frag_bits(a);
+	vb = ceph_frag_bits(b);
+	if (va < vb)
+		return -1;
+	if (va > vb)
+		return 1;
+	return 0;
+}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef _FS_CEPH_FRAG_H
+#define _FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask.  Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ *   8 upper bits = "bits"
+ *  24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value.  This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically.  However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+	return (b << 24) |
+		(v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+	return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+	return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+	return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+	return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+	return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+	/* is sub as specific as us, and contained by us? */
+	return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+	       (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f) - 1,
+			 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+	return ceph_frag_bits(f) > 0 &&
+		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+	return ceph_frag_bits(f) > 0 &&
+		(ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f),
+		      ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f)+1,
+	      ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+	int newbits = ceph_frag_bits(f) + by;
+	return ceph_frag_make(newbits,
+			 ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+	return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+	return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+	return ceph_frag_make(ceph_frag_bits(f),
+			 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/fs/ceph/ceph_ver.h b/fs/ceph/ceph_ver.h
new file mode 100644
index 000000000000..66c3727b671b
--- /dev/null
+++ b/fs/ceph/ceph_ver.h
@@ -0,0 +1,6 @@
+#ifndef __CEPH_VERSION_H
+#define __CEPH_VERSION_H
+
+#define CEPH_GIT_VER 335cd8f952b457095ea2a66aee3db50efb63c91d
+
+#endif
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..cfd39ef4023e
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,890 @@
+#ifndef _FS_CEPH_SUPER_H
+#define _FS_CEPH_SUPER_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "mds_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/* f_type in struct statfs */
+#define CEPH_SUPER_MAGIC 0x00c36400
+
+/* large granularity for statfs utilization stats to facilitate
+ * large volume sizes on 32-bit machines. */
+#define CEPH_BLOCK_SHIFT   20  /* 1 MB */
+#define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID             (1<<0)
+#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
+#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
+#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
+#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */
+
+#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
+
+#define ceph_set_opt(client, opt) \
+	(client)->mount_args.flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+	(!!((client)->mount_args.flags & CEPH_OPT_##opt))
+
+
+#define CEPH_MAX_MON_MOUNT_ADDR	5
+
+struct ceph_mount_args {
+	int sb_flags;
+	int flags;
+	int mount_timeout;
+	int caps_wanted_delay_min, caps_wanted_delay_max;
+	struct ceph_fsid fsid;
+	struct ceph_entity_addr my_addr;
+	int wsize;
+	int rsize;            /* max readahead */
+	int max_readdir;      /* max readdir size */
+	int osd_timeout;
+	char *snapdir_name;   /* default ".snap" */
+	char *secret;
+	int cap_release_safety;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+#define CEPH_MOUNT_RSIZE_DEFAULT    (128*1024) /* readahead */
+
+#define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
+
+#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file.  Delay a minimum amount of time, even if we send a cap
+ * message for some other reason.  Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
+
+
+/* mount state */
+enum {
+	CEPH_MOUNT_MOUNTING,
+	CEPH_MOUNT_MOUNTED,
+	CEPH_MOUNT_UNMOUNTING,
+	CEPH_MOUNT_UNMOUNTED,
+	CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+	BUG_ON(time_after(b, a));
+	return (long)a - (long)b;
+}
+
+/*
+ * per-filesystem client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+	__s64 whoami;                   /* my client number */
+	struct dentry *debugfs_monmap;
+	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
+	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+
+	struct mutex mount_mutex;       /* serialize mount attempts */
+	struct ceph_mount_args mount_args;
+	struct ceph_fsid fsid;
+
+	struct super_block *sb;
+
+	unsigned long mount_state;
+	wait_queue_head_t mount_wq;
+
+	int mount_err;
+	void *signed_ticket;           /* our keys to the kingdom */
+	int signed_ticket_len;
+
+	struct ceph_messenger *msgr;   /* messenger instance */
+	struct ceph_mon_client monc;
+	struct ceph_mds_client mdsc;
+	struct ceph_osd_client osdc;
+
+	/* writeback */
+	mempool_t *wb_pagevec_pool;
+	struct workqueue_struct *wb_wq;
+	struct workqueue_struct *pg_inv_wq;
+	struct workqueue_struct *trunc_wq;
+
+	struct backing_dev_info backing_dev_info;
+};
+
+static inline struct ceph_client *ceph_client(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+
+/*
+ * File i/o capability.  This tracks shared state with the metadata
+ * server that allows us to cache or writeback attributes or to read
+ * and write data.  For any given inode, we should have one or more
+ * capabilities, one issued by each metadata server, and our
+ * cumulative access is the OR of all issued capabilities.
+ *
+ * Each cap is referenced by the inode's i_caps rbtree and by per-mds
+ * session capability lists.
+ */
+struct ceph_cap {
+	struct ceph_inode_info *ci;
+	struct rb_node ci_node;          /* per-ci cap tree */
+	struct ceph_mds_session *session;
+	struct list_head session_caps;   /* per-session caplist */
+	int mds;
+	u64 cap_id;       /* unique cap id (mds provided) */
+	int issued;       /* latest, from the mds */
+	int implemented;  /* implemented superset of issued (for revocation) */
+	int mds_wanted;
+	u32 seq, issue_seq, mseq, gen;
+	unsigned long last_used;
+	struct list_head caps_item;
+};
+
+#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
+#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
+#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
+
+/*
+ * Snapped cap state that is pending flush to mds.  When a snapshot occurs,
+ * we first complete any in-process sync writes and writeback any dirty
+ * data before flushing the snapped state (tracked here) back to the MDS.
+ */
+struct ceph_cap_snap {
+	atomic_t nref;
+	struct ceph_inode_info *ci;
+	struct list_head ci_item, flushing_item;
+
+	u64 follows, flush_tid;
+	int issued, dirty;
+	struct ceph_snap_context *context;
+
+	mode_t mode;
+	uid_t uid;
+	gid_t gid;
+
+	void *xattr_blob;
+	int xattr_len;
+	u64 xattr_version;
+
+	u64 size;
+	struct timespec mtime, atime, ctime;
+	u64 time_warp_seq;
+	int writing;   /* a sync write is still in progress */
+	int dirty_pages;     /* dirty pages awaiting writeback */
+};
+
+static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
+{
+	if (atomic_dec_and_test(&capsnap->nref))
+		kfree(capsnap);
+}
+
+/*
+ * The frag tree describes how a directory is fragmented, potentially across
+ * multiple metadata servers.  It is also used to indicate points where
+ * metadata authority is delegated, and whether/where metadata is replicated.
+ *
+ * A _leaf_ frag will be present in the i_fragtree IFF there is
+ * delegation info.  That is, if mds >= 0 || ndist > 0.
+ */
+#define CEPH_MAX_DIRFRAG_REP 4
+
+struct ceph_inode_frag {
+	struct rb_node node;
+
+	/* fragtree state */
+	u32 frag;
+	int split_by;         /* i.e. 2^(split_by) children */
+
+	/* delegation and replication info */
+	int mds;              /* -1 if same authority as parent */
+	int ndist;            /* >0 if replicated */
+	int dist[CEPH_MAX_DIRFRAG_REP];
+};
+
+/*
+ * We cache inode xattrs as an encoded blob until they are first used,
+ * at which point we parse them into an rbtree.
+ */
+struct ceph_inode_xattr {
+	struct rb_node node;
+
+	const char *name;
+	int name_len;
+	const char *val;
+	int val_len;
+	int dirty;
+
+	int should_free_name;
+	int should_free_val;
+};
+
+struct ceph_inode_xattrs_info {
+	/*
+	 * (still encoded) xattr blob. we avoid the overhead of parsing
+	 * this until someone actually calls getxattr, etc.
+	 *
+	 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
+	 * NULL means we don't know.
+	*/
+	struct ceph_buffer *blob, *prealloc_blob;
+
+	struct rb_root index;
+	bool dirty;
+	int count;
+	int names_size;
+	int vals_size;
+	u64 version, index_version;
+};
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
+#define CEPH_I_NODELAY   4  /* do not delay cap release */
+#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+
+struct ceph_inode_info {
+	struct ceph_vino i_vino;   /* ceph ino + snap */
+
+	u64 i_version;
+	u32 i_time_warp_seq;
+
+	unsigned i_ceph_flags;
+	unsigned long i_release_count;
+
+	struct ceph_file_layout i_layout;
+	char *i_symlink;
+
+	/* for dirs */
+	struct timespec i_rctime;
+	u64 i_rbytes, i_rfiles, i_rsubdirs;
+	u64 i_files, i_subdirs;
+	u64 i_max_offset;  /* largest readdir offset, set with I_COMPLETE */
+
+	struct rb_root i_fragtree;
+	struct mutex i_fragtree_mutex;
+
+	struct ceph_inode_xattrs_info i_xattrs;
+
+	/* capabilities.  protected _both_ by i_lock and cap->session's
+	 * s_mutex. */
+	struct rb_root i_caps;           /* cap list */
+	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
+	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
+	struct list_head i_dirty_item, i_flushing_item;
+	u64 i_cap_flush_seq;
+	/* we need to track cap writeback on a per-cap-bit basis, to allow
+	 * overlapping, pipelined cap flushes to the mds.  we can probably
+	 * reduce the tid to 8 bits if we're concerned about inode size. */
+	u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
+	unsigned long i_hold_caps_min; /* jiffies */
+	unsigned long i_hold_caps_max; /* jiffies */
+	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
+	int i_cap_exporting_mds;         /* to handle cap migration between */
+	unsigned i_cap_exporting_mseq;   /*  mds's. */
+	unsigned i_cap_exporting_issued;
+	struct ceph_cap_reservation i_cap_migration_resv;
+	struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
+	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 */
+	unsigned i_snap_caps;           /* cap bits for snapped files */
+
+	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
+
+	u32 i_truncate_seq;        /* last truncate to smaller size */
+	u64 i_truncate_size;       /*  and the size we last truncated down to */
+	int i_truncate_pending;    /*  still need to call vmtruncate */
+
+	u64 i_max_size;            /* max file size authorized by mds */
+	u64 i_reported_size; /* (max_)size reported to or requested of mds */
+	u64 i_wanted_max_size;     /* offset we'd like to write too */
+	u64 i_requested_max_size;  /* max_size we've requested */
+
+	/* held references to caps */
+	int i_pin_ref;
+	int i_rd_ref, i_rdcache_ref, i_wr_ref;
+	int i_wrbuffer_ref, i_wrbuffer_ref_head;
+	u32 i_shared_gen;       /* increment each time we get FILE_SHARED */
+	u32 i_rdcache_gen;      /* we increment this each time we get
+				   FILE_CACHE.  If it's non-zero, we
+				   _may_ have cached pages. */
+	u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
+
+	struct list_head i_unsafe_writes; /* uncommitted sync writes */
+	struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+	spinlock_t i_unsafe_lock;
+
+	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+	int i_snap_realm_counter; /* snap realm (if caps) */
+	struct list_head i_snap_realm_item;
+	struct list_head i_snap_flush_item;
+
+	struct work_struct i_wb_work;  /* writeback work */
+	struct work_struct i_pg_inv_work;  /* page invalidation work */
+
+	struct work_struct i_vmtruncate_work;
+
+	struct inode vfs_inode; /* at end */
+};
+
+static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
+{
+	return list_entry(inode, struct ceph_inode_info, vfs_inode);
+}
+
+static inline void ceph_i_clear(struct inode *inode, unsigned mask)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	spin_lock(&inode->i_lock);
+	ci->i_ceph_flags &= ~mask;
+	spin_unlock(&inode->i_lock);
+}
+
+static inline void ceph_i_set(struct inode *inode, unsigned mask)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	spin_lock(&inode->i_lock);
+	ci->i_ceph_flags |= mask;
+	spin_unlock(&inode->i_lock);
+}
+
+static inline bool ceph_i_test(struct inode *inode, unsigned mask)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	bool r;
+
+	smp_mb();
+	r = (ci->i_ceph_flags & mask) == mask;
+	return r;
+}
+
+
+/* find a specific frag @f */
+extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
+						u32 f);
+
+/*
+ * choose fragment for value @v.  copy frag content to pfrag, if leaf
+ * exists
+ */
+extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+			    struct ceph_inode_frag *pfrag,
+			    int *found);
+
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+	struct ceph_mds_session *lease_session;
+	u32 lease_gen, lease_shared_gen;
+	u32 lease_seq;
+	unsigned long lease_renew_after, lease_renew_from;
+	struct list_head lru;
+	struct dentry *dentry;
+	u64 time;
+	u64 offset;
+};
+
+static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+{
+	return (struct ceph_dentry_info *)dentry->d_fsdata;
+}
+
+static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
+{
+	return ((loff_t)frag << 32) | (loff_t)off;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+	if (!ino)
+		ino = 1;
+#endif
+	return ino;
+}
+
+static inline int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	return 0;
+}
+
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+	return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+	struct ceph_vino *pvino = (struct ceph_vino *)data;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	return ci->i_vino.ino == pvino->ino &&
+		ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+					    struct ceph_vino vino)
+{
+	ino_t t = ceph_vino_to_ino(vino);
+	return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * caps helpers
+ */
+static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
+{
+	return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
+extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
+				    struct ceph_cap *cap);
+
+static inline int ceph_caps_issued(struct ceph_inode_info *ci)
+{
+	int issued;
+	spin_lock(&ci->vfs_inode.i_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	spin_unlock(&ci->vfs_inode.i_lock);
+	return issued;
+}
+
+static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
+					int touch)
+{
+	int r;
+	spin_lock(&ci->vfs_inode.i_lock);
+	r = __ceph_caps_issued_mask(ci, mask, touch);
+	spin_unlock(&ci->vfs_inode.i_lock);
+	return r;
+}
+
+static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
+{
+	return ci->i_dirty_caps | ci->i_flushing_caps;
+}
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+
+extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_used(struct ceph_inode_info *ci);
+
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+	if (w & CEPH_CAP_FILE_BUFFER)
+		w |= CEPH_CAP_FILE_EXCL;  /* we want EXCL if dirty data */
+	return w;
+}
+
+/* what the mds thinks we want */
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+
+extern void ceph_caps_init(void);
+extern void ceph_caps_finalize(void);
+extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
+extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
+extern void ceph_reservation_status(struct ceph_client *client,
+				    int *total, int *avail, int *used,
+				    int *reserved);
+
+static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+{
+	return (struct ceph_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+{
+	return (struct ceph_client *)sb->s_fs_info;
+}
+
+static inline int ceph_queue_writeback(struct inode *inode)
+{
+	return queue_work(ceph_inode_to_client(inode)->wb_wq,
+		   &ceph_inode(inode)->i_wb_work);
+}
+
+static inline int ceph_queue_page_invalidation(struct inode *inode)
+{
+	return queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
+		   &ceph_inode(inode)->i_pg_inv_work);
+}
+
+
+/*
+ * we keep buffered readdir results attached to file->private_data
+ */
+struct ceph_file_info {
+	int fmode;     /* initialized on open */
+
+	/* readdir: position within the dir */
+	u32 frag;
+	struct ceph_mds_request *last_readdir;
+	int at_end;
+
+	/* readdir: position within a frag */
+	unsigned offset;       /* offset of last chunk, adjusted for . and .. */
+	u64 next_offset;       /* offset of next chunk (last_name's + 1) */
+	char *last_name;       /* last entry in previous chunk */
+	struct dentry *dentry; /* next dentry (for dcache readdir) */
+	unsigned long dir_release_count;
+
+	/* used for -o dirstat read() on directory thing */
+	char *dir_info;
+	int dir_info_len;
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data.  It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+	atomic_t nref;
+	u64 seq;
+	int num_snaps;
+	u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+	/*
+	printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+	       atomic_read(&sc->nref)+1);
+	*/
+	if (sc)
+		atomic_inc(&sc->nref);
+	return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+	if (!sc)
+		return;
+	/*
+	printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+	       atomic_read(&sc->nref)-1);
+	*/
+	if (atomic_dec_and_test(&sc->nref)) {
+		/*printk(" deleting snap_context %p\n", sc);*/
+		kfree(sc);
+	}
+}
+
+/*
+ * A "snap realm" describes a subset of the file hierarchy sharing
+ * the same set of snapshots that apply to it.  The realms themselves
+ * are organized into a hierarchy, such that children inherit (some of)
+ * the snapshots of their parents.
+ *
+ * All inodes within the realm that have capabilities are linked into a
+ * per-realm list.
+ */
+struct ceph_snap_realm {
+	u64 ino;
+	atomic_t nref;
+	u64 created, seq;
+	u64 parent_ino;
+	u64 parent_since;   /* snapid when our current parent became so */
+
+	u64 *prior_parent_snaps;      /* snaps inherited from any parents we */
+	int num_prior_parent_snaps;   /*  had prior to parent_since */
+	u64 *snaps;                   /* snaps specific to this realm */
+	int num_snaps;
+
+	struct ceph_snap_realm *parent;
+	struct list_head children;       /* list of child realms */
+	struct list_head child_item;
+
+	struct list_head empty_item;     /* if i have ref==0 */
+
+	/* the current set of snaps for this realm */
+	struct ceph_snap_context *cached_context;
+
+	struct list_head inodes_with_caps;
+	spinlock_t inodes_with_caps_lock;
+};
+
+
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+	return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+		(off >> PAGE_CACHE_SHIFT);
+}
+
+
+
+/* snap.c */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+					       u64 ino);
+extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+				struct ceph_snap_realm *realm);
+extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+				struct ceph_snap_realm *realm);
+extern int ceph_update_snap_trace(struct ceph_mds_client *m,
+				  void *p, void *e, bool deletion);
+extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
+			     struct ceph_msg *msg);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+				struct ceph_snap_context *snapc);
+extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+				  struct ceph_cap_snap *capsnap);
+extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+
+/*
+ * a cap_snap is "pending" if it is still awaiting an in-progress
+ * sync write (that may/may not still update size, mtime, etc.).
+ */
+static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
+{
+	return !list_empty(&ci->i_cap_snaps) &&
+		list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
+			   ci_item)->writing;
+}
+
+
+/* super.c */
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern const char *ceph_msg_type_name(int type);
+
+#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
+	"%02x%02x%02x%02x%02x%02x"
+#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
+		(f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7],    \
+		(f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11],  \
+		(f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
+
+/* inode.c */
+extern const struct inode_operations ceph_file_iops;
+
+extern struct inode *ceph_alloc_inode(struct super_block *sb);
+extern void ceph_destroy_inode(struct inode *inode);
+
+extern struct inode *ceph_get_inode(struct super_block *sb,
+				    struct ceph_vino vino);
+extern struct inode *ceph_get_snapdir(struct inode *parent);
+extern int ceph_fill_file_size(struct inode *inode, int issued,
+			       u32 truncate_seq, u64 truncate_size, u64 size);
+extern void ceph_fill_file_time(struct inode *inode, int issued,
+				u64 time_warp_seq, struct timespec *ctime,
+				struct timespec *mtime, struct timespec *atime);
+extern int ceph_fill_trace(struct super_block *sb,
+			   struct ceph_mds_request *req,
+			   struct ceph_mds_session *session);
+extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+				    struct ceph_mds_session *session);
+
+extern int ceph_inode_holds_cap(struct inode *inode, int mask);
+
+extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern void ceph_inode_writeback(struct work_struct *work);
+extern void ceph_vmtruncate_work(struct work_struct *work);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
+extern void __ceph_queue_vmtruncate(struct inode *inode);
+
+extern int ceph_do_getattr(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat);
+
+/* xattr.c */
+extern int ceph_setxattr(struct dentry *, const char *, const void *,
+			 size_t, int);
+extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
+extern int ceph_removexattr(struct dentry *, const char *);
+extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+
+/* caps.c */
+extern const char *ceph_cap_string(int c);
+extern void ceph_handle_caps(struct ceph_mds_session *session,
+			     struct ceph_msg *msg);
+extern int ceph_add_cap(struct inode *inode,
+			struct ceph_mds_session *session, u64 cap_id,
+			int fmode, unsigned issued, unsigned wanted,
+			unsigned cap, unsigned seq, u64 realmino, int flags,
+			struct ceph_cap_reservation *caps_reservation);
+extern void __ceph_remove_cap(struct ceph_cap *cap,
+			      struct ceph_cap_reservation *ctx);
+static inline void ceph_remove_cap(struct ceph_cap *cap)
+{
+	struct inode *inode = &cap->ci->vfs_inode;
+	spin_lock(&inode->i_lock);
+	__ceph_remove_cap(cap, NULL);
+	spin_unlock(&inode->i_lock);
+}
+
+extern void ceph_queue_caps_release(struct inode *inode);
+extern int ceph_write_inode(struct inode *inode, int unused);
+extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+				    struct ceph_mds_session *session);
+extern int ceph_get_cap_mds(struct inode *inode);
+extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
+extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+				       struct ceph_snap_context *snapc);
+extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
+			       struct ceph_mds_session **psession);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+			    struct ceph_mds_session *session);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc,
+				    int flushdirty);
+
+extern int ceph_encode_inode_release(void **p, struct inode *inode,
+				     int mds, int drop, int unless, int force);
+extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+				      int mds, int drop, int unless);
+
+extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+			 int *got, loff_t endoff);
+
+/* for counting open files by mode */
+static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
+{
+	ci->i_nr_by_mode[mode]++;
+}
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+
+/* addr.c */
+extern const struct address_space_operations ceph_aops;
+extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* file.c */
+extern const struct file_operations ceph_file_fops;
+extern const struct address_space_operations ceph_aops;
+extern int ceph_open(struct inode *inode, struct file *file);
+extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+				       struct nameidata *nd, int mode,
+				       int locked_dir);
+extern int ceph_release(struct inode *inode, struct file *filp);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+/* dir.c */
+extern const struct file_operations ceph_dir_fops;
+extern const struct inode_operations ceph_dir_iops;
+extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
+	ceph_snapdir_dentry_ops;
+
+extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+					 struct dentry *dentry, int err);
+
+extern void ceph_dentry_lru_add(struct dentry *dn);
+extern void ceph_dentry_lru_touch(struct dentry *dn);
+extern void ceph_dentry_lru_del(struct dentry *dn);
+
+/*
+ * our d_ops vary depending on whether the inode is live,
+ * snapshotted (read-only), or a virtual ".snap" directory.
+ */
+int ceph_init_dentry(struct dentry *dentry);
+
+
+/* ioctl.c */
+extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* export.c */
+extern const struct export_operations ceph_export_ops;
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
+{
+	if (dentry && dentry->d_parent)
+		return dentry->d_parent->d_inode;
+
+	return NULL;
+}
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..8a514568cab2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,28 @@
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+	u64 ino;
+	u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+	int count;
+};
+
+
+#endif
-- 
cgit v1.2.2


From c30dbb9cc7fc75ab1d0ee6fb084ba4684f7a665d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:07 -0700
Subject: ceph: ref counted buffer

struct ceph_buffer is a simple ref-counted buffer.  We transparently
choose between kmalloc for small buffers and vmalloc for large ones.

This is currently used only for allocating memory for xattr data.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/buffer.c | 34 ++++++++++++++++++++++++++++++++++
 fs/ceph/buffer.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 fs/ceph/buffer.c
 create mode 100644 fs/ceph/buffer.h

(limited to 'fs')

diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..cf9aaccef22b
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,34 @@
+
+#include "ceph_debug.h"
+#include "buffer.h"
+
+struct ceph_buffer *ceph_buffer_new(gfp_t gfp)
+{
+	struct ceph_buffer *b;
+
+	b = kmalloc(sizeof(*b), gfp);
+	if (!b)
+		return NULL;
+	atomic_set(&b->nref, 1);
+	b->vec.iov_base = NULL;
+	b->vec.iov_len = 0;
+	b->alloc_len = 0;
+	return b;
+}
+
+int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
+{
+	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+	if (b->vec.iov_base) {
+		b->is_vmalloc = false;
+	} else {
+		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+		b->is_vmalloc = true;
+	}
+	if (!b->vec.iov_base)
+		return -ENOMEM;
+	b->alloc_len = len;
+	b->vec.iov_len = len;
+	return 0;
+}
+
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..16b1930acc45
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,55 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for small sizes (<= one page), vmalloc for larger
+ * sizes.
+ */
+struct ceph_buffer {
+	atomic_t nref;
+	struct kvec vec;
+	size_t alloc_len;
+	bool is_vmalloc;
+};
+
+struct ceph_buffer *ceph_buffer_new(gfp_t gfp);
+int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+	atomic_inc(&b->nref);
+	return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+	if (b && atomic_dec_and_test(&b->nref)) {
+		if (b->vec.iov_base) {
+			if (b->is_vmalloc)
+				vfree(b->vec.iov_base);
+			else
+				kfree(b->vec.iov_base);
+		}
+		kfree(b);
+	}
+}
+
+static inline struct ceph_buffer *ceph_buffer_new_alloc(int len, gfp_t gfp)
+{
+	struct ceph_buffer *b = ceph_buffer_new(gfp);
+
+	if (b && ceph_buffer_alloc(b, len, gfp) < 0) {
+		ceph_buffer_put(b);
+		b = NULL;
+	}
+	return b;
+}
+
+#endif
-- 
cgit v1.2.2


From 16725b9d2a2e3d0fd2b0034482e2eb0a2d78050f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:07 -0700
Subject: ceph: super.c

Mount option parsing, client setup and teardown, and a few odds and
ends (e.g., statfs).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 936 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 936 insertions(+)
 create mode 100644 fs/ceph/super.c

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..0723fb6e96a1
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,936 @@
+
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+
+#include "ceph_ver.h"
+#include "decode.h"
+#include "super.h"
+#include "mon_client.h"
+
+/*
+ * Ceph superblock operations
+ *
+ * Handle the basics of mounting, unmounting.
+ */
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+	const char *e = s + len;
+
+	while (e != s && *(e-1) != '/')
+		e--;
+	return e;
+}
+
+
+/*
+ * super ops
+ */
+static void ceph_put_super(struct super_block *s)
+{
+	struct ceph_client *cl = ceph_client(s);
+
+	dout("put_super\n");
+	ceph_mdsc_close_sessions(&cl->mdsc);
+	return;
+}
+
+static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
+	struct ceph_monmap *monmap = client->monc.monmap;
+	struct ceph_statfs st;
+	u64 fsid;
+	int err;
+
+	dout("statfs\n");
+	err = ceph_monc_do_statfs(&client->monc, &st);
+	if (err < 0)
+		return err;
+
+	/* fill in kstatfs */
+	buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
+
+	/*
+	 * express utilization in terms of large blocks to avoid
+	 * overflow on 32-bit machines.
+	 */
+	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+	buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
+		(CEPH_BLOCK_SHIFT-10);
+	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+
+	buf->f_files = le64_to_cpu(st.num_objects);
+	buf->f_ffree = -1;
+	buf->f_namelen = PATH_MAX;
+	buf->f_frsize = PAGE_CACHE_SIZE;
+
+	/* leave fsid little-endian, regardless of host endianness */
+	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
+	buf->f_fsid.val[0] = fsid & 0xffffffff;
+	buf->f_fsid.val[1] = fsid >> 32;
+
+	return 0;
+}
+
+
+static int ceph_syncfs(struct super_block *sb, int wait)
+{
+	dout("sync_fs %d\n", wait);
+	ceph_osdc_sync(&ceph_client(sb)->osdc);
+	ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+	return 0;
+}
+
+
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
+	struct ceph_mount_args *args = &client->mount_args;
+
+	if (args->flags & CEPH_OPT_FSID)
+		seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
+			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
+			   le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
+	if (args->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, ",noshare");
+	if (args->flags & CEPH_OPT_DIRSTAT)
+		seq_puts(m, ",dirstat");
+	if ((args->flags & CEPH_OPT_RBYTES) == 0)
+		seq_puts(m, ",norbytes");
+	if (args->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, ",nocrc");
+	if (args->flags & CEPH_OPT_NOASYNCREADDIR)
+		seq_puts(m, ",noasyncreaddir");
+	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
+	if (args->secret)
+		seq_puts(m, ",secret=<hidden>");
+	return 0;
+}
+
+/*
+ * caches
+ */
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
+{
+	struct ceph_inode_info *ci = foo;
+	inode_init_once(&ci->vfs_inode);
+}
+
+static int __init init_caches(void)
+{
+	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+				      sizeof(struct ceph_inode_info),
+				      __alignof__(struct ceph_inode_info),
+				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+				      ceph_inode_init_once);
+	if (ceph_inode_cachep == NULL)
+		return -ENOMEM;
+
+	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_cachep == NULL)
+		goto bad_cap;
+
+	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_dentry_cachep == NULL)
+		goto bad_dentry;
+
+	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_file_cachep == NULL)
+		goto bad_file;
+
+	return 0;
+
+bad_file:
+	kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+	kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+	kmem_cache_destroy(ceph_inode_cachep);
+	return -ENOMEM;
+}
+
+static void destroy_caches(void)
+{
+	kmem_cache_destroy(ceph_inode_cachep);
+	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_dentry_cachep);
+	kmem_cache_destroy(ceph_file_cachep);
+}
+
+
+/*
+ * ceph_umount_begin - initiate forced umount.  Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
+ */
+static void ceph_umount_begin(struct super_block *sb)
+{
+	struct ceph_client *client = ceph_sb_to_client(sb);
+
+	dout("ceph_umount_begin - starting forced umount\n");
+	if (!client)
+		return;
+	client->mount_state = CEPH_MOUNT_SHUTDOWN;
+	return;
+}
+
+static const struct super_operations ceph_super_ops = {
+	.alloc_inode	= ceph_alloc_inode,
+	.destroy_inode	= ceph_destroy_inode,
+	.write_inode    = ceph_write_inode,
+	.sync_fs        = ceph_syncfs,
+	.put_super	= ceph_put_super,
+	.show_options   = ceph_show_options,
+	.statfs		= ceph_statfs,
+	.umount_begin   = ceph_umount_begin,
+};
+
+
+const char *ceph_msg_type_name(int type)
+{
+	switch (type) {
+	case CEPH_MSG_SHUTDOWN: return "shutdown";
+	case CEPH_MSG_PING: return "ping";
+	case CEPH_MSG_MON_MAP: return "mon_map";
+	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+	case CEPH_MSG_CLIENT_MOUNT: return "client_mount";
+	case CEPH_MSG_CLIENT_MOUNT_ACK: return "client_mount_ack";
+	case CEPH_MSG_STATFS: return "statfs";
+	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+	case CEPH_MSG_MDS_GETMAP: return "mds_getmap";
+	case CEPH_MSG_MDS_MAP: return "mds_map";
+	case CEPH_MSG_CLIENT_SESSION: return "client_session";
+	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+	case CEPH_MSG_OSD_GETMAP: return "osd_getmap";
+	case CEPH_MSG_OSD_MAP: return "osd_map";
+	case CEPH_MSG_OSD_OP: return "osd_op";
+	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+	default: return "unknown";
+	}
+}
+
+
+/*
+ * mount options
+ */
+enum {
+	Opt_fsidmajor,
+	Opt_fsidminor,
+	Opt_monport,
+	Opt_wsize,
+	Opt_rsize,
+	Opt_osdtimeout,
+	Opt_mount_timeout,
+	Opt_caps_wanted_delay_min,
+	Opt_caps_wanted_delay_max,
+	Opt_readdir_max_entries,
+	/* int args above */
+	Opt_snapdirname,
+	Opt_secret,
+	/* string args above */
+	Opt_ip,
+	Opt_noshare,
+	Opt_dirstat,
+	Opt_nodirstat,
+	Opt_rbytes,
+	Opt_norbytes,
+	Opt_nocrc,
+	Opt_noasyncreaddir,
+};
+
+static match_table_t arg_tokens = {
+	{Opt_fsidmajor, "fsidmajor=%ld"},
+	{Opt_fsidminor, "fsidminor=%ld"},
+	{Opt_monport, "monport=%d"},
+	{Opt_wsize, "wsize=%d"},
+	{Opt_rsize, "rsize=%d"},
+	{Opt_osdtimeout, "osdtimeout=%d"},
+	{Opt_mount_timeout, "mount_timeout=%d"},
+	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
+	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
+	/* int args above */
+	{Opt_snapdirname, "snapdirname=%s"},
+	{Opt_secret, "secret=%s"},
+	/* string args above */
+	{Opt_ip, "ip=%s"},
+	{Opt_noshare, "noshare"},
+	{Opt_dirstat, "dirstat"},
+	{Opt_nodirstat, "nodirstat"},
+	{Opt_rbytes, "rbytes"},
+	{Opt_norbytes, "norbytes"},
+	{Opt_nocrc, "nocrc"},
+	{Opt_noasyncreaddir, "noasyncreaddir"},
+	{-1, NULL}
+};
+
+
+static int parse_mount_args(struct ceph_client *client,
+			    int flags, char *options, const char *dev_name,
+			    const char **path)
+{
+	struct ceph_mount_args *args = &client->mount_args;
+	const char *c;
+	int err;
+	substring_t argstr[MAX_OPT_ARGS];
+	int num_mon;
+	struct ceph_entity_addr mon_addr[CEPH_MAX_MON_MOUNT_ADDR];
+	int i;
+
+	dout("parse_mount_args dev_name '%s'\n", dev_name);
+	memset(args, 0, sizeof(*args));
+
+	/* start with defaults */
+	args->sb_flags = flags;
+	args->flags = CEPH_OPT_DEFAULT;
+	args->osd_timeout = 5;    /* seconds */
+	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+	args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
+	args->max_readdir = 1024;
+
+	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+	if (!dev_name)
+		return -EINVAL;
+	*path = strstr(dev_name, ":/");
+	if (*path == NULL) {
+		pr_err("device name is missing path (no :/ in %s)\n",
+		       dev_name);
+		return -EINVAL;
+	}
+
+	/* get mon ip(s) */
+	err = ceph_parse_ips(dev_name, *path, mon_addr,
+			     CEPH_MAX_MON_MOUNT_ADDR, &num_mon);
+	if (err < 0)
+		return err;
+
+	/* build initial monmap */
+	client->monc.monmap = kzalloc(sizeof(*client->monc.monmap) +
+			       num_mon*sizeof(client->monc.monmap->mon_inst[0]),
+			       GFP_KERNEL);
+	if (!client->monc.monmap)
+		return -ENOMEM;
+	for (i = 0; i < num_mon; i++) {
+		client->monc.monmap->mon_inst[i].addr = mon_addr[i];
+		client->monc.monmap->mon_inst[i].addr.erank = 0;
+		client->monc.monmap->mon_inst[i].addr.nonce = 0;
+		client->monc.monmap->mon_inst[i].name.type =
+			CEPH_ENTITY_TYPE_MON;
+		client->monc.monmap->mon_inst[i].name.num = cpu_to_le64(i);
+	}
+	client->monc.monmap->num_mon = num_mon;
+	memset(&args->my_addr.in_addr, 0, sizeof(args->my_addr.in_addr));
+
+	/* path on server */
+	*path += 2;
+	dout("server path '%s'\n", *path);
+
+	/* parse mount options */
+	while ((c = strsep(&options, ",")) != NULL) {
+		int token, intval, ret;
+		if (!*c)
+			continue;
+		token = match_token((char *)c, arg_tokens, argstr);
+		if (token < 0) {
+			pr_err("bad mount option at '%s'\n", c);
+			return -EINVAL;
+
+		}
+		if (token < Opt_ip) {
+			ret = match_int(&argstr[0], &intval);
+			if (ret < 0) {
+				pr_err("bad mount option arg (not int) "
+				       "at '%s'\n", c);
+				continue;
+			}
+			dout("got token %d intval %d\n", token, intval);
+		}
+		switch (token) {
+		case Opt_fsidmajor:
+			*(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
+			break;
+		case Opt_fsidminor:
+			*(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
+			break;
+		case Opt_ip:
+			err = ceph_parse_ips(argstr[0].from,
+					     argstr[0].to,
+					     &args->my_addr,
+					     1, NULL);
+			if (err < 0)
+				return err;
+			args->flags |= CEPH_OPT_MYIP;
+			break;
+
+		case Opt_snapdirname:
+			kfree(args->snapdir_name);
+			args->snapdir_name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			break;
+		case Opt_secret:
+			args->secret = kstrndup(argstr[0].from,
+						argstr[0].to-argstr[0].from,
+						GFP_KERNEL);
+			break;
+
+			/* misc */
+		case Opt_wsize:
+			args->wsize = intval;
+			break;
+		case Opt_rsize:
+			args->rsize = intval;
+			break;
+		case Opt_osdtimeout:
+			args->osd_timeout = intval;
+			break;
+		case Opt_mount_timeout:
+			args->mount_timeout = intval;
+			break;
+		case Opt_caps_wanted_delay_min:
+			args->caps_wanted_delay_min = intval;
+			break;
+		case Opt_caps_wanted_delay_max:
+			args->caps_wanted_delay_max = intval;
+			break;
+		case Opt_readdir_max_entries:
+			args->max_readdir = intval;
+			break;
+
+		case Opt_noshare:
+			args->flags |= CEPH_OPT_NOSHARE;
+			break;
+
+		case Opt_dirstat:
+			args->flags |= CEPH_OPT_DIRSTAT;
+			break;
+		case Opt_nodirstat:
+			args->flags &= ~CEPH_OPT_DIRSTAT;
+			break;
+		case Opt_rbytes:
+			args->flags |= CEPH_OPT_RBYTES;
+			break;
+		case Opt_norbytes:
+			args->flags &= ~CEPH_OPT_RBYTES;
+			break;
+		case Opt_nocrc:
+			args->flags |= CEPH_OPT_NOCRC;
+			break;
+		case Opt_noasyncreaddir:
+			args->flags |= CEPH_OPT_NOASYNCREADDIR;
+			break;
+
+		default:
+			BUG_ON(token);
+		}
+	}
+
+	return 0;
+}
+
+static void release_mount_args(struct ceph_mount_args *args)
+{
+	kfree(args->snapdir_name);
+	args->snapdir_name = NULL;
+	kfree(args->secret);
+	args->secret = NULL;
+}
+
+/*
+ * create a fresh client instance
+ */
+static struct ceph_client *ceph_create_client(void)
+{
+	struct ceph_client *client;
+	int err = -ENOMEM;
+
+	client = kzalloc(sizeof(*client), GFP_KERNEL);
+	if (client == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&client->mount_mutex);
+
+	init_waitqueue_head(&client->mount_wq);
+
+	client->sb = NULL;
+	client->mount_state = CEPH_MOUNT_MOUNTING;
+	client->whoami = -1;
+
+	client->msgr = NULL;
+
+	client->mount_err = 0;
+	client->signed_ticket = NULL;
+	client->signed_ticket_len = 0;
+
+	err = -ENOMEM;
+	client->wb_wq = create_workqueue("ceph-writeback");
+	if (client->wb_wq == NULL)
+		goto fail;
+	client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+	if (client->pg_inv_wq == NULL)
+		goto fail_wb_wq;
+	client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+	if (client->trunc_wq == NULL)
+		goto fail_pg_inv_wq;
+
+	/* subsystems */
+	err = ceph_monc_init(&client->monc, client);
+	if (err < 0)
+		goto fail_trunc_wq;
+	err = ceph_osdc_init(&client->osdc, client);
+	if (err < 0)
+		goto fail_monc;
+	ceph_mdsc_init(&client->mdsc, client);
+	return client;
+
+fail_monc:
+	ceph_monc_stop(&client->monc);
+fail_trunc_wq:
+	destroy_workqueue(client->trunc_wq);
+fail_pg_inv_wq:
+	destroy_workqueue(client->pg_inv_wq);
+fail_wb_wq:
+	destroy_workqueue(client->wb_wq);
+fail:
+	kfree(client);
+	return ERR_PTR(err);
+}
+
+static void ceph_destroy_client(struct ceph_client *client)
+{
+	dout("destroy_client %p\n", client);
+
+	/* unmount */
+	ceph_mdsc_stop(&client->mdsc);
+	ceph_monc_stop(&client->monc);
+	ceph_osdc_stop(&client->osdc);
+
+	kfree(client->signed_ticket);
+
+	ceph_debugfs_client_cleanup(client);
+	destroy_workqueue(client->wb_wq);
+	destroy_workqueue(client->pg_inv_wq);
+	destroy_workqueue(client->trunc_wq);
+
+	if (client->msgr)
+		ceph_messenger_destroy(client->msgr);
+	if (client->wb_pagevec_pool)
+		mempool_destroy(client->wb_pagevec_pool);
+
+	release_mount_args(&client->mount_args);
+
+	kfree(client);
+	dout("destroy_client %p done\n", client);
+}
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_map(struct ceph_client *client)
+{
+	return client->monc.monmap && client->monc.monmap->epoch;
+}
+
+/*
+ * Bootstrap mount by opening the root directory.  Note the mount
+ * @started time from caller, and time out if this takes too long.
+ */
+static struct dentry *open_root_dentry(struct ceph_client *client,
+				       const char *path,
+				       unsigned long started)
+{
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req = NULL;
+	int err;
+	struct dentry *root;
+
+	/* open dir */
+	dout("open_root_inode opening '%s'\n", path);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_PTR(PTR_ERR(req));
+	req->r_path1 = kstrdup(path, GFP_NOFS);
+	req->r_ino1.ino = CEPH_INO_ROOT;
+	req->r_ino1.snap = CEPH_NOSNAP;
+	req->r_started = started;
+	req->r_timeout = client->mount_args.mount_timeout * HZ;
+	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+	req->r_num_caps = 2;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	if (err == 0) {
+		dout("open_root_inode success\n");
+		if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
+		    client->sb->s_root == NULL)
+			root = d_alloc_root(req->r_target_inode);
+		else
+			root = d_obtain_alias(req->r_target_inode);
+		req->r_target_inode = NULL;
+		dout("open_root_inode success, root dentry is %p\n", root);
+	} else {
+		root = ERR_PTR(err);
+	}
+	ceph_mdsc_put_request(req);
+	return root;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+		      const char *path)
+{
+	struct ceph_entity_addr *myaddr = NULL;
+	int err;
+	unsigned long timeout = client->mount_args.mount_timeout * HZ;
+	unsigned long started = jiffies;  /* note the start time */
+	struct dentry *root;
+
+	dout("mount start\n");
+	mutex_lock(&client->mount_mutex);
+
+	/* initialize the messenger */
+	if (client->msgr == NULL) {
+		if (ceph_test_opt(client, MYIP))
+			myaddr = &client->mount_args.my_addr;
+		client->msgr = ceph_messenger_create(myaddr);
+		if (IS_ERR(client->msgr)) {
+			err = PTR_ERR(client->msgr);
+			client->msgr = NULL;
+			goto out;
+		}
+		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	}
+
+	/* send mount request, and wait for mon, mds, and osd maps */
+	err = ceph_monc_request_mount(&client->monc);
+	if (err < 0)
+		goto out;
+
+	while (!have_mon_map(client) && !client->mount_err) {
+		err = -EIO;
+		if (timeout && time_after_eq(jiffies, started + timeout))
+			goto out;
+
+		/* wait */
+		dout("mount waiting for mount\n");
+		err = wait_event_interruptible_timeout(client->mount_wq,
+			       client->mount_err || have_mon_map(client),
+			       timeout);
+		if (err == -EINTR || err == -ERESTARTSYS)
+			goto out;
+		if (client->mount_err) {
+			err = client->mount_err;
+			goto out;
+		}
+	}
+
+	dout("mount opening root\n");
+	root = open_root_dentry(client, "", started);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto out;
+	}
+	if (client->sb->s_root)
+		dput(root);
+	else
+		client->sb->s_root = root;
+
+	if (path[0] == 0) {
+		dget(root);
+	} else {
+		dout("mount opening base mountpoint\n");
+		root = open_root_dentry(client, path, started);
+		if (IS_ERR(root)) {
+			err = PTR_ERR(root);
+			dput(client->sb->s_root);
+			client->sb->s_root = NULL;
+			goto out;
+		}
+	}
+
+	mnt->mnt_root = root;
+	mnt->mnt_sb = client->sb;
+
+	client->mount_state = CEPH_MOUNT_MOUNTED;
+	dout("mount success\n");
+	err = 0;
+
+out:
+	mutex_unlock(&client->mount_mutex);
+	return err;
+}
+
+static int ceph_set_super(struct super_block *s, void *data)
+{
+	struct ceph_client *client = data;
+	int ret;
+
+	dout("set_super %p data %p\n", s, data);
+
+	s->s_flags = client->mount_args.sb_flags;
+	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
+
+	s->s_fs_info = client;
+	client->sb = s;
+
+	s->s_op = &ceph_super_ops;
+	s->s_export_op = &ceph_export_ops;
+
+	s->s_time_gran = 1000;  /* 1000 ns == 1 us */
+
+	ret = set_anon_super(s, NULL);  /* what is that second arg for? */
+	if (ret != 0)
+		goto fail;
+
+	return ret;
+
+fail:
+	s->s_fs_info = NULL;
+	client->sb = NULL;
+	return ret;
+}
+
+/*
+ * share superblock if same fs AND options
+ */
+static int ceph_compare_super(struct super_block *sb, void *data)
+{
+	struct ceph_client *new = data;
+	struct ceph_mount_args *args = &new->mount_args;
+	struct ceph_client *other = ceph_sb_to_client(sb);
+	int i;
+
+	dout("ceph_compare_super %p\n", sb);
+	if (args->flags & CEPH_OPT_FSID) {
+		if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
+			dout("fsid doesn't match\n");
+			return 0;
+		}
+	} else {
+		/* do we share (a) monitor? */
+		for (i = 0; i < new->monc.monmap->num_mon; i++)
+			if (ceph_monmap_contains(other->monc.monmap,
+					 &new->monc.monmap->mon_inst[i].addr))
+				break;
+		if (i == new->monc.monmap->num_mon) {
+			dout("mon ip not part of monmap\n");
+			return 0;
+		}
+		dout("mon ip matches existing sb %p\n", sb);
+	}
+	if (args->sb_flags != other->mount_args.sb_flags) {
+		dout("flags differ\n");
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * construct our own bdi so we can control readahead, etc.
+ */
+static int ceph_init_bdi(struct super_block *sb, struct ceph_client *client)
+{
+	int err;
+
+	err = bdi_init(&client->backing_dev_info);
+	if (err < 0)
+		return err;
+
+	/* set ra_pages based on rsize mount option? */
+	if (client->mount_args.rsize >= PAGE_CACHE_SIZE)
+		client->backing_dev_info.ra_pages =
+			(client->mount_args.rsize + PAGE_CACHE_SIZE - 1)
+			>> PAGE_SHIFT;
+
+	err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+	return err;
+}
+
+static int ceph_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name, void *data,
+		       struct vfsmount *mnt)
+{
+	struct super_block *sb;
+	struct ceph_client *client;
+	int err;
+	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
+	const char *path;
+
+	dout("ceph_get_sb\n");
+
+	/* create client (which we may/may not use) */
+	client = ceph_create_client();
+	if (IS_ERR(client))
+		return PTR_ERR(client);
+
+	err = parse_mount_args(client, flags, data, dev_name, &path);
+	if (err < 0)
+		goto out;
+
+	if (client->mount_args.flags & CEPH_OPT_NOSHARE)
+		compare_super = NULL;
+	sb = sget(fs_type, compare_super, ceph_set_super, client);
+	if (IS_ERR(sb)) {
+		err = PTR_ERR(sb);
+		goto out;
+	}
+
+	if (ceph_client(sb) != client) {
+		ceph_destroy_client(client);
+		client = ceph_client(sb);
+		dout("get_sb got existing client %p\n", client);
+	} else {
+		dout("get_sb using new client %p\n", client);
+
+		/* set up mempools */
+		err = -ENOMEM;
+		client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+			      client->mount_args.wsize >> PAGE_CACHE_SHIFT);
+		if (!client->wb_pagevec_pool)
+			goto out_splat;
+
+		err = ceph_init_bdi(sb, client);
+		if (err < 0)
+			goto out_splat;
+	}
+
+	err = ceph_mount(client, mnt, path);
+	if (err < 0)
+		goto out_splat;
+	dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
+	     mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
+	return 0;
+
+out_splat:
+	ceph_mdsc_close_sessions(&client->mdsc);
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+	goto out_final;
+
+out:
+	ceph_destroy_client(client);
+out_final:
+	dout("ceph_get_sb fail %d\n", err);
+	return err;
+}
+
+static void ceph_kill_sb(struct super_block *s)
+{
+	struct ceph_client *client = ceph_sb_to_client(s);
+	dout("kill_sb %p\n", s);
+	ceph_mdsc_pre_umount(&client->mdsc);
+	bdi_unregister(&client->backing_dev_info);
+	kill_anon_super(s);    /* will call put_super after sb is r/o */
+	bdi_destroy(&client->backing_dev_info);
+	ceph_destroy_client(client);
+}
+
+static struct file_system_type ceph_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ceph",
+	.get_sb		= ceph_get_sb,
+	.kill_sb	= ceph_kill_sb,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE,
+};
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+
+static int __init init_ceph(void)
+{
+	int ret = 0;
+
+	ret = ceph_debugfs_init();
+	if (ret < 0)
+		goto out;
+
+	ret = ceph_msgr_init();
+	if (ret < 0)
+		goto out_debugfs;
+
+	ret = init_caches();
+	if (ret)
+		goto out_msgr;
+
+	ceph_caps_init();
+
+	ret = register_filesystem(&ceph_fs_type);
+	if (ret)
+		goto out_icache;
+
+	pr_info("loaded (%s)\n", STRINGIFY(CEPH_GIT_VER));
+	return 0;
+
+out_icache:
+	destroy_caches();
+out_msgr:
+	ceph_msgr_exit();
+out_debugfs:
+	ceph_debugfs_cleanup();
+out:
+	return ret;
+}
+
+static void __exit exit_ceph(void)
+{
+	dout("exit_ceph\n");
+	unregister_filesystem(&ceph_fs_type);
+	ceph_caps_finalize();
+	destroy_caches();
+	ceph_msgr_exit();
+	ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph);
+module_exit(exit_ceph);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.2


From 355da1eb7a1f91c276b991764e951bbcd8047599 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:08 -0700
Subject: ceph: inode operations

Inode cache and inode operations.  We also include routines to
incorporate metadata structures returned by the MDS into the client
cache, and some helpers to deal with file capabilities and metadata
leases.  The bulk of that work is done by fill_inode() and
fill_trace().

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 1620 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/xattr.c |  833 ++++++++++++++++++++++++++++
 2 files changed, 2453 insertions(+)
 create mode 100644 fs/ceph/inode.c
 create mode 100644 fs/ceph/xattr.c

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..6097af790047
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1620 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/vmalloc.h>
+
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Ceph inode operations
+ *
+ * Implement basic inode helpers (get, alloc) and inode ops (getattr,
+ * setattr, etc.), xattr helpers, and helpers for assimilating
+ * metadata returned by the MDS into our cache.
+ *
+ * Also define helpers for doing asynchronous writeback, invalidation,
+ * and truncation for the benefit of those who can't afford to block
+ * (typically because they are in the message handler path).
+ */
+
+static const struct inode_operations ceph_symlink_iops;
+
+static void ceph_inode_invalidate_pages(struct work_struct *work);
+
+/*
+ * find or create an inode, given the ceph ino number
+ */
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+{
+	struct inode *inode;
+	ino_t t = ceph_vino_to_ino(vino);
+
+	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (inode->i_state & I_NEW) {
+		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
+		     inode, ceph_vinop(inode), (u64)inode->i_ino);
+		unlock_new_inode(inode);
+	}
+
+	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
+	     vino.snap, inode);
+	return inode;
+}
+
+/*
+ * get/constuct snapdir inode for a given directory
+ */
+struct inode *ceph_get_snapdir(struct inode *parent)
+{
+	struct ceph_vino vino = {
+		.ino = ceph_ino(parent),
+		.snap = CEPH_SNAPDIR,
+	};
+	struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+
+	BUG_ON(!S_ISDIR(parent->i_mode));
+	if (IS_ERR(inode))
+		return ERR_PTR(PTR_ERR(inode));
+	inode->i_mode = parent->i_mode;
+	inode->i_uid = parent->i_uid;
+	inode->i_gid = parent->i_gid;
+	inode->i_op = &ceph_dir_iops;
+	inode->i_fop = &ceph_dir_fops;
+	ceph_inode(inode)->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+	return inode;
+}
+
+const struct inode_operations ceph_file_iops = {
+	.permission = ceph_permission,
+	.setattr = ceph_setattr,
+	.getattr = ceph_getattr,
+	.setxattr = ceph_setxattr,
+	.getxattr = ceph_getxattr,
+	.listxattr = ceph_listxattr,
+	.removexattr = ceph_removexattr,
+};
+
+
+/*
+ * We use a 'frag tree' to keep track of the MDS's directory fragments
+ * for a given inode (usually there is just a single fragment).  We
+ * need to know when a child frag is delegated to a new MDS, or when
+ * it is flagged as replicated, so we can direct our requests
+ * accordingly.
+ */
+
+/*
+ * find/create a frag in the tree
+ */
+static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
+						    u32 f)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_frag *frag;
+	int c;
+
+	p = &ci->i_fragtree.rb_node;
+	while (*p) {
+		parent = *p;
+		frag = rb_entry(parent, struct ceph_inode_frag, node);
+		c = ceph_frag_compare(f, frag->frag);
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else
+			return frag;
+	}
+
+	frag = kmalloc(sizeof(*frag), GFP_NOFS);
+	if (!frag) {
+		pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
+		       "frag %x\n", &ci->vfs_inode,
+		       ceph_vinop(&ci->vfs_inode), f);
+		return ERR_PTR(-ENOMEM);
+	}
+	frag->frag = f;
+	frag->split_by = 0;
+	frag->mds = -1;
+	frag->ndist = 0;
+
+	rb_link_node(&frag->node, parent, p);
+	rb_insert_color(&frag->node, &ci->i_fragtree);
+
+	dout("get_or_create_frag added %llx.%llx frag %x\n",
+	     ceph_vinop(&ci->vfs_inode), f);
+	return frag;
+}
+
+/*
+ * find a specific frag @f
+ */
+struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
+{
+	struct rb_node *n = ci->i_fragtree.rb_node;
+
+	while (n) {
+		struct ceph_inode_frag *frag =
+			rb_entry(n, struct ceph_inode_frag, node);
+		int c = ceph_frag_compare(f, frag->frag);
+		if (c < 0)
+			n = n->rb_left;
+		else if (c > 0)
+			n = n->rb_right;
+		else
+			return frag;
+	}
+	return NULL;
+}
+
+/*
+ * Choose frag containing the given value @v.  If @pfrag is
+ * specified, copy the frag delegation info to the caller if
+ * it is present.
+ */
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+		     struct ceph_inode_frag *pfrag,
+		     int *found)
+{
+	u32 t = ceph_frag_make(0, 0);
+	struct ceph_inode_frag *frag;
+	unsigned nway, i;
+	u32 n;
+
+	if (found)
+		*found = 0;
+
+	mutex_lock(&ci->i_fragtree_mutex);
+	while (1) {
+		WARN_ON(!ceph_frag_contains_value(t, v));
+		frag = __ceph_find_frag(ci, t);
+		if (!frag)
+			break; /* t is a leaf */
+		if (frag->split_by == 0) {
+			if (pfrag)
+				memcpy(pfrag, frag, sizeof(*pfrag));
+			if (found)
+				*found = 1;
+			break;
+		}
+
+		/* choose child */
+		nway = 1 << frag->split_by;
+		dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
+		     frag->split_by, nway);
+		for (i = 0; i < nway; i++) {
+			n = ceph_frag_make_child(t, frag->split_by, i);
+			if (ceph_frag_contains_value(n, v)) {
+				t = n;
+				break;
+			}
+		}
+		BUG_ON(i == nway);
+	}
+	dout("choose_frag(%x) = %x\n", v, t);
+
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return t;
+}
+
+/*
+ * Process dirfrag (delegation) info from the mds.  Include leaf
+ * fragment in tree ONLY if ndist > 0.  Otherwise, only
+ * branches/splits are included in i_fragtree)
+ */
+static int ceph_fill_dirfrag(struct inode *inode,
+			     struct ceph_mds_reply_dirfrag *dirinfo)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_inode_frag *frag;
+	u32 id = le32_to_cpu(dirinfo->frag);
+	int mds = le32_to_cpu(dirinfo->auth);
+	int ndist = le32_to_cpu(dirinfo->ndist);
+	int i;
+	int err = 0;
+
+	mutex_lock(&ci->i_fragtree_mutex);
+	if (ndist == 0) {
+		/* no delegation info needed. */
+		frag = __ceph_find_frag(ci, id);
+		if (!frag)
+			goto out;
+		if (frag->split_by == 0) {
+			/* tree leaf, remove */
+			dout("fill_dirfrag removed %llx.%llx frag %x"
+			     " (no ref)\n", ceph_vinop(inode), id);
+			rb_erase(&frag->node, &ci->i_fragtree);
+			kfree(frag);
+		} else {
+			/* tree branch, keep and clear */
+			dout("fill_dirfrag cleared %llx.%llx frag %x"
+			     " referral\n", ceph_vinop(inode), id);
+			frag->mds = -1;
+			frag->ndist = 0;
+		}
+		goto out;
+	}
+
+
+	/* find/add this frag to store mds delegation info */
+	frag = __get_or_create_frag(ci, id);
+	if (IS_ERR(frag)) {
+		/* this is not the end of the world; we can continue
+		   with bad/inaccurate delegation info */
+		pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
+		       ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+		err = -ENOMEM;
+		goto out;
+	}
+
+	frag->mds = mds;
+	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
+	for (i = 0; i < frag->ndist; i++)
+		frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
+	dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
+	     ceph_vinop(inode), frag->frag, frag->ndist);
+
+out:
+	mutex_unlock(&ci->i_fragtree_mutex);
+	return err;
+}
+
+
+/*
+ * initialize a newly allocated inode.
+ */
+struct inode *ceph_alloc_inode(struct super_block *sb)
+{
+	struct ceph_inode_info *ci;
+	int i;
+
+	ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+	if (!ci)
+		return NULL;
+
+	dout("alloc_inode %p\n", &ci->vfs_inode);
+
+	ci->i_version = 0;
+	ci->i_time_warp_seq = 0;
+	ci->i_ceph_flags = 0;
+	ci->i_release_count = 0;
+	ci->i_symlink = NULL;
+
+	ci->i_fragtree = RB_ROOT;
+	mutex_init(&ci->i_fragtree_mutex);
+
+	ci->i_xattrs.blob = NULL;
+	ci->i_xattrs.prealloc_blob = NULL;
+	ci->i_xattrs.dirty = false;
+	ci->i_xattrs.index = RB_ROOT;
+	ci->i_xattrs.count = 0;
+	ci->i_xattrs.names_size = 0;
+	ci->i_xattrs.vals_size = 0;
+	ci->i_xattrs.version = 0;
+	ci->i_xattrs.index_version = 0;
+
+	ci->i_caps = RB_ROOT;
+	ci->i_auth_cap = NULL;
+	ci->i_dirty_caps = 0;
+	ci->i_flushing_caps = 0;
+	INIT_LIST_HEAD(&ci->i_dirty_item);
+	INIT_LIST_HEAD(&ci->i_flushing_item);
+	ci->i_cap_flush_seq = 0;
+	ci->i_cap_flush_last_tid = 0;
+	memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+	init_waitqueue_head(&ci->i_cap_wq);
+	ci->i_hold_caps_min = 0;
+	ci->i_hold_caps_max = 0;
+	INIT_LIST_HEAD(&ci->i_cap_delay_list);
+	ci->i_cap_exporting_mds = 0;
+	ci->i_cap_exporting_mseq = 0;
+	ci->i_cap_exporting_issued = 0;
+	INIT_LIST_HEAD(&ci->i_cap_snaps);
+	ci->i_head_snapc = NULL;
+	ci->i_snap_caps = 0;
+
+	for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+		ci->i_nr_by_mode[i] = 0;
+
+	ci->i_truncate_seq = 0;
+	ci->i_truncate_size = 0;
+	ci->i_truncate_pending = 0;
+
+	ci->i_max_size = 0;
+	ci->i_reported_size = 0;
+	ci->i_wanted_max_size = 0;
+	ci->i_requested_max_size = 0;
+
+	ci->i_pin_ref = 0;
+	ci->i_rd_ref = 0;
+	ci->i_rdcache_ref = 0;
+	ci->i_wr_ref = 0;
+	ci->i_wrbuffer_ref = 0;
+	ci->i_wrbuffer_ref_head = 0;
+	ci->i_shared_gen = 0;
+	ci->i_rdcache_gen = 0;
+	ci->i_rdcache_revoking = 0;
+
+	INIT_LIST_HEAD(&ci->i_unsafe_writes);
+	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+	spin_lock_init(&ci->i_unsafe_lock);
+
+	ci->i_snap_realm = NULL;
+	INIT_LIST_HEAD(&ci->i_snap_realm_item);
+	INIT_LIST_HEAD(&ci->i_snap_flush_item);
+
+	INIT_WORK(&ci->i_wb_work, ceph_inode_writeback);
+	INIT_WORK(&ci->i_pg_inv_work, ceph_inode_invalidate_pages);
+
+	INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+
+	return &ci->vfs_inode;
+}
+
+void ceph_destroy_inode(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_inode_frag *frag;
+	struct rb_node *n;
+
+	dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+	ceph_queue_caps_release(inode);
+
+	kfree(ci->i_symlink);
+	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
+		frag = rb_entry(n, struct ceph_inode_frag, node);
+		rb_erase(n, &ci->i_fragtree);
+		kfree(frag);
+	}
+
+	__ceph_destroy_xattrs(ci);
+	ceph_buffer_put(ci->i_xattrs.blob);
+	ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+
+	kmem_cache_free(ceph_inode_cachep, ci);
+}
+
+
+/*
+ * Helpers to fill in size, ctime, mtime, and atime.  We have to be
+ * careful because either the client or MDS may have more up to date
+ * info, depending on which capabilities are held, and whether
+ * time_warp_seq or truncate_seq have increased.  (Ordinarily, mtime
+ * and size are monotonically increasing, except when utimes() or
+ * truncate() increments the corresponding _seq values.)
+ */
+int ceph_fill_file_size(struct inode *inode, int issued,
+			u32 truncate_seq, u64 truncate_size, u64 size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int queue_trunc = 0;
+
+	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
+	    (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
+		dout("size %lld -> %llu\n", inode->i_size, size);
+		inode->i_size = size;
+		inode->i_blocks = (size + (1<<9) - 1) >> 9;
+		ci->i_reported_size = size;
+		if (truncate_seq != ci->i_truncate_seq) {
+			dout("truncate_seq %u -> %u\n",
+			     ci->i_truncate_seq, truncate_seq);
+			ci->i_truncate_seq = truncate_seq;
+			if (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
+				      CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
+				      CEPH_CAP_FILE_EXCL)) {
+				ci->i_truncate_pending++;
+				queue_trunc = 1;
+			}
+		}
+	}
+	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
+	    ci->i_truncate_size != truncate_size) {
+		dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
+		     truncate_size);
+		ci->i_truncate_size = truncate_size;
+	}
+	return queue_trunc;
+}
+
+void ceph_fill_file_time(struct inode *inode, int issued,
+			 u64 time_warp_seq, struct timespec *ctime,
+			 struct timespec *mtime, struct timespec *atime)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int warn = 0;
+
+	if (issued & (CEPH_CAP_FILE_EXCL|
+		      CEPH_CAP_FILE_WR|
+		      CEPH_CAP_FILE_BUFFER)) {
+		if (timespec_compare(ctime, &inode->i_ctime) > 0) {
+			dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
+			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+			     ctime->tv_sec, ctime->tv_nsec);
+			inode->i_ctime = *ctime;
+		}
+		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+			/* the MDS did a utimes() */
+			dout("mtime %ld.%09ld -> %ld.%09ld "
+			     "tw %d -> %d\n",
+			     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+			     mtime->tv_sec, mtime->tv_nsec,
+			     ci->i_time_warp_seq, (int)time_warp_seq);
+
+			inode->i_mtime = *mtime;
+			inode->i_atime = *atime;
+			ci->i_time_warp_seq = time_warp_seq;
+		} else if (time_warp_seq == ci->i_time_warp_seq) {
+			/* nobody did utimes(); take the max */
+			if (timespec_compare(mtime, &inode->i_mtime) > 0) {
+				dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
+				     inode->i_mtime.tv_sec,
+				     inode->i_mtime.tv_nsec,
+				     mtime->tv_sec, mtime->tv_nsec);
+				inode->i_mtime = *mtime;
+			}
+			if (timespec_compare(atime, &inode->i_atime) > 0) {
+				dout("atime %ld.%09ld -> %ld.%09ld inc\n",
+				     inode->i_atime.tv_sec,
+				     inode->i_atime.tv_nsec,
+				     atime->tv_sec, atime->tv_nsec);
+				inode->i_atime = *atime;
+			}
+		} else if (issued & CEPH_CAP_FILE_EXCL) {
+			/* we did a utimes(); ignore mds values */
+		} else {
+			warn = 1;
+		}
+	} else {
+		/* we have no write caps; whatever the MDS says is true */
+		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
+			inode->i_ctime = *ctime;
+			inode->i_mtime = *mtime;
+			inode->i_atime = *atime;
+			ci->i_time_warp_seq = time_warp_seq;
+		} else {
+			warn = 1;
+		}
+	}
+	if (warn) /* time_warp_seq shouldn't go backwards */
+		dout("%p mds time_warp_seq %llu < %u\n",
+		     inode, time_warp_seq, ci->i_time_warp_seq);
+}
+
+/*
+ * Populate an inode based on info from mds.  May be called on new or
+ * existing inodes.
+ */
+static int fill_inode(struct inode *inode,
+		      struct ceph_mds_reply_info_in *iinfo,
+		      struct ceph_mds_reply_dirfrag *dirinfo,
+		      struct ceph_mds_session *session,
+		      unsigned long ttl_from, int cap_fmode,
+		      struct ceph_cap_reservation *caps_reservation)
+{
+	struct ceph_mds_reply_inode *info = iinfo->in;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int i;
+	int issued, implemented;
+	struct timespec mtime, atime, ctime;
+	u32 nsplits;
+	struct ceph_buffer *xattr_blob = NULL;
+	int err = 0;
+	int queue_trunc = 0;
+
+	dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
+	     ci->i_version);
+
+	/*
+	 * prealloc xattr data, if it looks like we'll need it.  only
+	 * if len > 4 (meaning there are actually xattrs; the first 4
+	 * bytes are the xattr count).
+	 */
+	if (iinfo->xattr_len > 4) {
+		xattr_blob = ceph_buffer_new_alloc(iinfo->xattr_len, GFP_NOFS);
+		if (!xattr_blob)
+			pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+			       iinfo->xattr_len);
+	}
+
+	spin_lock(&inode->i_lock);
+
+	/*
+	 * provided version will be odd if inode value is projected,
+	 * even if stable.  skip the update if we have a newer info
+	 * (e.g., due to inode info racing form multiple MDSs), or if
+	 * we are getting projected (unstable) inode info.
+	 */
+	if (le64_to_cpu(info->version) > 0 &&
+	    (ci->i_version & ~1) > le64_to_cpu(info->version))
+		goto no_change;
+
+	issued = __ceph_caps_issued(ci, &implemented);
+	issued |= implemented | __ceph_caps_dirty(ci);
+
+	/* update inode */
+	ci->i_version = le64_to_cpu(info->version);
+	inode->i_version++;
+	inode->i_rdev = le32_to_cpu(info->rdev);
+
+	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+		inode->i_mode = le32_to_cpu(info->mode);
+		inode->i_uid = le32_to_cpu(info->uid);
+		inode->i_gid = le32_to_cpu(info->gid);
+		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+		     inode->i_uid, inode->i_gid);
+	}
+
+	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+		inode->i_nlink = le32_to_cpu(info->nlink);
+
+	/* be careful with mtime, atime, size */
+	ceph_decode_timespec(&atime, &info->atime);
+	ceph_decode_timespec(&mtime, &info->mtime);
+	ceph_decode_timespec(&ctime, &info->ctime);
+	queue_trunc = ceph_fill_file_size(inode, issued,
+					  le32_to_cpu(info->truncate_seq),
+					  le64_to_cpu(info->truncate_size),
+					  S_ISDIR(inode->i_mode) ?
+					  ci->i_rbytes :
+					  le64_to_cpu(info->size));
+	ceph_fill_file_time(inode, issued,
+			    le32_to_cpu(info->time_warp_seq),
+			    &ctime, &mtime, &atime);
+
+	ci->i_max_size = le64_to_cpu(info->max_size);
+	ci->i_layout = info->layout;
+	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+
+	/* xattrs */
+	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
+	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
+		if (ci->i_xattrs.blob)
+			ceph_buffer_put(ci->i_xattrs.blob);
+		ci->i_xattrs.blob = xattr_blob;
+		if (xattr_blob)
+			memcpy(ci->i_xattrs.blob->vec.iov_base,
+			       iinfo->xattr_data, iinfo->xattr_len);
+		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+	}
+
+	inode->i_mapping->a_ops = &ceph_aops;
+	inode->i_mapping->backing_dev_info =
+		&ceph_client(inode->i_sb)->backing_dev_info;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		inode->i_op = &ceph_file_iops;
+		break;
+	case S_IFREG:
+		inode->i_op = &ceph_file_iops;
+		inode->i_fop = &ceph_file_fops;
+		break;
+	case S_IFLNK:
+		inode->i_op = &ceph_symlink_iops;
+		if (!ci->i_symlink) {
+			int symlen = iinfo->symlink_len;
+			char *sym;
+
+			BUG_ON(symlen != inode->i_size);
+			spin_unlock(&inode->i_lock);
+
+			err = -ENOMEM;
+			sym = kmalloc(symlen+1, GFP_NOFS);
+			if (!sym)
+				goto out;
+			memcpy(sym, iinfo->symlink, symlen);
+			sym[symlen] = 0;
+
+			spin_lock(&inode->i_lock);
+			if (!ci->i_symlink)
+				ci->i_symlink = sym;
+			else
+				kfree(sym); /* lost a race */
+		}
+		break;
+	case S_IFDIR:
+		inode->i_op = &ceph_dir_iops;
+		inode->i_fop = &ceph_dir_fops;
+
+		ci->i_files = le64_to_cpu(info->files);
+		ci->i_subdirs = le64_to_cpu(info->subdirs);
+		ci->i_rbytes = le64_to_cpu(info->rbytes);
+		ci->i_rfiles = le64_to_cpu(info->rfiles);
+		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+		ceph_decode_timespec(&ci->i_rctime, &info->rctime);
+
+		/* set dir completion flag? */
+		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
+		    ceph_snap(inode) == CEPH_NOSNAP &&
+		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
+			dout(" marking %p complete (empty)\n", inode);
+			ci->i_ceph_flags |= CEPH_I_COMPLETE;
+			ci->i_max_offset = 2;
+		}
+
+		/* it may be better to set st_size in getattr instead? */
+		if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
+			inode->i_size = ci->i_rbytes;
+		break;
+	default:
+		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+		       ceph_vinop(inode), inode->i_mode);
+	}
+
+no_change:
+	spin_unlock(&inode->i_lock);
+
+	/* queue truncate if we saw i_size decrease */
+	if (queue_trunc)
+		if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+			       &ci->i_vmtruncate_work))
+			igrab(inode);
+
+	/* populate frag tree */
+	/* FIXME: move me up, if/when version reflects fragtree changes */
+	nsplits = le32_to_cpu(info->fragtree.nsplits);
+	mutex_lock(&ci->i_fragtree_mutex);
+	for (i = 0; i < nsplits; i++) {
+		u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
+		struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
+
+		if (IS_ERR(frag))
+			continue;
+		frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
+		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+	}
+	mutex_unlock(&ci->i_fragtree_mutex);
+
+	/* were we issued a capability? */
+	if (info->cap.caps) {
+		if (ceph_snap(inode) == CEPH_NOSNAP) {
+			ceph_add_cap(inode, session,
+				     le64_to_cpu(info->cap.cap_id),
+				     cap_fmode,
+				     le32_to_cpu(info->cap.caps),
+				     le32_to_cpu(info->cap.wanted),
+				     le32_to_cpu(info->cap.seq),
+				     le32_to_cpu(info->cap.mseq),
+				     le64_to_cpu(info->cap.realm),
+				     info->cap.flags,
+				     caps_reservation);
+		} else {
+			spin_lock(&inode->i_lock);
+			dout(" %p got snap_caps %s\n", inode,
+			     ceph_cap_string(le32_to_cpu(info->cap.caps)));
+			ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
+			if (cap_fmode >= 0)
+				__ceph_get_fmode(ci, cap_fmode);
+			spin_unlock(&inode->i_lock);
+		}
+	}
+
+	/* update delegation info? */
+	if (dirinfo)
+		ceph_fill_dirfrag(inode, dirinfo);
+
+	err = 0;
+
+out:
+	ceph_buffer_put(xattr_blob);
+	return err;
+}
+
+/*
+ * caller should hold session s_mutex.
+ */
+static void update_dentry_lease(struct dentry *dentry,
+				struct ceph_mds_reply_lease *lease,
+				struct ceph_mds_session *session,
+				unsigned long from_time)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	long unsigned duration = le32_to_cpu(lease->duration_ms);
+	long unsigned ttl = from_time + (duration * HZ) / 1000;
+	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
+	struct inode *dir;
+
+	/* only track leases on regular dentries */
+	if (dentry->d_op != &ceph_dentry_ops)
+		return;
+
+	spin_lock(&dentry->d_lock);
+	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
+	     dentry, le16_to_cpu(lease->mask), duration, ttl);
+
+	/* make lease_rdcache_gen match directory */
+	dir = dentry->d_parent->d_inode;
+	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
+
+	if (lease->mask == 0)
+		goto out_unlock;
+
+	if (di->lease_gen == session->s_cap_gen &&
+	    time_before(ttl, dentry->d_time))
+		goto out_unlock;  /* we already have a newer lease. */
+
+	if (di->lease_session && di->lease_session != session)
+		goto out_unlock;
+
+	ceph_dentry_lru_touch(dentry);
+
+	if (!di->lease_session)
+		di->lease_session = ceph_get_mds_session(session);
+	di->lease_gen = session->s_cap_gen;
+	di->lease_seq = le32_to_cpu(lease->seq);
+	di->lease_renew_after = half_ttl;
+	di->lease_renew_from = 0;
+	dentry->d_time = ttl;
+out_unlock:
+	spin_unlock(&dentry->d_lock);
+	return;
+}
+
+/*
+ * splice a dentry to an inode.
+ * caller must hold directory i_mutex for this to be safe.
+ *
+ * we will only rehash the resulting dentry if @prehash is
+ * true; @prehash will be set to false (for the benefit of
+ * the caller) if we fail.
+ */
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
+				    bool *prehash)
+{
+	struct dentry *realdn;
+
+	/* dn must be unhashed */
+	if (!d_unhashed(dn))
+		d_drop(dn);
+	realdn = d_materialise_unique(dn, in);
+	if (IS_ERR(realdn)) {
+		pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
+		       dn, in, ceph_vinop(in));
+		if (prehash)
+			*prehash = false; /* don't rehash on error */
+		dn = realdn; /* note realdn contains the error */
+		goto out;
+	} else if (realdn) {
+		dout("dn %p (%d) spliced with %p (%d) "
+		     "inode %p ino %llx.%llx\n",
+		     dn, atomic_read(&dn->d_count),
+		     realdn, atomic_read(&realdn->d_count),
+		     realdn->d_inode, ceph_vinop(realdn->d_inode));
+		dput(dn);
+		dn = realdn;
+	} else {
+		BUG_ON(!ceph_dentry(dn));
+
+		dout("dn %p attached to %p ino %llx.%llx\n",
+		     dn, dn->d_inode, ceph_vinop(dn->d_inode));
+	}
+	if ((!prehash || *prehash) && d_unhashed(dn))
+		d_rehash(dn);
+out:
+	return dn;
+}
+
+/*
+ * Incorporate results into the local cache.  This is either just
+ * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
+ * after a lookup).
+ *
+ * A reply may contain
+ *         a directory inode along with a dentry.
+ *  and/or a target inode
+ *
+ * Called with snap_rwsem (read).
+ */
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
+		    struct ceph_mds_session *session)
+{
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	struct inode *in = NULL;
+	struct ceph_mds_reply_inode *ininfo;
+	struct ceph_vino vino;
+	int i = 0;
+	int err = 0;
+
+	dout("fill_trace %p is_dentry %d is_target %d\n", req,
+	     rinfo->head->is_dentry, rinfo->head->is_target);
+
+#if 0
+	/*
+	 * Debugging hook:
+	 *
+	 * If we resend completed ops to a recovering mds, we get no
+	 * trace.  Since that is very rare, pretend this is the case
+	 * to ensure the 'no trace' handlers in the callers behave.
+	 *
+	 * Fill in inodes unconditionally to avoid breaking cap
+	 * invariants.
+	 */
+	if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
+		pr_info("fill_trace faking empty trace on %lld %s\n",
+			req->r_tid, ceph_mds_op_name(rinfo->head->op));
+		if (rinfo->head->is_dentry) {
+			rinfo->head->is_dentry = 0;
+			err = fill_inode(req->r_locked_dir,
+					 &rinfo->diri, rinfo->dirfrag,
+					 session, req->r_request_started, -1);
+		}
+		if (rinfo->head->is_target) {
+			rinfo->head->is_target = 0;
+			ininfo = rinfo->targeti.in;
+			vino.ino = le64_to_cpu(ininfo->ino);
+			vino.snap = le64_to_cpu(ininfo->snapid);
+			in = ceph_get_inode(sb, vino);
+			err = fill_inode(in, &rinfo->targeti, NULL,
+					 session, req->r_request_started,
+					 req->r_fmode);
+			iput(in);
+		}
+	}
+#endif
+
+	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
+		dout("fill_trace reply is empty!\n");
+		if (rinfo->head->result == 0 && req->r_locked_dir) {
+			struct ceph_inode_info *ci =
+				ceph_inode(req->r_locked_dir);
+			dout(" clearing %p complete (empty trace)\n",
+			     req->r_locked_dir);
+			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+			ci->i_release_count++;
+		}
+		return 0;
+	}
+
+	if (rinfo->head->is_dentry) {
+		/*
+		 * lookup link rename   : null -> possibly existing inode
+		 * mknod symlink mkdir  : null -> new inode
+		 * unlink               : linked -> null
+		 */
+		struct inode *dir = req->r_locked_dir;
+		struct dentry *dn = req->r_dentry;
+		bool have_dir_cap, have_lease;
+
+		BUG_ON(!dn);
+		BUG_ON(!dir);
+		BUG_ON(dn->d_parent->d_inode != dir);
+		BUG_ON(ceph_ino(dir) !=
+		       le64_to_cpu(rinfo->diri.in->ino));
+		BUG_ON(ceph_snap(dir) !=
+		       le64_to_cpu(rinfo->diri.in->snapid));
+
+		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+				 session, req->r_request_started, -1,
+				 &req->r_caps_reservation);
+		if (err < 0)
+			return err;
+
+		/* do we have a lease on the whole dir? */
+		have_dir_cap =
+			(le32_to_cpu(rinfo->diri.in->cap.caps) &
+			 CEPH_CAP_FILE_SHARED);
+
+		/* do we have a dn lease? */
+		have_lease = have_dir_cap ||
+			(le16_to_cpu(rinfo->dlease->mask) &
+			 CEPH_LOCK_DN);
+
+		if (!have_lease)
+			dout("fill_trace  no dentry lease or dir cap\n");
+
+		/* rename? */
+		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+			dout(" src %p '%.*s' dst %p '%.*s'\n",
+			     req->r_old_dentry,
+			     req->r_old_dentry->d_name.len,
+			     req->r_old_dentry->d_name.name,
+			     dn, dn->d_name.len, dn->d_name.name);
+			dout("fill_trace doing d_move %p -> %p\n",
+			     req->r_old_dentry, dn);
+			d_move(req->r_old_dentry, dn);
+			dout(" src %p '%.*s' dst %p '%.*s'\n",
+			     req->r_old_dentry,
+			     req->r_old_dentry->d_name.len,
+			     req->r_old_dentry->d_name.name,
+			     dn, dn->d_name.len, dn->d_name.name);
+			/* take overwritten dentry's readdir offset */
+			ceph_dentry(req->r_old_dentry)->offset =
+				ceph_dentry(dn)->offset;
+			dn = req->r_old_dentry;  /* use old_dentry */
+			in = dn->d_inode;
+		}
+
+		/* null dentry? */
+		if (!rinfo->head->is_target) {
+			dout("fill_trace null dentry\n");
+			if (dn->d_inode) {
+				dout("d_delete %p\n", dn);
+				d_delete(dn);
+			} else {
+				dout("d_instantiate %p NULL\n", dn);
+				d_instantiate(dn, NULL);
+				if (have_lease && d_unhashed(dn))
+					d_rehash(dn);
+				update_dentry_lease(dn, rinfo->dlease,
+						    session,
+						    req->r_request_started);
+			}
+			goto done;
+		}
+
+		/* attach proper inode */
+		ininfo = rinfo->targeti.in;
+		vino.ino = le64_to_cpu(ininfo->ino);
+		vino.snap = le64_to_cpu(ininfo->snapid);
+		if (!dn->d_inode) {
+			in = ceph_get_inode(sb, vino);
+			if (IS_ERR(in)) {
+				pr_err("fill_trace bad get_inode "
+				       "%llx.%llx\n", vino.ino, vino.snap);
+				err = PTR_ERR(in);
+				d_delete(dn);
+				goto done;
+			}
+			dn = splice_dentry(dn, in, &have_lease);
+			if (IS_ERR(dn)) {
+				err = PTR_ERR(dn);
+				goto done;
+			}
+			req->r_dentry = dn;  /* may have spliced */
+			igrab(in);
+		} else if (ceph_ino(in) == vino.ino &&
+			   ceph_snap(in) == vino.snap) {
+			igrab(in);
+		} else {
+			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
+			     dn, in, ceph_ino(in), ceph_snap(in),
+			     vino.ino, vino.snap);
+			have_lease = false;
+			in = NULL;
+		}
+
+		if (have_lease)
+			update_dentry_lease(dn, rinfo->dlease, session,
+					    req->r_request_started);
+		dout(" final dn %p\n", dn);
+		i++;
+	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+		   req->r_op == CEPH_MDS_OP_MKSNAP) {
+		struct dentry *dn = req->r_dentry;
+
+		/* fill out a snapdir LOOKUPSNAP dentry */
+		BUG_ON(!dn);
+		BUG_ON(!req->r_locked_dir);
+		BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
+		ininfo = rinfo->targeti.in;
+		vino.ino = le64_to_cpu(ininfo->ino);
+		vino.snap = le64_to_cpu(ininfo->snapid);
+		in = ceph_get_inode(sb, vino);
+		if (IS_ERR(in)) {
+			pr_err("fill_inode get_inode badness %llx.%llx\n",
+			       vino.ino, vino.snap);
+			err = PTR_ERR(in);
+			d_delete(dn);
+			goto done;
+		}
+		dout(" linking snapped dir %p to dn %p\n", in, dn);
+		dn = splice_dentry(dn, in, NULL);
+		if (IS_ERR(dn)) {
+			err = PTR_ERR(dn);
+			goto done;
+		}
+		req->r_dentry = dn;  /* may have spliced */
+		igrab(in);
+		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
+	}
+
+	if (rinfo->head->is_target) {
+		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+		if (in == NULL || ceph_ino(in) != vino.ino ||
+		    ceph_snap(in) != vino.snap) {
+			in = ceph_get_inode(sb, vino);
+			if (IS_ERR(in)) {
+				err = PTR_ERR(in);
+				goto done;
+			}
+		}
+		req->r_target_inode = in;
+
+		err = fill_inode(in,
+				 &rinfo->targeti, NULL,
+				 session, req->r_request_started,
+				 (le32_to_cpu(rinfo->head->result) == 0) ?
+				 req->r_fmode : -1,
+				 &req->r_caps_reservation);
+		if (err < 0) {
+			pr_err("fill_inode badness %p %llx.%llx\n",
+			       in, ceph_vinop(in));
+			goto done;
+		}
+	}
+
+done:
+	dout("fill_trace done err=%d\n", err);
+	return err;
+}
+
+/*
+ * Prepopulate our cache with readdir results, leases, etc.
+ */
+int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+			     struct ceph_mds_session *session)
+{
+	struct dentry *parent = req->r_dentry;
+	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+	struct qstr dname;
+	struct dentry *dn;
+	struct inode *in;
+	int err = 0, i;
+	struct inode *snapdir = NULL;
+	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
+	u64 frag = le32_to_cpu(rhead->args.readdir.frag);
+	struct ceph_dentry_info *di;
+
+	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
+		snapdir = ceph_get_snapdir(parent->d_inode);
+		parent = d_find_alias(snapdir);
+		dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
+		     rinfo->dir_nr, parent);
+	} else {
+		dout("readdir_prepopulate %d items under dn %p\n",
+		     rinfo->dir_nr, parent);
+		if (rinfo->dir_dir)
+			ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
+	}
+
+	for (i = 0; i < rinfo->dir_nr; i++) {
+		struct ceph_vino vino;
+
+		dname.name = rinfo->dir_dname[i];
+		dname.len = rinfo->dir_dname_len[i];
+		dname.hash = full_name_hash(dname.name, dname.len);
+
+		vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+		vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+retry_lookup:
+		dn = d_lookup(parent, &dname);
+		dout("d_lookup on parent=%p name=%.*s got %p\n",
+		     parent, dname.len, dname.name, dn);
+
+		if (!dn) {
+			dn = d_alloc(parent, &dname);
+			dout("d_alloc %p '%.*s' = %p\n", parent,
+			     dname.len, dname.name, dn);
+			if (dn == NULL) {
+				dout("d_alloc badness\n");
+				err = -ENOMEM;
+				goto out;
+			}
+			err = ceph_init_dentry(dn);
+			if (err < 0)
+				goto out;
+		} else if (dn->d_inode &&
+			   (ceph_ino(dn->d_inode) != vino.ino ||
+			    ceph_snap(dn->d_inode) != vino.snap)) {
+			dout(" dn %p points to wrong inode %p\n",
+			     dn, dn->d_inode);
+			d_delete(dn);
+			dput(dn);
+			goto retry_lookup;
+		} else {
+			/* reorder parent's d_subdirs */
+			spin_lock(&dcache_lock);
+			spin_lock(&dn->d_lock);
+			list_move(&dn->d_u.d_child, &parent->d_subdirs);
+			spin_unlock(&dn->d_lock);
+			spin_unlock(&dcache_lock);
+		}
+
+		di = dn->d_fsdata;
+		di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+
+		/* inode */
+		if (dn->d_inode) {
+			in = dn->d_inode;
+		} else {
+			in = ceph_get_inode(parent->d_sb, vino);
+			if (in == NULL) {
+				dout("new_inode badness\n");
+				d_delete(dn);
+				dput(dn);
+				err = -ENOMEM;
+				goto out;
+			}
+			dn = splice_dentry(dn, in, NULL);
+		}
+
+		if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
+			       req->r_request_started, -1,
+			       &req->r_caps_reservation) < 0) {
+			pr_err("fill_inode badness on %p\n", in);
+			dput(dn);
+			continue;
+		}
+		update_dentry_lease(dn, rinfo->dir_dlease[i],
+				    req->r_session, req->r_request_started);
+		dput(dn);
+	}
+	req->r_did_prepopulate = true;
+
+out:
+	if (snapdir) {
+		iput(snapdir);
+		dput(parent);
+	}
+	dout("readdir_prepopulate done\n");
+	return err;
+}
+
+int ceph_inode_set_size(struct inode *inode, loff_t size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret = 0;
+
+	spin_lock(&inode->i_lock);
+	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+	inode->i_size = size;
+	inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+
+	/* tell the MDS if we are approaching max_size */
+	if ((size << 1) >= ci->i_max_size &&
+	    (ci->i_reported_size << 1) < ci->i_max_size)
+		ret = 1;
+
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+/*
+ * Write back inode data in a worker thread.  (This can't be done
+ * in the message handler context.)
+ */
+void ceph_inode_writeback(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_wb_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	dout("writeback %p\n", inode);
+	filemap_fdatawrite(&inode->i_data);
+	iput(inode);
+}
+
+/*
+ * Invalidate inode pages in a worker thread.  (This can't be done
+ * in the message handler context.)
+ */
+static void ceph_inode_invalidate_pages(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_pg_inv_work);
+	struct inode *inode = &ci->vfs_inode;
+	u32 orig_gen;
+	int check = 0;
+
+	spin_lock(&inode->i_lock);
+	dout("invalidate_pages %p gen %d revoking %d\n", inode,
+	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
+	if (ci->i_rdcache_gen == 0 ||
+	    ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+		BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
+		/* nevermind! */
+		ci->i_rdcache_revoking = 0;
+		spin_unlock(&inode->i_lock);
+		goto out;
+	}
+	orig_gen = ci->i_rdcache_gen;
+	spin_unlock(&inode->i_lock);
+
+	truncate_inode_pages(&inode->i_data, 0);
+
+	spin_lock(&inode->i_lock);
+	if (orig_gen == ci->i_rdcache_gen) {
+		dout("invalidate_pages %p gen %d successful\n", inode,
+		     ci->i_rdcache_gen);
+		ci->i_rdcache_gen = 0;
+		ci->i_rdcache_revoking = 0;
+		check = 1;
+	} else {
+		dout("invalidate_pages %p gen %d raced, gen now %d\n",
+		     inode, orig_gen, ci->i_rdcache_gen);
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (check)
+		ceph_check_caps(ci, 0, NULL);
+out:
+	iput(inode);
+}
+
+
+/*
+ * called by trunc_wq; take i_mutex ourselves
+ *
+ * We also truncate in a separate thread as well.
+ */
+void ceph_vmtruncate_work(struct work_struct *work)
+{
+	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+						  i_vmtruncate_work);
+	struct inode *inode = &ci->vfs_inode;
+
+	dout("vmtruncate_work %p\n", inode);
+	mutex_lock(&inode->i_mutex);
+	__ceph_do_pending_vmtruncate(inode);
+	mutex_unlock(&inode->i_mutex);
+	iput(inode);
+}
+
+/*
+ * called with i_mutex held.
+ *
+ * Make sure any pending truncation is applied before doing anything
+ * that may depend on it.
+ */
+void __ceph_do_pending_vmtruncate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 to;
+	int wrbuffer_refs, wake = 0;
+
+retry:
+	spin_lock(&inode->i_lock);
+	if (ci->i_truncate_pending == 0) {
+		dout("__do_pending_vmtruncate %p none pending\n", inode);
+		spin_unlock(&inode->i_lock);
+		return;
+	}
+
+	/*
+	 * make sure any dirty snapped pages are flushed before we
+	 * possibly truncate them.. so write AND block!
+	 */
+	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
+		dout("__do_pending_vmtruncate %p flushing snaps first\n",
+		     inode);
+		spin_unlock(&inode->i_lock);
+		filemap_write_and_wait_range(&inode->i_data, 0,
+					     inode->i_sb->s_maxbytes);
+		goto retry;
+	}
+
+	to = ci->i_truncate_size;
+	wrbuffer_refs = ci->i_wrbuffer_ref;
+	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+	     ci->i_truncate_pending, to);
+	spin_unlock(&inode->i_lock);
+
+	truncate_inode_pages(inode->i_mapping, to);
+
+	spin_lock(&inode->i_lock);
+	ci->i_truncate_pending--;
+	if (ci->i_truncate_pending == 0)
+		wake = 1;
+	spin_unlock(&inode->i_lock);
+
+	if (wrbuffer_refs == 0)
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+	if (wake)
+		wake_up(&ci->i_cap_wq);
+}
+
+
+/*
+ * symlinks
+ */
+static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
+	nd_set_link(nd, ci->i_symlink);
+	return NULL;
+}
+
+static const struct inode_operations ceph_symlink_iops = {
+	.readlink = generic_readlink,
+	.follow_link = ceph_sym_follow_link,
+};
+
+/*
+ * setattr
+ */
+int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct inode *parent_inode = dentry->d_parent->d_inode;
+	const unsigned int ia_valid = attr->ia_valid;
+	struct ceph_mds_request *req;
+	struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
+	int issued;
+	int release = 0, dirtied = 0;
+	int mask = 0;
+	int err = 0;
+	int queue_trunc = 0;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	__ceph_do_pending_vmtruncate(inode);
+
+	err = inode_change_ok(inode, attr);
+	if (err != 0)
+		return err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	spin_lock(&inode->i_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+	if (ia_valid & ATTR_UID) {
+		dout("setattr %p uid %d -> %d\n", inode,
+		     inode->i_uid, attr->ia_uid);
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_uid = attr->ia_uid;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   attr->ia_uid != inode->i_uid) {
+			req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
+			mask |= CEPH_SETATTR_UID;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+	if (ia_valid & ATTR_GID) {
+		dout("setattr %p gid %d -> %d\n", inode,
+		     inode->i_gid, attr->ia_gid);
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_gid = attr->ia_gid;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   attr->ia_gid != inode->i_gid) {
+			req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
+			mask |= CEPH_SETATTR_GID;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+	if (ia_valid & ATTR_MODE) {
+		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
+		     attr->ia_mode);
+		if (issued & CEPH_CAP_AUTH_EXCL) {
+			inode->i_mode = attr->ia_mode;
+			dirtied |= CEPH_CAP_AUTH_EXCL;
+		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+			   attr->ia_mode != inode->i_mode) {
+			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
+			mask |= CEPH_SETATTR_MODE;
+			release |= CEPH_CAP_AUTH_SHARED;
+		}
+	}
+
+	if (ia_valid & ATTR_ATIME) {
+		dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
+		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
+		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+		if (issued & CEPH_CAP_FILE_EXCL) {
+			ci->i_time_warp_seq++;
+			inode->i_atime = attr->ia_atime;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_WR) &&
+			   timespec_compare(&inode->i_atime,
+					    &attr->ia_atime) < 0) {
+			inode->i_atime = attr->ia_atime;
+			dirtied |= CEPH_CAP_FILE_WR;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
+			ceph_encode_timespec(&req->r_args.setattr.atime,
+					     &attr->ia_atime);
+			mask |= CEPH_SETATTR_ATIME;
+			release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+	if (ia_valid & ATTR_MTIME) {
+		dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
+		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+		if (issued & CEPH_CAP_FILE_EXCL) {
+			ci->i_time_warp_seq++;
+			inode->i_mtime = attr->ia_mtime;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_WR) &&
+			   timespec_compare(&inode->i_mtime,
+					    &attr->ia_mtime) < 0) {
+			inode->i_mtime = attr->ia_mtime;
+			dirtied |= CEPH_CAP_FILE_WR;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
+			ceph_encode_timespec(&req->r_args.setattr.mtime,
+					     &attr->ia_mtime);
+			mask |= CEPH_SETATTR_MTIME;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+	if (ia_valid & ATTR_SIZE) {
+		dout("setattr %p size %lld -> %lld\n", inode,
+		     inode->i_size, attr->ia_size);
+		if (attr->ia_size > inode->i_sb->s_maxbytes) {
+			err = -EINVAL;
+			goto out;
+		}
+		if ((issued & CEPH_CAP_FILE_EXCL) &&
+		    attr->ia_size > inode->i_size) {
+			inode->i_size = attr->ia_size;
+			if (attr->ia_size < inode->i_size) {
+				ci->i_truncate_size = attr->ia_size;
+				ci->i_truncate_pending++;
+				queue_trunc = 1;
+			}
+			inode->i_blocks =
+				(attr->ia_size + (1 << 9) - 1) >> 9;
+			inode->i_ctime = attr->ia_ctime;
+			ci->i_reported_size = attr->ia_size;
+			dirtied |= CEPH_CAP_FILE_EXCL;
+		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+			   attr->ia_size != inode->i_size) {
+			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+			req->r_args.setattr.old_size =
+				cpu_to_le64(inode->i_size);
+			mask |= CEPH_SETATTR_SIZE;
+			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+				CEPH_CAP_FILE_WR;
+		}
+	}
+
+	/* these do nothing */
+	if (ia_valid & ATTR_CTIME) {
+		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
+					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
+		dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
+		     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+		     only ? "ctime only" : "ignored");
+		inode->i_ctime = attr->ia_ctime;
+		if (only) {
+			/*
+			 * if kernel wants to dirty ctime but nothing else,
+			 * we need to choose a cap to dirty under, or do
+			 * a almost-no-op setattr
+			 */
+			if (issued & CEPH_CAP_AUTH_EXCL)
+				dirtied |= CEPH_CAP_AUTH_EXCL;
+			else if (issued & CEPH_CAP_FILE_EXCL)
+				dirtied |= CEPH_CAP_FILE_EXCL;
+			else if (issued & CEPH_CAP_XATTR_EXCL)
+				dirtied |= CEPH_CAP_XATTR_EXCL;
+			else
+				mask |= CEPH_SETATTR_CTIME;
+		}
+	}
+	if (ia_valid & ATTR_FILE)
+		dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+
+	if (dirtied) {
+		__ceph_mark_dirty_caps(ci, dirtied);
+		inode->i_ctime = CURRENT_TIME;
+	}
+
+	release &= issued;
+	spin_unlock(&inode->i_lock);
+
+	if (queue_trunc)
+		__ceph_do_pending_vmtruncate(inode);
+
+	if (mask) {
+		req->r_inode = igrab(inode);
+		req->r_inode_drop = release;
+		req->r_args.setattr.mask = cpu_to_le32(mask);
+		req->r_num_caps = 1;
+		err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	}
+	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
+	     ceph_cap_string(dirtied), mask);
+
+	ceph_mdsc_put_request(req);
+	__ceph_do_pending_vmtruncate(inode);
+	return err;
+out:
+	spin_unlock(&inode->i_lock);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Verify that we have a lease on the given mask.  If not,
+ * do a getattr against an mds.
+ */
+int ceph_do_getattr(struct inode *inode, int mask)
+{
+	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(inode) == CEPH_SNAPDIR) {
+		dout("do_getattr inode %p SNAPDIR\n", inode);
+		return 0;
+	}
+
+	dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+	if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
+		return 0;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+	req->r_num_caps = 1;
+	req->r_args.getattr.mask = cpu_to_le32(mask);
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	ceph_mdsc_put_request(req);
+	dout("do_getattr result=%d\n", err);
+	return err;
+}
+
+
+/*
+ * Check inode permissions.  We verify we have a valid value for
+ * the AUTH cap, then call the generic handler.
+ */
+int ceph_permission(struct inode *inode, int mask)
+{
+	int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+
+	if (!err)
+		err = generic_permission(inode, mask, NULL);
+	return err;
+}
+
+/*
+ * Get all attributes.  Hopefully somedata we'll have a statlite()
+ * and can limit the fields we require to be accurate.
+ */
+int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		 struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+
+	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
+	if (!err) {
+		generic_fillattr(inode, stat);
+		stat->ino = inode->i_ino;
+		if (ceph_snap(inode) != CEPH_NOSNAP)
+			stat->dev = ceph_snap(inode);
+		else
+			stat->dev = 0;
+		if (S_ISDIR(inode->i_mode))
+			stat->blksize = 65536;
+	}
+	return err;
+}
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..8eaac04d1b87
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,833 @@
+#include "ceph_debug.h"
+#include "super.h"
+#include "decode.h"
+
+#include <linux/xattr.h>
+
+static bool ceph_is_valid_xattr(const char *name)
+{
+	return !strncmp(name, XATTR_SECURITY_PREFIX,
+			XATTR_SECURITY_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+/*
+ * These define virtual xattrs exposing the recursive directory
+ * statistics and layout metadata.
+ */
+struct ceph_vxattr_cb {
+	bool readonly;
+	char *name;
+	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+			      size_t size);
+};
+
+/* directories */
+
+static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
+				      size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_files);
+}
+
+static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
+					 size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rfiles);
+}
+
+static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
+					 size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%lld", ci->i_rbytes);
+}
+
+static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
+				       size_t size)
+{
+	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+			(long)ci->i_rctime.tv_nsec);
+}
+
+static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
+	{ true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
+	{ true, "user.ceph.dir.files", ceph_vxattrcb_files},
+	{ true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+	{ true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
+	{ true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+	{ true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+	{ true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+	{ true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
+	{ true, NULL, NULL }
+};
+
+/* files */
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+				   size_t size)
+{
+	return snprintf(val, size,
+		"chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
+		(unsigned long long)ceph_file_layout_su(ci->i_layout),
+		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+}
+
+static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+	{ true, "user.ceph.layout", ceph_vxattrcb_layout},
+	{ NULL, NULL }
+};
+
+static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode))
+		return ceph_dir_vxattrs;
+	else if (S_ISREG(inode->i_mode))
+		return ceph_file_vxattrs;
+	return NULL;
+}
+
+static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+						const char *name)
+{
+	do {
+		if (strcmp(vxattr->name, name) == 0)
+			return vxattr;
+		vxattr++;
+	} while (vxattr->name);
+	return NULL;
+}
+
+static int __set_xattr(struct ceph_inode_info *ci,
+			   const char *name, int name_len,
+			   const char *val, int val_len,
+			   int dirty,
+			   int should_free_name, int should_free_val,
+			   struct ceph_inode_xattr **newxattr)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int c;
+	int new = 0;
+
+	p = &ci->i_xattrs.index.rb_node;
+	while (*p) {
+		parent = *p;
+		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+		c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else {
+			if (name_len == xattr->name_len)
+				break;
+			else if (name_len < xattr->name_len)
+				p = &(*p)->rb_left;
+			else
+				p = &(*p)->rb_right;
+		}
+		xattr = NULL;
+	}
+
+	if (!xattr) {
+		new = 1;
+		xattr = *newxattr;
+		xattr->name = name;
+		xattr->name_len = name_len;
+		xattr->should_free_name = should_free_name;
+
+		ci->i_xattrs.count++;
+		dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+	} else {
+		kfree(*newxattr);
+		*newxattr = NULL;
+		if (xattr->should_free_val)
+			kfree((void *)xattr->val);
+
+		if (should_free_name) {
+			kfree((void *)name);
+			name = xattr->name;
+		}
+		ci->i_xattrs.names_size -= xattr->name_len;
+		ci->i_xattrs.vals_size -= xattr->val_len;
+	}
+	if (!xattr) {
+		pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
+		       &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
+		       xattr->val);
+		return -ENOMEM;
+	}
+	ci->i_xattrs.names_size += name_len;
+	ci->i_xattrs.vals_size += val_len;
+	if (val)
+		xattr->val = val;
+	else
+		xattr->val = "";
+
+	xattr->val_len = val_len;
+	xattr->dirty = dirty;
+	xattr->should_free_val = (val && should_free_val);
+
+	if (new) {
+		rb_link_node(&xattr->node, parent, p);
+		rb_insert_color(&xattr->node, &ci->i_xattrs.index);
+		dout("__set_xattr_val p=%p\n", p);
+	}
+
+	dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
+	     ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+
+	return 0;
+}
+
+static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
+			   const char *name)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int c;
+
+	p = &ci->i_xattrs.index.rb_node;
+	while (*p) {
+		parent = *p;
+		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+		c = strncmp(name, xattr->name, xattr->name_len);
+		if (c < 0)
+			p = &(*p)->rb_left;
+		else if (c > 0)
+			p = &(*p)->rb_right;
+		else {
+			dout("__get_xattr %s: found %.*s\n", name,
+			     xattr->val_len, xattr->val);
+			return xattr;
+		}
+	}
+
+	dout("__get_xattr %s: not found\n", name);
+
+	return NULL;
+}
+
+static void __free_xattr(struct ceph_inode_xattr *xattr)
+{
+	BUG_ON(!xattr);
+
+	if (xattr->should_free_name)
+		kfree((void *)xattr->name);
+	if (xattr->should_free_val)
+		kfree((void *)xattr->val);
+
+	kfree(xattr);
+}
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+			  struct ceph_inode_xattr *xattr)
+{
+	if (!xattr)
+		return -EOPNOTSUPP;
+
+	rb_erase(&xattr->node, &ci->i_xattrs.index);
+
+	if (xattr->should_free_name)
+		kfree((void *)xattr->name);
+	if (xattr->should_free_val)
+		kfree((void *)xattr->val);
+
+	ci->i_xattrs.names_size -= xattr->name_len;
+	ci->i_xattrs.vals_size -= xattr->val_len;
+	ci->i_xattrs.count--;
+	kfree(xattr);
+
+	return 0;
+}
+
+static int __remove_xattr_by_name(struct ceph_inode_info *ci,
+			   const char *name)
+{
+	struct rb_node **p;
+	struct ceph_inode_xattr *xattr;
+	int err;
+
+	p = &ci->i_xattrs.index.rb_node;
+	xattr = __get_xattr(ci, name);
+	err = __remove_xattr(ci, xattr);
+	return err;
+}
+
+static char *__copy_xattr_names(struct ceph_inode_info *ci,
+				char *dest)
+{
+	struct rb_node *p;
+	struct ceph_inode_xattr *xattr = NULL;
+
+	p = rb_first(&ci->i_xattrs.index);
+	dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+
+	while (p) {
+		xattr = rb_entry(p, struct ceph_inode_xattr, node);
+		memcpy(dest, xattr->name, xattr->name_len);
+		dest[xattr->name_len] = '\0';
+
+		dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+		     xattr->name_len, ci->i_xattrs.names_size);
+
+		dest += xattr->name_len + 1;
+		p = rb_next(p);
+	}
+
+	return dest;
+}
+
+void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
+{
+	struct rb_node *p, *tmp;
+	struct ceph_inode_xattr *xattr = NULL;
+
+	p = rb_first(&ci->i_xattrs.index);
+
+	dout("__ceph_destroy_xattrs p=%p\n", p);
+
+	while (p) {
+		xattr = rb_entry(p, struct ceph_inode_xattr, node);
+		tmp = p;
+		p = rb_next(tmp);
+		dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
+		     xattr->name_len, xattr->name);
+		rb_erase(tmp, &ci->i_xattrs.index);
+
+		__free_xattr(xattr);
+	}
+
+	ci->i_xattrs.names_size = 0;
+	ci->i_xattrs.vals_size = 0;
+	ci->i_xattrs.index_version = 0;
+	ci->i_xattrs.count = 0;
+	ci->i_xattrs.index = RB_ROOT;
+}
+
+static int __build_xattrs(struct inode *inode)
+{
+	u32 namelen;
+	u32 numattr = 0;
+	void *p, *end;
+	u32 len;
+	const char *name, *val;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int xattr_version;
+	struct ceph_inode_xattr **xattrs = NULL;
+	int err;
+	int i;
+
+	dout("__build_xattrs() len=%d\n",
+	     ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+
+	if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
+		return 0; /* already built */
+
+	__ceph_destroy_xattrs(ci);
+
+start:
+	/* updated internal xattr rb tree */
+	if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
+		p = ci->i_xattrs.blob->vec.iov_base;
+		end = p + ci->i_xattrs.blob->vec.iov_len;
+		ceph_decode_32_safe(&p, end, numattr, bad);
+		xattr_version = ci->i_xattrs.version;
+		spin_unlock(&inode->i_lock);
+
+		xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
+				 GFP_NOFS);
+		err = -ENOMEM;
+		if (!xattrs)
+			goto bad_lock;
+		memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
+		for (i = 0; i < numattr; i++) {
+			xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
+					    GFP_NOFS);
+			if (!xattrs[i])
+				goto bad_lock;
+		}
+
+		spin_lock(&inode->i_lock);
+		if (ci->i_xattrs.version != xattr_version) {
+			/* lost a race, retry */
+			for (i = 0; i < numattr; i++)
+				kfree(xattrs[i]);
+			kfree(xattrs);
+			goto start;
+		}
+		err = -EIO;
+		while (numattr--) {
+			ceph_decode_32_safe(&p, end, len, bad);
+			namelen = len;
+			name = p;
+			p += len;
+			ceph_decode_32_safe(&p, end, len, bad);
+			val = p;
+			p += len;
+
+			err = __set_xattr(ci, name, namelen, val, len,
+					  0, 0, 0, &xattrs[numattr]);
+
+			if (err < 0)
+				goto bad;
+		}
+		kfree(xattrs);
+	}
+	ci->i_xattrs.index_version = ci->i_xattrs.version;
+	ci->i_xattrs.dirty = false;
+
+	return err;
+bad_lock:
+	spin_lock(&inode->i_lock);
+bad:
+	if (xattrs) {
+		for (i = 0; i < numattr; i++)
+			kfree(xattrs[i]);
+		kfree(xattrs);
+	}
+	ci->i_xattrs.names_size = 0;
+	return err;
+}
+
+static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
+				    int val_size)
+{
+	/*
+	 * 4 bytes for the length, and additional 4 bytes per each xattr name,
+	 * 4 bytes per each value
+	 */
+	int size = 4 + ci->i_xattrs.count*(4 + 4) +
+			     ci->i_xattrs.names_size +
+			     ci->i_xattrs.vals_size;
+	dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
+	     ci->i_xattrs.count, ci->i_xattrs.names_size,
+	     ci->i_xattrs.vals_size);
+
+	if (name_size)
+		size += 4 + 4 + name_size + val_size;
+
+	return size;
+}
+
+/*
+ * If there are dirty xattrs, reencode xattrs into the prealloc_blob
+ * and swap into place.
+ */
+void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+{
+	struct rb_node *p;
+	struct ceph_inode_xattr *xattr = NULL;
+	void *dest;
+
+	dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+	if (ci->i_xattrs.dirty) {
+		int need = __get_required_blob_size(ci, 0, 0);
+
+		BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
+
+		p = rb_first(&ci->i_xattrs.index);
+		dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+		ceph_encode_32(&dest, ci->i_xattrs.count);
+		while (p) {
+			xattr = rb_entry(p, struct ceph_inode_xattr, node);
+
+			ceph_encode_32(&dest, xattr->name_len);
+			memcpy(dest, xattr->name, xattr->name_len);
+			dest += xattr->name_len;
+			ceph_encode_32(&dest, xattr->val_len);
+			memcpy(dest, xattr->val, xattr->val_len);
+			dest += xattr->val_len;
+
+			p = rb_next(p);
+		}
+
+		/* adjust buffer len; it may be larger than we need */
+		ci->i_xattrs.prealloc_blob->vec.iov_len =
+			dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+		ceph_buffer_put(ci->i_xattrs.blob);
+		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
+		ci->i_xattrs.prealloc_blob = NULL;
+		ci->i_xattrs.dirty = false;
+	}
+}
+
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+		      size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int err;
+	struct ceph_inode_xattr *xattr;
+	struct ceph_vxattr_cb *vxattr = NULL;
+
+	if (!ceph_is_valid_xattr(name))
+		return -ENODATA;
+
+	/* let's see if a virtual xattr was requested */
+	if (vxattrs)
+		vxattr = ceph_match_vxattr(vxattrs, name);
+
+	spin_lock(&inode->i_lock);
+	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+	     ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
+	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
+		goto get_xattr;
+	} else {
+		spin_unlock(&inode->i_lock);
+		/* get xattrs from mds (if we don't already have them) */
+		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+		if (err)
+			return err;
+	}
+
+	spin_lock(&inode->i_lock);
+
+	if (vxattr && vxattr->readonly) {
+		err = vxattr->getxattr_cb(ci, value, size);
+		goto out;
+	}
+
+	err = __build_xattrs(inode);
+	if (err < 0)
+		goto out;
+
+get_xattr:
+	err = -ENODATA;  /* == ENOATTR */
+	xattr = __get_xattr(ci, name);
+	if (!xattr) {
+		if (vxattr)
+			err = vxattr->getxattr_cb(ci, value, size);
+		goto out;
+	}
+
+	err = -ERANGE;
+	if (size && size < xattr->val_len)
+		goto out;
+
+	err = xattr->val_len;
+	if (size == 0)
+		goto out;
+
+	memcpy(value, xattr->val, xattr->val_len);
+
+out:
+	spin_unlock(&inode->i_lock);
+	return err;
+}
+
+ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	u32 vir_namelen = 0;
+	u32 namelen;
+	int err;
+	u32 len;
+	int i;
+
+	spin_lock(&inode->i_lock);
+	dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
+	     ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
+	    (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
+		goto list_xattr;
+	} else {
+		spin_unlock(&inode->i_lock);
+		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+		if (err)
+			return err;
+	}
+
+	spin_lock(&inode->i_lock);
+
+	err = __build_xattrs(inode);
+	if (err < 0)
+		goto out;
+
+list_xattr:
+	vir_namelen = 0;
+	/* include virtual dir xattrs */
+	if (vxattrs)
+		for (i = 0; vxattrs[i].name; i++)
+			vir_namelen += strlen(vxattrs[i].name) + 1;
+	/* adding 1 byte per each variable due to the null termination */
+	namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+	err = -ERANGE;
+	if (size && namelen > size)
+		goto out;
+
+	err = namelen;
+	if (size == 0)
+		goto out;
+
+	names = __copy_xattr_names(ci, names);
+
+	/* virtual xattr names, too */
+	if (vxattrs)
+		for (i = 0; vxattrs[i].name; i++) {
+			len = sprintf(names, "%s", vxattrs[i].name);
+			names += len + 1;
+		}
+
+out:
+	spin_unlock(&inode->i_lock);
+	return err;
+}
+
+static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
+			      const char *value, size_t size, int flags)
+{
+	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct ceph_mds_request *req;
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	int err;
+	int i, nr_pages;
+	struct page **pages = NULL;
+	void *kaddr;
+
+	/* copy value into some pages */
+	nr_pages = calc_pages_for(0, size);
+	if (nr_pages) {
+		pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
+		if (!pages)
+			return -ENOMEM;
+		err = -ENOMEM;
+		for (i = 0; i < nr_pages; i++) {
+			pages[i] = alloc_page(GFP_NOFS);
+			if (!pages[i]) {
+				nr_pages = i;
+				goto out;
+			}
+			kaddr = kmap(pages[i]);
+			memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
+			       min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
+		}
+	}
+
+	dout("setxattr value=%.*s\n", (int)size, value);
+
+	/* do request */
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+	req->r_num_caps = 1;
+	req->r_args.setxattr.flags = cpu_to_le32(flags);
+	req->r_path2 = kstrdup(name, GFP_NOFS);
+
+	req->r_pages = pages;
+	req->r_num_pages = nr_pages;
+	req->r_data_len = size;
+
+	dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	ceph_mdsc_put_request(req);
+	dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+
+out:
+	if (pages) {
+		for (i = 0; i < nr_pages; i++)
+			__free_page(pages[i]);
+		kfree(pages);
+	}
+	return err;
+}
+
+int ceph_setxattr(struct dentry *dentry, const char *name,
+		  const void *value, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int err;
+	int name_len = strlen(name);
+	int val_len = size;
+	char *newname = NULL;
+	char *newval = NULL;
+	struct ceph_inode_xattr *xattr = NULL;
+	int issued;
+	int required_blob_size;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!ceph_is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	if (vxattrs) {
+		struct ceph_vxattr_cb *vxattr =
+			ceph_match_vxattr(vxattrs, name);
+		if (vxattr && vxattr->readonly)
+			return -EOPNOTSUPP;
+	}
+
+	/* preallocate memory for xattr name, value, index node */
+	err = -ENOMEM;
+	newname = kmalloc(name_len + 1, GFP_NOFS);
+	if (!newname)
+		goto out;
+	memcpy(newname, name, name_len + 1);
+
+	if (val_len) {
+		newval = kmalloc(val_len + 1, GFP_NOFS);
+		if (!newval)
+			goto out;
+		memcpy(newval, value, val_len);
+		newval[val_len] = '\0';
+	}
+
+	xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
+	if (!xattr)
+		goto out;
+
+	spin_lock(&inode->i_lock);
+retry:
+	issued = __ceph_caps_issued(ci, NULL);
+	if (!(issued & CEPH_CAP_XATTR_EXCL))
+		goto do_sync;
+	__build_xattrs(inode);
+
+	required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+
+	if (!ci->i_xattrs.prealloc_blob ||
+	    required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+		struct ceph_buffer *blob = NULL;
+
+		spin_unlock(&inode->i_lock);
+		dout(" preaallocating new blob size=%d\n", required_blob_size);
+		blob = ceph_buffer_new_alloc(required_blob_size, GFP_NOFS);
+		if (!blob)
+			goto out;
+		spin_lock(&inode->i_lock);
+		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+		ci->i_xattrs.prealloc_blob = blob;
+		goto retry;
+	}
+
+	dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
+	err = __set_xattr(ci, newname, name_len, newval,
+			  val_len, 1, 1, 1, &xattr);
+	__ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+	ci->i_xattrs.dirty = true;
+	inode->i_ctime = CURRENT_TIME;
+	spin_unlock(&inode->i_lock);
+
+	return err;
+
+do_sync:
+	spin_unlock(&inode->i_lock);
+	err = ceph_sync_setxattr(dentry, name, value, size, flags);
+out:
+	kfree(newname);
+	kfree(newval);
+	kfree(xattr);
+	return err;
+}
+
+static int ceph_send_removexattr(struct dentry *dentry, const char *name)
+{
+	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct inode *inode = dentry->d_inode;
+	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct ceph_mds_request *req;
+	int err;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+	req->r_num_caps = 1;
+	req->r_path2 = kstrdup(name, GFP_NOFS);
+
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+	int issued;
+	int err;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (!ceph_is_valid_xattr(name))
+		return -EOPNOTSUPP;
+
+	if (vxattrs) {
+		struct ceph_vxattr_cb *vxattr =
+			ceph_match_vxattr(vxattrs, name);
+		if (vxattr && vxattr->readonly)
+			return -EOPNOTSUPP;
+	}
+
+	spin_lock(&inode->i_lock);
+	__build_xattrs(inode);
+	issued = __ceph_caps_issued(ci, NULL);
+	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+	if (!(issued & CEPH_CAP_XATTR_EXCL))
+		goto do_sync;
+
+	err = __remove_xattr_by_name(ceph_inode(inode), name);
+	__ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+	ci->i_xattrs.dirty = true;
+	inode->i_ctime = CURRENT_TIME;
+
+	spin_unlock(&inode->i_lock);
+
+	return err;
+do_sync:
+	spin_unlock(&inode->i_lock);
+	err = ceph_send_removexattr(dentry, name);
+	return err;
+}
+
-- 
cgit v1.2.2


From 2817b000b02c5f0c05af67c01fb2684e1381d6ef Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:08 -0700
Subject: ceph: directory operations

Directory operations, including lookup, are defined here.  We take
advantage of lookup intents when possible.  For the most part, we just
need to build the proper requests for the metadata server(s) and
pass things off to the mds_client.

The results of most operations are normally incorporated into the
client's cache when the reply is parsed by ceph_fill_trace().
However, if the MDS replies without a trace (e.g., when retrying an
update after an MDS failure recovery), some operation-specific cleanup
may be needed.

We can validate cached dentries in two ways.  A per-dentry lease may
be issued by the MDS, or a per-directory cap may be issued that acts
as a lease on the entire directory.  In the latter case, a 'gen' value
is used to determine which dentries belong to the currently leased
directory contents.

We normally prepopulate the dcache and icache with readdir results.
This makes subsequent lookups and getattrs avoid any server
interaction.  It also lets us satisfy readdir operation by peeking at
the dcache IFF we hold the per-directory cap/lease, previously
performed a readdir, and haven't dropped any of the resulting
dentries.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 1212 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1212 insertions(+)
 create mode 100644 fs/ceph/dir.c

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..7bb8db524e58
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1212 @@
+#include "ceph_debug.h"
+
+#include <linux/spinlock.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include "super.h"
+
+/*
+ * Directory operations: readdir, lookup, create, link, unlink,
+ * rename, etc.
+ */
+
+/*
+ * Ceph MDS operations are specified in terms of a base ino and
+ * relative path.  Thus, the client can specify an operation on a
+ * specific inode (e.g., a getattr due to fstat(2)), or as a path
+ * relative to, say, the root directory.
+ *
+ * Normally, we limit ourselves to strict inode ops (no path component)
+ * or dentry operations (a single path component relative to an ino).  The
+ * exception to this is open_root_dentry(), which will open the mount
+ * point by name.
+ */
+
+const struct inode_operations ceph_dir_iops;
+const struct file_operations ceph_dir_fops;
+struct dentry_operations ceph_dentry_ops;
+
+/*
+ * Initialize ceph dentry state.
+ */
+int ceph_init_dentry(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+
+	if (dentry->d_fsdata)
+		return 0;
+
+	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+		dentry->d_op = &ceph_dentry_ops;
+	else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+		dentry->d_op = &ceph_snapdir_dentry_ops;
+	else
+		dentry->d_op = &ceph_snap_dentry_ops;
+
+	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+	if (!di)
+		return -ENOMEM;          /* oh well */
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_fsdata) /* lost a race */
+		goto out_unlock;
+	di->dentry = dentry;
+	di->lease_session = NULL;
+	dentry->d_fsdata = di;
+	dentry->d_time = jiffies;
+	ceph_dentry_lru_add(dentry);
+out_unlock:
+	spin_unlock(&dentry->d_lock);
+	return 0;
+}
+
+
+
+/*
+ * for readdir, we encode the directory frag and offset within that
+ * frag into f_pos.
+ */
+static unsigned fpos_frag(loff_t p)
+{
+	return p >> 32;
+}
+static unsigned fpos_off(loff_t p)
+{
+	return p & 0xffffffff;
+}
+
+/*
+ * When possible, we try to satisfy a readdir by peeking at the
+ * dcache.  We make this work by carefully ordering dentries on
+ * d_u.d_child when we initially get results back from the MDS, and
+ * falling back to a "normal" sync readdir if any dentries in the dir
+ * are dropped.
+ *
+ * I_COMPLETE tells indicates we have all dentries in the dir.  It is
+ * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
+ * the MDS if/when the directory is modified).
+ */
+static int __dcache_readdir(struct file *filp,
+			    void *dirent, filldir_t filldir)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_file_info *fi = filp->private_data;
+	struct dentry *parent = filp->f_dentry;
+	struct inode *dir = parent->d_inode;
+	struct list_head *p;
+	struct dentry *dentry, *last;
+	struct ceph_dentry_info *di;
+	int err = 0;
+
+	/* claim ref on last dentry we returned */
+	last = fi->dentry;
+	fi->dentry = NULL;
+
+	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+	     last);
+
+	spin_lock(&dcache_lock);
+
+	/* start at beginning? */
+	if (filp->f_pos == 2 || (last &&
+				 filp->f_pos < ceph_dentry(last)->offset)) {
+		if (list_empty(&parent->d_subdirs))
+			goto out_unlock;
+		p = parent->d_subdirs.prev;
+		dout(" initial p %p/%p\n", p->prev, p->next);
+	} else {
+		p = last->d_u.d_child.prev;
+	}
+
+more:
+	dentry = list_entry(p, struct dentry, d_u.d_child);
+	di = ceph_dentry(dentry);
+	while (1) {
+		dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+		     parent->d_subdirs.prev, parent->d_subdirs.next);
+		if (p == &parent->d_subdirs) {
+			fi->at_end = 1;
+			goto out_unlock;
+		}
+		if (!d_unhashed(dentry) && dentry->d_inode &&
+		    filp->f_pos <= di->offset)
+			break;
+		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
+		     dentry->d_name.len, dentry->d_name.name, di->offset,
+		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+		     !dentry->d_inode ? " null" : "");
+		p = p->prev;
+		dentry = list_entry(p, struct dentry, d_u.d_child);
+		di = ceph_dentry(dentry);
+	}
+
+	atomic_inc(&dentry->d_count);
+	spin_unlock(&dcache_lock);
+	spin_unlock(&inode->i_lock);
+
+	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+	filp->f_pos = di->offset;
+	err = filldir(dirent, dentry->d_name.name,
+		      dentry->d_name.len, di->offset,
+		      dentry->d_inode->i_ino,
+		      dentry->d_inode->i_mode >> 12);
+
+	if (last) {
+		if (err < 0) {
+			/* remember our position */
+			fi->dentry = last;
+			fi->next_offset = di->offset;
+		} else {
+			dput(last);
+		}
+		last = NULL;
+	}
+
+	spin_lock(&inode->i_lock);
+	spin_lock(&dcache_lock);
+
+	if (err < 0)
+		goto out_unlock;
+
+	last = dentry;
+
+	p = p->prev;
+	filp->f_pos++;
+
+	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
+		goto more;
+	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+	err = -EAGAIN;
+
+out_unlock:
+	spin_unlock(&dcache_lock);
+
+	if (last) {
+		spin_unlock(&inode->i_lock);
+		dput(last);
+		spin_lock(&inode->i_lock);
+	}
+
+	return err;
+}
+
+/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+			    int len)
+{
+	kfree(fi->last_name);
+	fi->last_name = kmalloc(len+1, GFP_NOFS);
+	if (!fi->last_name)
+		return -ENOMEM;
+	memcpy(fi->last_name, name, len);
+	fi->last_name[len] = 0;
+	dout("note_last_dentry '%s'\n", fi->last_name);
+	return 0;
+}
+
+static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct ceph_file_info *fi = filp->private_data;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	unsigned frag = fpos_frag(filp->f_pos);
+	int off = fpos_off(filp->f_pos);
+	int err;
+	u32 ftype;
+	struct ceph_mds_reply_info_parsed *rinfo;
+	const int max_entries = client->mount_args.max_readdir;
+
+	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+	if (fi->at_end)
+		return 0;
+
+	/* always start with . and .. */
+	if (filp->f_pos == 0) {
+		/* note dir version at start of readdir so we can tell
+		 * if any dentries get dropped */
+		fi->dir_release_count = ci->i_release_count;
+
+		dout("readdir off 0 -> '.'\n");
+		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+			    inode->i_ino, inode->i_mode >> 12) < 0)
+			return 0;
+		filp->f_pos = 1;
+		off = 1;
+	}
+	if (filp->f_pos == 1) {
+		dout("readdir off 1 -> '..'\n");
+		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+			    filp->f_dentry->d_parent->d_inode->i_ino,
+			    inode->i_mode >> 12) < 0)
+			return 0;
+		filp->f_pos = 2;
+		off = 2;
+	}
+
+	/* can we use the dcache? */
+	spin_lock(&inode->i_lock);
+	if ((filp->f_pos == 2 || fi->dentry) &&
+	    !ceph_test_opt(client, NOASYNCREADDIR) &&
+	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		err = __dcache_readdir(filp, dirent, filldir);
+		if (err != -EAGAIN) {
+			spin_unlock(&inode->i_lock);
+			return err;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	if (fi->dentry) {
+		err = note_last_dentry(fi, fi->dentry->d_name.name,
+				       fi->dentry->d_name.len);
+		if (err)
+			return err;
+		dput(fi->dentry);
+		fi->dentry = NULL;
+	}
+
+	/* proceed with a normal readdir */
+
+more:
+	/* do we have the correct frag content buffered? */
+	if (fi->frag != frag || fi->last_readdir == NULL) {
+		struct ceph_mds_request *req;
+		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
+			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
+
+		/* discard old result, if any */
+		if (fi->last_readdir)
+			ceph_mdsc_put_request(fi->last_readdir);
+
+		/* requery frag tree, as the frag topology may have changed */
+		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
+
+		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
+		     ceph_vinop(inode), frag, fi->last_name);
+		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+		req->r_inode = igrab(inode);
+		req->r_dentry = dget(filp->f_dentry);
+		/* hints to request -> mds selection code */
+		req->r_direct_mode = USE_AUTH_MDS;
+		req->r_direct_hash = ceph_frag_value(frag);
+		req->r_direct_is_hash = true;
+		req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+		req->r_readdir_offset = fi->next_offset;
+		req->r_args.readdir.frag = cpu_to_le32(frag);
+		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+		req->r_num_caps = max_entries;
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		if (err < 0) {
+			ceph_mdsc_put_request(req);
+			return err;
+		}
+		dout("readdir got and parsed readdir result=%d"
+		     " on frag %x, end=%d, complete=%d\n", err, frag,
+		     (int)req->r_reply_info.dir_end,
+		     (int)req->r_reply_info.dir_complete);
+
+		if (!req->r_did_prepopulate) {
+			dout("readdir !did_prepopulate");
+			fi->dir_release_count--;    /* preclude I_COMPLETE */
+		}
+
+		/* note next offset and last dentry name */
+		fi->offset = fi->next_offset;
+		fi->last_readdir = req;
+
+		if (req->r_reply_info.dir_end) {
+			kfree(fi->last_name);
+			fi->last_name = NULL;
+			fi->next_offset = 0;
+		} else {
+			rinfo = &req->r_reply_info;
+			err = note_last_dentry(fi,
+				       rinfo->dir_dname[rinfo->dir_nr-1],
+				       rinfo->dir_dname_len[rinfo->dir_nr-1]);
+			if (err)
+				return err;
+			fi->next_offset += rinfo->dir_nr;
+		}
+	}
+
+	rinfo = &fi->last_readdir->r_reply_info;
+	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
+	     rinfo->dir_nr, off, fi->offset);
+	while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+		u64 pos = ceph_make_fpos(frag, off);
+		struct ceph_mds_reply_inode *in =
+			rinfo->dir_in[off - fi->offset].in;
+		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
+		     off, off - fi->offset, rinfo->dir_nr, pos,
+		     rinfo->dir_dname_len[off - fi->offset],
+		     rinfo->dir_dname[off - fi->offset], in);
+		BUG_ON(!in);
+		ftype = le32_to_cpu(in->mode) >> 12;
+		if (filldir(dirent,
+			    rinfo->dir_dname[off - fi->offset],
+			    rinfo->dir_dname_len[off - fi->offset],
+			    pos,
+			    le64_to_cpu(in->ino),
+			    ftype) < 0) {
+			dout("filldir stopping us...\n");
+			return 0;
+		}
+		off++;
+		filp->f_pos = pos + 1;
+	}
+
+	if (fi->last_name) {
+		ceph_mdsc_put_request(fi->last_readdir);
+		fi->last_readdir = NULL;
+		goto more;
+	}
+
+	/* more frags? */
+	if (!ceph_frag_is_rightmost(frag)) {
+		frag = ceph_frag_next(frag);
+		off = 0;
+		filp->f_pos = ceph_make_fpos(frag, off);
+		dout("readdir next frag is %x\n", frag);
+		goto more;
+	}
+	fi->at_end = 1;
+
+	/*
+	 * if dir_release_count still matches the dir, no dentries
+	 * were released during the whole readdir, and we should have
+	 * the complete dir contents in our cache.
+	 */
+	spin_lock(&inode->i_lock);
+	if (ci->i_release_count == fi->dir_release_count) {
+		dout(" marking %p complete\n", inode);
+		ci->i_ceph_flags |= CEPH_I_COMPLETE;
+		ci->i_max_offset = filp->f_pos;
+	}
+	spin_unlock(&inode->i_lock);
+
+	dout("readdir %p filp %p done.\n", inode, filp);
+	return 0;
+}
+
+static void reset_readdir(struct ceph_file_info *fi)
+{
+	if (fi->last_readdir) {
+		ceph_mdsc_put_request(fi->last_readdir);
+		fi->last_readdir = NULL;
+	}
+	kfree(fi->last_name);
+	fi->next_offset = 2;  /* compensate for . and .. */
+	if (fi->dentry) {
+		dput(fi->dentry);
+		fi->dentry = NULL;
+	}
+	fi->at_end = 0;
+}
+
+static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct ceph_file_info *fi = file->private_data;
+	struct inode *inode = file->f_mapping->host;
+	loff_t old_offset = offset;
+	loff_t retval;
+
+	mutex_lock(&inode->i_mutex);
+	switch (origin) {
+	case SEEK_END:
+		offset += inode->i_size + 2;   /* FIXME */
+		break;
+	case SEEK_CUR:
+		offset += file->f_pos;
+	}
+	retval = -EINVAL;
+	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+		if (offset != file->f_pos) {
+			file->f_pos = offset;
+			file->f_version = 0;
+			fi->at_end = 0;
+		}
+		retval = offset;
+
+		/*
+		 * discard buffered readdir content on seekdir(0), or
+		 * seek to new frag, or seek prior to current chunk.
+		 */
+		if (offset == 0 ||
+		    fpos_frag(offset) != fpos_frag(old_offset) ||
+		    fpos_off(offset) < fi->offset) {
+			dout("dir_llseek dropping %p content\n", file);
+			reset_readdir(fi);
+		}
+
+		/* bump dir_release_count if we did a forward seek */
+		if (offset > old_offset)
+			fi->dir_release_count--;
+	}
+	mutex_unlock(&inode->i_mutex);
+	return retval;
+}
+
+/*
+ * Process result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+				  struct dentry *dentry, int err)
+{
+	struct ceph_client *client = ceph_client(dentry->d_sb);
+	struct inode *parent = dentry->d_parent->d_inode;
+
+	/* .snap dir? */
+	if (err == -ENOENT &&
+	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
+	    strcmp(dentry->d_name.name, client->mount_args.snapdir_name) == 0) {
+		struct inode *inode = ceph_get_snapdir(parent);
+		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
+		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
+		d_add(dentry, inode);
+		err = 0;
+	}
+
+	if (err == -ENOENT) {
+		/* no trace? */
+		err = 0;
+		if (!req->r_reply_info.head->is_dentry) {
+			dout("ENOENT and no trace, dentry %p inode %p\n",
+			     dentry, dentry->d_inode);
+			if (dentry->d_inode) {
+				d_drop(dentry);
+				err = -ENOENT;
+			} else {
+				d_add(dentry, NULL);
+			}
+		}
+	}
+	if (err)
+		dentry = ERR_PTR(err);
+	else if (dentry != req->r_dentry)
+		dentry = dget(req->r_dentry);   /* we got spliced */
+	else
+		dentry = NULL;
+	return dentry;
+}
+
+/*
+ * Look up a single dir entry.  If there is a lookup intent, inform
+ * the MDS so that it gets our 'caps wanted' value in a single op.
+ */
+static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
+				  struct nameidata *nd)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int op;
+	int err;
+
+	dout("lookup %p dentry %p '%.*s'\n",
+	     dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	err = ceph_init_dentry(dentry);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	/* open (but not create!) intent? */
+	if (nd &&
+	    (nd->flags & LOOKUP_OPEN) &&
+	    (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
+	    !(nd->intent.open.flags & O_CREAT)) {
+		int mode = nd->intent.open.create_mode & ~current->fs->umask;
+		return ceph_lookup_open(dir, dentry, nd, mode, 1);
+	}
+
+	/* can we conclude ENOENT locally? */
+	if (dentry->d_inode == NULL) {
+		struct ceph_inode_info *ci = ceph_inode(dir);
+		struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+		spin_lock(&dir->i_lock);
+		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+		if (strncmp(dentry->d_name.name,
+			    client->mount_args.snapdir_name,
+			    dentry->d_name.len) &&
+		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+			di->offset = ci->i_max_offset++;
+			spin_unlock(&dir->i_lock);
+			dout(" dir %p complete, -ENOENT\n", dir);
+			d_add(dentry, NULL);
+			di->lease_shared_gen = ci->i_shared_gen;
+			return NULL;
+		}
+		spin_unlock(&dir->i_lock);
+	}
+
+	op = ceph_snap(dir) == CEPH_SNAPDIR ?
+		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+	if (IS_ERR(req))
+		return ERR_PTR(PTR_ERR(req));
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	/* we only need inode linkage */
+	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+	req->r_locked_dir = dir;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	dentry = ceph_finish_lookup(req, dentry, err);
+	ceph_mdsc_put_request(req);  /* will dput(dentry) */
+	dout("lookup result=%p\n", dentry);
+	return dentry;
+}
+
+/*
+ * If we do a create but get no trace back from the MDS, follow up with
+ * a lookup (the VFS expects us to link up the provided dentry).
+ */
+int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
+{
+	struct dentry *result = ceph_lookup(dir, dentry, NULL);
+
+	if (result && !IS_ERR(result)) {
+		/*
+		 * We created the item, then did a lookup, and found
+		 * it was already linked to another inode we already
+		 * had in our cache (and thus got spliced).  Link our
+		 * dentry to that inode, but don't hash it, just in
+		 * case the VFS wants to dereference it.
+		 */
+		BUG_ON(!result->d_inode);
+		d_instantiate(dentry, result->d_inode);
+		return 0;
+	}
+	return PTR_ERR(result);
+}
+
+static int ceph_mknod(struct inode *dir, struct dentry *dentry,
+		      int mode, dev_t rdev)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
+	     dir, dentry, mode, rdev);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		d_drop(dentry);
+		return PTR_ERR(req);
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_args.mknod.mode = cpu_to_le32(mode);
+	req->r_args.mknod.rdev = cpu_to_le32(rdev);
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+	if (err)
+		d_drop(dentry);
+	return err;
+}
+
+static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
+		       struct nameidata *nd)
+{
+	dout("create in dir %p dentry %p name '%.*s'\n",
+	     dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	if (nd) {
+		BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
+		dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
+		/* hrm, what should i do here if we get aliased? */
+		if (IS_ERR(dentry))
+			return PTR_ERR(dentry);
+		return 0;
+	}
+
+	/* fall back to mknod */
+	return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
+}
+
+static int ceph_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *dest)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		d_drop(dentry);
+		return PTR_ERR(req);
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_path2 = kstrdup(dest, GFP_NOFS);
+	req->r_locked_dir = dir;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+	if (err)
+		d_drop(dentry);
+	return err;
+}
+
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err = -EROFS;
+	int op;
+
+	if (ceph_snap(dir) == CEPH_SNAPDIR) {
+		/* mkdir .snap/foo is a MKSNAP */
+		op = CEPH_MDS_OP_MKSNAP;
+		dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
+		     dentry->d_name.len, dentry->d_name.name, dentry);
+	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
+		dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
+		op = CEPH_MDS_OP_MKDIR;
+	} else {
+		goto out;
+	}
+	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_args.mkdir.mode = cpu_to_le32(mode);
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	ceph_mdsc_put_request(req);
+out:
+	if (err < 0)
+		d_drop(dentry);
+	return err;
+}
+
+static int ceph_link(struct dentry *old_dentry, struct inode *dir,
+		     struct dentry *dentry)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(dir) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("link in dir %p old_dentry %p dentry %p\n", dir,
+	     old_dentry, dentry);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		d_drop(dentry);
+		return PTR_ERR(req);
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+	req->r_locked_dir = dir;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (err)
+		d_drop(dentry);
+	else if (!req->r_reply_info.head->is_dentry)
+		d_instantiate(dentry, igrab(old_dentry->d_inode));
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps.  If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+static int drop_caps_for_unlink(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+	spin_lock(&inode->i_lock);
+	if (inode->i_nlink == 1) {
+		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+		ci->i_ceph_flags |= CEPH_I_NODELAY;
+	}
+	spin_unlock(&inode->i_lock);
+	return drop;
+}
+
+/*
+ * rmdir and unlink are differ only by the metadata op code
+ */
+static int ceph_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct inode *inode = dentry->d_inode;
+	struct ceph_mds_request *req;
+	int err = -EROFS;
+	int op;
+
+	if (ceph_snap(dir) == CEPH_SNAPDIR) {
+		/* rmdir .snap/foo is RMSNAP */
+		dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
+		     dentry->d_name.name, dentry);
+		op = CEPH_MDS_OP_RMSNAP;
+	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
+		dout("unlink/rmdir dir %p dn %p inode %p\n",
+		     dir, dentry, inode);
+		op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+			CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
+	} else
+		goto out;
+	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	req->r_locked_dir = dir;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	req->r_inode_drop = drop_caps_for_unlink(inode);
+	err = ceph_mdsc_do_request(mdsc, dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry)
+		d_delete(dentry);
+	ceph_mdsc_put_request(req);
+out:
+	return err;
+}
+
+static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
+		       struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int err;
+
+	if (ceph_snap(old_dir) != ceph_snap(new_dir))
+		return -EXDEV;
+	if (ceph_snap(old_dir) != CEPH_NOSNAP ||
+	    ceph_snap(new_dir) != CEPH_NOSNAP)
+		return -EROFS;
+	dout("rename dir %p dentry %p to dir %p dentry %p\n",
+	     old_dir, old_dentry, new_dir, new_dentry);
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_dentry = dget(new_dentry);
+	req->r_num_caps = 2;
+	req->r_old_dentry = dget(old_dentry);
+	req->r_locked_dir = new_dir;
+	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
+	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	/* release LINK_RDCACHE on source inode (mds will lock it) */
+	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+	if (new_dentry->d_inode)
+		req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
+	err = ceph_mdsc_do_request(mdsc, old_dir, req);
+	if (!err && !req->r_reply_info.head->is_dentry) {
+		/*
+		 * Normally d_move() is done by fill_trace (called by
+		 * do_request, above).  If there is no trace, we need
+		 * to do it here.
+		 */
+		d_move(old_dentry, new_dentry);
+	}
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+
+/*
+ * Check if dentry lease is valid.  If not, delete the lease.  Try to
+ * renew if the least is more than half up.
+ */
+static int dentry_lease_is_valid(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di;
+	struct ceph_mds_session *s;
+	int valid = 0;
+	u32 gen;
+	unsigned long ttl;
+	struct ceph_mds_session *session = NULL;
+	struct inode *dir = NULL;
+	u32 seq = 0;
+
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	if (di && di->lease_session) {
+		s = di->lease_session;
+		spin_lock(&s->s_cap_lock);
+		gen = s->s_cap_gen;
+		ttl = s->s_cap_ttl;
+		spin_unlock(&s->s_cap_lock);
+
+		if (di->lease_gen == gen &&
+		    time_before(jiffies, dentry->d_time) &&
+		    time_before(jiffies, ttl)) {
+			valid = 1;
+			if (di->lease_renew_after &&
+			    time_after(jiffies, di->lease_renew_after)) {
+				/* we should renew */
+				dir = dentry->d_parent->d_inode;
+				session = ceph_get_mds_session(s);
+				seq = di->lease_seq;
+				di->lease_renew_after = 0;
+				di->lease_renew_from = jiffies;
+			}
+		} else {
+			__ceph_mdsc_drop_dentry_lease(dentry);
+		}
+	}
+	spin_unlock(&dentry->d_lock);
+
+	if (session) {
+		ceph_mdsc_lease_send_msg(session, dir, dentry,
+					 CEPH_MDS_LEASE_RENEW, seq);
+		ceph_put_mds_session(session);
+	}
+	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+	return valid;
+}
+
+/*
+ * Check if directory-wide content lease/cap is valid.
+ */
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+{
+	struct ceph_inode_info *ci = ceph_inode(dir);
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	int valid = 0;
+
+	spin_lock(&dir->i_lock);
+	if (ci->i_shared_gen == di->lease_shared_gen)
+		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+	spin_unlock(&dir->i_lock);
+	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
+	     dir, (unsigned)ci->i_shared_gen, dentry,
+	     (unsigned)di->lease_shared_gen, valid);
+	return valid;
+}
+
+/*
+ * Check if cached dentry can be trusted.
+ */
+static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+
+	dout("d_revalidate %p '%.*s' inode %p\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+
+	/* always trust cached snapped dentries, snapdir dentry */
+	if (ceph_snap(dir) != CEPH_NOSNAP) {
+		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
+		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+		goto out_touch;
+	}
+	if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
+		goto out_touch;
+
+	if (dentry_lease_is_valid(dentry) ||
+	    dir_lease_is_valid(dir, dentry))
+		goto out_touch;
+
+	dout("d_revalidate %p invalid\n", dentry);
+	d_drop(dentry);
+	return 0;
+out_touch:
+	ceph_dentry_lru_touch(dentry);
+	return 1;
+}
+
+/*
+ * When a dentry is released, clear the dir I_COMPLETE if it was part
+ * of the current dir gen.
+ */
+static void ceph_dentry_release(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	struct inode *parent_inode = dentry->d_parent->d_inode;
+
+	if (parent_inode) {
+		struct ceph_inode_info *ci = ceph_inode(parent_inode);
+
+		spin_lock(&parent_inode->i_lock);
+		if (ci->i_shared_gen == di->lease_shared_gen) {
+			dout(" clearing %p complete (d_release)\n",
+			     parent_inode);
+			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+			ci->i_release_count++;
+		}
+		spin_unlock(&parent_inode->i_lock);
+	}
+	if (di) {
+		ceph_dentry_lru_del(dentry);
+		if (di->lease_session)
+			ceph_put_mds_session(di->lease_session);
+		kmem_cache_free(ceph_dentry_cachep, di);
+		dentry->d_fsdata = NULL;
+	}
+}
+
+static int ceph_snapdir_d_revalidate(struct dentry *dentry,
+					  struct nameidata *nd)
+{
+	/*
+	 * Eventually, we'll want to revalidate snapped metadata
+	 * too... probably...
+	 */
+	return 1;
+}
+
+
+
+/*
+ * read() on a dir.  This weird interface hack only works if mounted
+ * with '-o dirstat'.
+ */
+static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
+			     loff_t *ppos)
+{
+	struct ceph_file_info *cf = file->private_data;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int left;
+
+	if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+		return -EISDIR;
+
+	if (!cf->dir_info) {
+		cf->dir_info = kmalloc(1024, GFP_NOFS);
+		if (!cf->dir_info)
+			return -ENOMEM;
+		cf->dir_info_len =
+			sprintf(cf->dir_info,
+				"entries:   %20lld\n"
+				" files:    %20lld\n"
+				" subdirs:  %20lld\n"
+				"rentries:  %20lld\n"
+				" rfiles:   %20lld\n"
+				" rsubdirs: %20lld\n"
+				"rbytes:    %20lld\n"
+				"rctime:    %10ld.%09ld\n",
+				ci->i_files + ci->i_subdirs,
+				ci->i_files,
+				ci->i_subdirs,
+				ci->i_rfiles + ci->i_rsubdirs,
+				ci->i_rfiles,
+				ci->i_rsubdirs,
+				ci->i_rbytes,
+				(long)ci->i_rctime.tv_sec,
+				(long)ci->i_rctime.tv_nsec);
+	}
+
+	if (*ppos >= cf->dir_info_len)
+		return 0;
+	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
+	left = copy_to_user(buf, cf->dir_info + *ppos, size);
+	if (left == size)
+		return -EFAULT;
+	*ppos += (size - left);
+	return size - left;
+}
+
+/*
+ * an fsync() on a dir will wait for any uncommitted directory
+ * operations to commit.
+ */
+static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
+			  int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct list_head *head = &ci->i_unsafe_dirops;
+	struct ceph_mds_request *req;
+	u64 last_tid;
+	int ret = 0;
+
+	dout("dir_fsync %p\n", inode);
+	spin_lock(&ci->i_unsafe_lock);
+	if (list_empty(head))
+		goto out;
+
+	req = list_entry(head->prev,
+			 struct ceph_mds_request, r_unsafe_dir_item);
+	last_tid = req->r_tid;
+
+	do {
+		ceph_mdsc_get_request(req);
+		spin_unlock(&ci->i_unsafe_lock);
+		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
+		     inode, req->r_tid, last_tid);
+		if (req->r_timeout) {
+			ret = wait_for_completion_timeout(
+				&req->r_safe_completion, req->r_timeout);
+			if (ret > 0)
+				ret = 0;
+			else if (ret == 0)
+				ret = -EIO;  /* timed out */
+		} else {
+			wait_for_completion(&req->r_safe_completion);
+		}
+		spin_lock(&ci->i_unsafe_lock);
+		ceph_mdsc_put_request(req);
+
+		if (ret || list_empty(head))
+			break;
+		req = list_entry(head->next,
+				 struct ceph_mds_request, r_unsafe_dir_item);
+	} while (req->r_tid < last_tid);
+out:
+	spin_unlock(&ci->i_unsafe_lock);
+	return ret;
+}
+
+/*
+ * We maintain a private dentry LRU.
+ *
+ * FIXME: this needs to be changed to a per-mds lru to be useful.
+ */
+void ceph_dentry_lru_add(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+	dout("dentry_lru_add %p %p\t%.*s\n",
+			di, dn, dn->d_name.len, dn->d_name.name);
+
+	if (di) {
+		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		spin_lock(&mdsc->dentry_lru_lock);
+		list_add_tail(&di->lru, &mdsc->dentry_lru);
+		mdsc->num_dentry++;
+		spin_unlock(&mdsc->dentry_lru_lock);
+	}
+}
+
+void ceph_dentry_lru_touch(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+	dout("dentry_lru_touch %p %p\t%.*s\n",
+			di, dn, dn->d_name.len, dn->d_name.name);
+
+	if (di) {
+		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		spin_lock(&mdsc->dentry_lru_lock);
+		list_move_tail(&di->lru, &mdsc->dentry_lru);
+		spin_unlock(&mdsc->dentry_lru_lock);
+	}
+}
+
+void ceph_dentry_lru_del(struct dentry *dn)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dn);
+	struct ceph_mds_client *mdsc;
+
+	dout("dentry_lru_del %p %p\t%.*s\n",
+			di, dn, dn->d_name.len, dn->d_name.name);
+	if (di) {
+		mdsc = &ceph_client(dn->d_sb)->mdsc;
+		spin_lock(&mdsc->dentry_lru_lock);
+		list_del_init(&di->lru);
+		mdsc->num_dentry--;
+		spin_unlock(&mdsc->dentry_lru_lock);
+	}
+}
+
+const struct file_operations ceph_dir_fops = {
+	.read = ceph_read_dir,
+	.readdir = ceph_readdir,
+	.llseek = ceph_dir_llseek,
+	.open = ceph_open,
+	.release = ceph_release,
+	.unlocked_ioctl = ceph_ioctl,
+	.fsync = ceph_dir_fsync,
+};
+
+const struct inode_operations ceph_dir_iops = {
+	.lookup = ceph_lookup,
+	.permission = ceph_permission,
+	.getattr = ceph_getattr,
+	.setattr = ceph_setattr,
+	.setxattr = ceph_setxattr,
+	.getxattr = ceph_getxattr,
+	.listxattr = ceph_listxattr,
+	.removexattr = ceph_removexattr,
+	.mknod = ceph_mknod,
+	.symlink = ceph_symlink,
+	.mkdir = ceph_mkdir,
+	.link = ceph_link,
+	.unlink = ceph_unlink,
+	.rmdir = ceph_unlink,
+	.rename = ceph_rename,
+	.create = ceph_create,
+};
+
+struct dentry_operations ceph_dentry_ops = {
+	.d_revalidate = ceph_d_revalidate,
+	.d_release = ceph_dentry_release,
+};
+
+struct dentry_operations ceph_snapdir_dentry_ops = {
+	.d_revalidate = ceph_snapdir_d_revalidate,
+};
+
+struct dentry_operations ceph_snap_dentry_ops = {
+};
-- 
cgit v1.2.2


From 124e68e74099090e28da5518f73fda878e7e8232 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:08 -0700
Subject: ceph: file operations

File open and close operations, and read and write methods that ensure
we have obtained the proper capabilities from the MDS cluster before
performing IO on a file.  We take references on held capabilities for
the duration of the read/write to avoid prematurely releasing them
back to the MDS.

We implement two main paths for read and write: one that is buffered
(and uses generic_aio_{read,write}), and one that is fully synchronous
and blocking (operating either on a __user pointer or, if O_DIRECT,
directly on user pages).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 904 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 904 insertions(+)
 create mode 100644 fs/ceph/file.c

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..1bd57c8953bf
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,904 @@
+#include "ceph_debug.h"
+
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Ceph file operations
+ *
+ * Implement basic open/close functionality, and implement
+ * read/write.
+ *
+ * We implement three modes of file I/O:
+ *  - buffered uses the generic_file_aio_{read,write} helpers
+ *
+ *  - synchronous is used when there is multi-client read/write
+ *    sharing, avoids the page cache, and synchronously waits for an
+ *    ack from the OSD.
+ *
+ *  - direct io takes the variant of the sync path that references
+ *    user pages directly.
+ *
+ * fsync() flushes and waits on dirty pages, but just queues metadata
+ * for writeback: since the MDS can recover size and mtime there is no
+ * need to wait for MDS acknowledgement.
+ */
+
+
+/*
+ * Prepare an open request.  Preallocate ceph_cap to avoid an
+ * inopportune ENOMEM later.
+ */
+static struct ceph_mds_request *
+prepare_open_request(struct super_block *sb, int flags, int create_mode)
+{
+	struct ceph_client *client = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	int want_auth = USE_ANY_MDS;
+	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
+
+	if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
+		want_auth = USE_AUTH_MDS;
+
+	req = ceph_mdsc_create_request(mdsc, op, want_auth);
+	if (IS_ERR(req))
+		goto out;
+	req->r_fmode = ceph_flags_to_mode(flags);
+	req->r_args.open.flags = cpu_to_le32(flags);
+	req->r_args.open.mode = cpu_to_le32(create_mode);
+	req->r_args.open.preferred = -1;
+out:
+	return req;
+}
+
+/*
+ * initialize private struct file data.
+ * if we fail, clean up by dropping fmode reference on the ceph_inode
+ */
+static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
+{
+	struct ceph_file_info *cf;
+	int ret = 0;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+		dout("init_file %p %p 0%o (regular)\n", inode, file,
+		     inode->i_mode);
+		cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+		if (cf == NULL) {
+			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+			return -ENOMEM;
+		}
+		cf->fmode = fmode;
+		cf->next_offset = 2;
+		file->private_data = cf;
+		BUG_ON(inode->i_fop->release != ceph_release);
+		break;
+
+	case S_IFLNK:
+		dout("init_file %p %p 0%o (symlink)\n", inode, file,
+		     inode->i_mode);
+		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		break;
+
+	default:
+		dout("init_file %p %p 0%o (special)\n", inode, file,
+		     inode->i_mode);
+		/*
+		 * we need to drop the open ref now, since we don't
+		 * have .release set to ceph_release.
+		 */
+		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+		BUG_ON(inode->i_fop->release == ceph_release);
+
+		/* call the proper open fop */
+		ret = inode->i_fop->open(inode, file);
+	}
+	return ret;
+}
+
+/*
+ * If the filp already has private_data, that means the file was
+ * already opened by intent during lookup, and we do nothing.
+ *
+ * If we already have the requisite capabilities, we can satisfy
+ * the open request locally (no need to request new caps from the
+ * MDS).  We do, however, need to inform the MDS (asynchronously)
+ * if our wanted caps set expands.
+ */
+int ceph_open(struct inode *inode, struct file *file)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_file_info *cf = file->private_data;
+	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	int err;
+	int flags, fmode, wanted;
+
+	if (cf) {
+		dout("open file %p is already opened\n", file);
+		return 0;
+	}
+
+	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
+	flags = file->f_flags & ~(O_CREAT|O_EXCL);
+	if (S_ISDIR(inode->i_mode))
+		flags = O_DIRECTORY;  /* mds likes to know */
+
+	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
+	     ceph_vinop(inode), file, flags, file->f_flags);
+	fmode = ceph_flags_to_mode(flags);
+	wanted = ceph_caps_for_mode(fmode);
+
+	/* snapped files are read-only */
+	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
+		return -EROFS;
+
+	/* trivially open snapdir */
+	if (ceph_snap(inode) == CEPH_SNAPDIR) {
+		spin_lock(&inode->i_lock);
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&inode->i_lock);
+		return ceph_init_file(inode, file, fmode);
+	}
+
+	/*
+	 * No need to block if we have any caps.  Update wanted set
+	 * asynchronously.
+	 */
+	spin_lock(&inode->i_lock);
+	if (__ceph_is_any_real_caps(ci)) {
+		int mds_wanted = __ceph_caps_mds_wanted(ci);
+		int issued = __ceph_caps_issued(ci, NULL);
+
+		dout("open %p fmode %d want %s issued %s using existing\n",
+		     inode, fmode, ceph_cap_string(wanted),
+		     ceph_cap_string(issued));
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&inode->i_lock);
+
+		/* adjust wanted? */
+		if ((issued & wanted) != wanted &&
+		    (mds_wanted & wanted) != wanted &&
+		    ceph_snap(inode) != CEPH_SNAPDIR)
+			ceph_check_caps(ci, 0, NULL);
+
+		return ceph_init_file(inode, file, fmode);
+	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
+		   (ci->i_snap_caps & wanted) == wanted) {
+		__ceph_get_fmode(ci, fmode);
+		spin_unlock(&inode->i_lock);
+		return ceph_init_file(inode, file, fmode);
+	}
+	spin_unlock(&inode->i_lock);
+
+	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+	req = prepare_open_request(inode->i_sb, flags, 0);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+	req->r_inode = igrab(inode);
+	req->r_num_caps = 1;
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	if (!err)
+		err = ceph_init_file(inode, file, req->r_fmode);
+	ceph_mdsc_put_request(req);
+	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+out:
+	return err;
+}
+
+
+/*
+ * Do a lookup + open with a single request.
+ *
+ * If this succeeds, but some subsequent check in the vfs
+ * may_open() fails, the struct *file gets cleaned up (i.e.
+ * ceph_release gets called).  So fear not!
+ */
+/*
+ * flags
+ *  path_lookup_open   -> LOOKUP_OPEN
+ *  path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
+ */
+struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+				struct nameidata *nd, int mode,
+				int locked_dir)
+{
+	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct file *file = nd->intent.open.file;
+	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
+	struct ceph_mds_request *req;
+	int err;
+	int flags = nd->intent.open.flags - 1;  /* silly vfs! */
+
+	dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
+	     dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
+
+	/* do the open */
+	req = prepare_open_request(dir->i_sb, flags, mode);
+	if (IS_ERR(req))
+		return ERR_PTR(PTR_ERR(req));
+	req->r_dentry = dget(dentry);
+	req->r_num_caps = 2;
+	if (flags & O_CREAT) {
+		req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+	}
+	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	dentry = ceph_finish_lookup(req, dentry, err);
+	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+		err = ceph_handle_notrace_create(dir, dentry);
+	if (!err)
+		err = ceph_init_file(req->r_dentry->d_inode, file,
+				     req->r_fmode);
+	ceph_mdsc_put_request(req);
+	dout("ceph_lookup_open result=%p\n", dentry);
+	return dentry;
+}
+
+int ceph_release(struct inode *inode, struct file *file)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_file_info *cf = file->private_data;
+
+	dout("release inode %p file %p\n", inode, file);
+	ceph_put_fmode(ci, cf->fmode);
+	if (cf->last_readdir)
+		ceph_mdsc_put_request(cf->last_readdir);
+	kfree(cf->last_name);
+	kfree(cf->dir_info);
+	dput(cf->dentry);
+	kmem_cache_free(ceph_file_cachep, cf);
+	return 0;
+}
+
+/*
+ * build a vector of user pages
+ */
+static struct page **get_direct_page_vector(const char __user *data,
+					    int num_pages,
+					    loff_t off, size_t len)
+{
+	struct page **pages;
+	int rc;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	down_read(&current->mm->mmap_sem);
+	rc = get_user_pages(current, current->mm, (unsigned long)data,
+			    num_pages, 0, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+	if (rc < 0)
+		goto fail;
+	return pages;
+
+fail:
+	kfree(pages);
+	return ERR_PTR(rc);
+}
+
+static void put_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(pages);
+}
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+	int i;
+
+	for (i = 0; i < num_pages; i++)
+		__free_pages(pages[i], 0);
+	kfree(pages);
+}
+
+/*
+ * allocate a vector new pages
+ */
+static struct page **alloc_page_vector(int num_pages)
+{
+	struct page **pages;
+	int i;
+
+	pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = alloc_page(GFP_NOFS);
+		if (pages[i] == NULL) {
+			ceph_release_page_vector(pages, i);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	return pages;
+}
+
+/*
+ * copy user data into a page vector
+ */
+static int copy_user_to_page_vector(struct page **pages,
+				    const char __user *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, PAGE_CACHE_SIZE-po, left);
+		bad = copy_from_user(page_address(pages[i]) + po, data, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		if (po) {
+			po += l - bad;
+			if (po == PAGE_CACHE_SIZE)
+				po = 0;
+		}
+	}
+	return len;
+}
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+static int copy_page_vector_to_user(struct page **pages, char __user *data,
+				    loff_t off, size_t len)
+{
+	int i = 0;
+	int po = off & ~PAGE_CACHE_MASK;
+	int left = len;
+	int l, bad;
+
+	while (left > 0) {
+		l = min_t(int, left, PAGE_CACHE_SIZE-po);
+		bad = copy_to_user(data, page_address(pages[i]) + po, l);
+		if (bad == l)
+			return -EFAULT;
+		data += l - bad;
+		left -= l - bad;
+		if (po) {
+			po += l - bad;
+			if (po == PAGE_CACHE_SIZE)
+				po = 0;
+		}
+		i++;
+	}
+	return len;
+}
+
+/*
+ * Zero an extent within a page vector.  Offset is relative to the
+ * start of the first page.
+ */
+static void zero_page_vector_range(int off, int len, struct page **pages)
+{
+	int i = off >> PAGE_CACHE_SHIFT;
+
+	dout("zero_page_vector_page %u~%u\n", off, len);
+	BUG_ON(len < PAGE_CACHE_SIZE);
+
+	/* leading partial page? */
+	if (off & ~PAGE_CACHE_MASK) {
+		dout("zeroing %d %p head from %d\n", i, pages[i],
+		     (int)(off & ~PAGE_CACHE_MASK));
+		zero_user_segment(pages[i], off & ~PAGE_CACHE_MASK,
+				  PAGE_CACHE_SIZE);
+		off += PAGE_CACHE_SIZE;
+		off &= PAGE_CACHE_MASK;
+		i++;
+	}
+	while (len >= PAGE_CACHE_SIZE) {
+		dout("zeroing %d %p\n", i, pages[i]);
+		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+		off += PAGE_CACHE_SIZE;
+		len -= PAGE_CACHE_SIZE;
+		i++;
+	}
+	/* trailing partial page? */
+	if (len) {
+		dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+		zero_user_segment(pages[i], 0, len);
+	}
+}
+
+
+/*
+ * Read a range of bytes striped over one or more objects.  Iterate over
+ * objects we stripe over.  (That's not atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
+ */
+static int striped_read(struct inode *inode,
+			u64 off, u64 len,
+			struct page **pages, int num_pages)
+{
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 pos, this_len;
+	int page_off = off & ~PAGE_CACHE_SIZE; /* first byte's offset in page */
+	int left, pages_left;
+	int read;
+	struct page **page_pos;
+	int ret;
+	bool hit_stripe, was_short;
+
+	/*
+	 * we may need to do multiple reads.  not atomic, unfortunately.
+	 */
+	pos = off;
+	left = len;
+	page_pos = pages;
+	pages_left = num_pages;
+	read = 0;
+
+more:
+	this_len = left;
+	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+				  &ci->i_layout, pos, &this_len,
+				  ci->i_truncate_seq,
+				  ci->i_truncate_size,
+				  page_pos, pages_left);
+	hit_stripe = this_len < left;
+	was_short = ret >= 0 && ret < this_len;
+	if (ret == -ENOENT)
+		ret = 0;
+	dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
+	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
+
+	if (ret > 0) {
+		int didpages =
+			((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
+
+		if (read < pos - off) {
+			dout(" zero gap %llu to %llu\n", off + read, pos);
+			zero_page_vector_range(page_off + read,
+					       pos - off - read, pages);
+		}
+		pos += ret;
+		read = pos - off;
+		left -= ret;
+		page_pos += didpages;
+		pages_left -= didpages;
+
+		/* hit stripe? */
+		if (left && hit_stripe)
+			goto more;
+	}
+
+	if (was_short) {
+		/* was original extent fully inside i_size? */
+		if (pos + left <= inode->i_size) {
+			dout("zero tail\n");
+			zero_page_vector_range(page_off + read, len - read,
+					       pages);
+			goto out;
+		}
+
+		/* check i_size */
+		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+		if (ret < 0)
+			goto out;
+
+		/* hit EOF? */
+		if (pos >= inode->i_size)
+			goto out;
+
+		goto more;
+	}
+
+out:
+	if (ret >= 0)
+		ret = read;
+	dout("striped_read returns %d\n", ret);
+	return ret;
+}
+
+/*
+ * Completely synchronous read and write methods.  Direct from __user
+ * buffer to osd, or directly to user pages (if O_DIRECT).
+ *
+ * If the read spans object boundary, just do multiple reads.
+ */
+static ssize_t ceph_sync_read(struct file *file, char __user *data,
+			      unsigned len, loff_t *poff)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct page **pages;
+	u64 off = *poff;
+	int num_pages = calc_pages_for(off, len);
+	int ret;
+
+	dout("sync_read on file %p %llu~%u %s\n", file, off, len,
+	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+	if (file->f_flags & O_DIRECT) {
+		pages = get_direct_page_vector(data, num_pages, off, len);
+
+		/*
+		 * flush any page cache pages in this range.  this
+		 * will make concurrent normal and O_DIRECT io slow,
+		 * but it will at least behave sensibly when they are
+		 * in sequence.
+		 */
+		filemap_write_and_wait(inode->i_mapping);
+	} else {
+		pages = alloc_page_vector(num_pages);
+	}
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	ret = striped_read(inode, off, len, pages, num_pages);
+
+	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
+		ret = copy_page_vector_to_user(pages, data, off, ret);
+	if (ret >= 0)
+		*poff = off + ret;
+
+	if (file->f_flags & O_DIRECT)
+		put_page_vector(pages, num_pages);
+	else
+		ceph_release_page_vector(pages, num_pages);
+	dout("sync_read result %d\n", ret);
+	return ret;
+}
+
+/*
+ * Write commit callback, called if we requested both an ACK and
+ * ONDISK commit reply from the OSD.
+ */
+static void sync_write_commit(struct ceph_osd_request *req,
+			      struct ceph_msg *msg)
+{
+	struct ceph_inode_info *ci = ceph_inode(req->r_inode);
+
+	dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
+	spin_lock(&ci->i_unsafe_lock);
+	list_del_init(&req->r_unsafe_item);
+	spin_unlock(&ci->i_unsafe_lock);
+	ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+}
+
+/*
+ * Synchronous write, straight from __user pointer or user pages (if
+ * O_DIRECT).
+ *
+ * If write spans object boundary, just do multiple writes.  (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t ceph_sync_write(struct file *file, const char __user *data,
+			       size_t left, loff_t *offset)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_osd_request *req;
+	struct page **pages;
+	int num_pages;
+	long long unsigned pos;
+	u64 len;
+	int written = 0;
+	int flags;
+	int do_sync = 0;
+	int check_caps = 0;
+	int ret;
+	struct timespec mtime = CURRENT_TIME;
+
+	if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+	dout("sync_write on file %p %lld~%u %s\n", file, *offset,
+	     (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+	if (file->f_flags & O_APPEND)
+		pos = i_size_read(inode);
+	else
+		pos = *offset;
+
+	flags = CEPH_OSD_FLAG_ORDERSNAP |
+		CEPH_OSD_FLAG_ONDISK |
+		CEPH_OSD_FLAG_WRITE;
+	if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
+		flags |= CEPH_OSD_FLAG_ACK;
+	else
+		do_sync = 1;
+
+	/*
+	 * we may need to do multiple writes here if we span an object
+	 * boundary.  this isn't atomic, unfortunately.  :(
+	 */
+more:
+	len = left;
+	req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+				    ceph_vino(inode), pos, &len,
+				    CEPH_OSD_OP_WRITE, flags,
+				    ci->i_snap_realm->cached_context,
+				    do_sync,
+				    ci->i_truncate_seq, ci->i_truncate_size,
+				    &mtime, false, 2);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	num_pages = calc_pages_for(pos, len);
+
+	if (file->f_flags & O_DIRECT) {
+		pages = get_direct_page_vector(data, num_pages, pos, len);
+		if (IS_ERR(pages)) {
+			ret = PTR_ERR(pages);
+			goto out;
+		}
+
+		/*
+		 * throw out any page cache pages in this range. this
+		 * may block.
+		 */
+		truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
+	} else {
+		pages = alloc_page_vector(num_pages);
+		if (IS_ERR(pages)) {
+			ret = PTR_ERR(pages);
+			goto out;
+		}
+		ret = copy_user_to_page_vector(pages, data, pos, len);
+		if (ret < 0) {
+			ceph_release_page_vector(pages, num_pages);
+			goto out;
+		}
+
+		if ((file->f_flags & O_SYNC) == 0) {
+			/* get a second commit callback */
+			req->r_safe_callback = sync_write_commit;
+			req->r_own_pages = 1;
+		}
+	}
+	req->r_pages = pages;
+	req->r_num_pages = num_pages;
+	req->r_inode = inode;
+
+	ret = ceph_osdc_start_request(&client->osdc, req, false);
+	if (!ret) {
+		if (req->r_safe_callback) {
+			/*
+			 * Add to inode unsafe list only after we
+			 * start_request so that a tid has been assigned.
+			 */
+			spin_lock(&ci->i_unsafe_lock);
+			list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
+			spin_unlock(&ci->i_unsafe_lock);
+			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
+		}
+		ret = ceph_osdc_wait_request(&client->osdc, req);
+	}
+
+	if (file->f_flags & O_DIRECT)
+		put_page_vector(pages, num_pages);
+	else if (file->f_flags & O_SYNC)
+		ceph_release_page_vector(pages, num_pages);
+
+out:
+	ceph_osdc_put_request(req);
+	if (ret == 0) {
+		pos += len;
+		written += len;
+		left -= len;
+		if (left)
+			goto more;
+
+		ret = written;
+		*offset = pos;
+		if (pos > i_size_read(inode))
+			check_caps = ceph_inode_set_size(inode, pos);
+		if (check_caps)
+			ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
+					NULL);
+	}
+	return ret;
+}
+
+/*
+ * Wrap generic_file_aio_read with checks for cap bits on the inode.
+ * Atomically grab references, so that those bits are not released
+ * back to the MDS mid-read.
+ *
+ * Hmm, the sync read case isn't actually async... should it be?
+ */
+static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			     unsigned long nr_segs, loff_t pos)
+{
+	struct file *filp = iocb->ki_filp;
+	loff_t *ppos = &iocb->ki_pos;
+	size_t len = iov->iov_len;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	ssize_t ret;
+	int got = 0;
+
+	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)len, inode);
+	__ceph_do_pending_vmtruncate(inode);
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
+			    &got, -1);
+	if (ret < 0)
+		goto out;
+	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)len,
+	     ceph_cap_string(got));
+
+	if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
+	    (iocb->ki_filp->f_flags & O_DIRECT) ||
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
+		/* hmm, this isn't really async... */
+		ret = ceph_sync_read(filp, iov->iov_base, len, ppos);
+	else
+		ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+out:
+	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
+	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+	ceph_put_cap_refs(ci, got);
+	return ret;
+}
+
+/*
+ * Take cap references to avoid releasing caps to MDS mid-write.
+ *
+ * If we are synchronous, and write with an old snap context, the OSD
+ * may return EOLDSNAPC.  In that case, retry the write.. _after_
+ * dropping our cap refs and allowing the pending snap to logically
+ * complete _before_ this write occurs.
+ *
+ * If we are near ENOSPC, write synchronously.
+ */
+static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
+		       unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+	loff_t endoff = pos + iov->iov_len;
+	int got = 0;
+	int ret;
+
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EROFS;
+
+retry_snap:
+	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
+		return -ENOSPC;
+	__ceph_do_pending_vmtruncate(inode);
+	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+	     inode->i_size);
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+			    &got, endoff);
+	if (ret < 0)
+		goto out;
+
+	dout("aio_write %p %llx.%llx %llu~%u  got cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+	     ceph_cap_string(got));
+
+	if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
+	    (iocb->ki_filp->f_flags & O_DIRECT) ||
+	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
+		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
+			&iocb->ki_pos);
+	} else {
+		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
+		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
+		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)))
+			ret = vfs_fsync_range(file, file->f_path.dentry,
+					      pos, pos + ret - 1, 1);
+	}
+	if (ret >= 0) {
+		spin_lock(&inode->i_lock);
+		__ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		spin_unlock(&inode->i_lock);
+	}
+
+out:
+	dout("aio_write %p %llx.%llx %llu~%u  dropping cap refs on %s\n",
+	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+	     ceph_cap_string(got));
+	ceph_put_cap_refs(ci, got);
+
+	if (ret == -EOLDSNAPC) {
+		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
+		     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
+		goto retry_snap;
+	}
+
+	return ret;
+}
+
+/*
+ * llseek.  be sure to verify file size on SEEK_END.
+ */
+static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct inode *inode = file->f_mapping->host;
+	int ret;
+
+	mutex_lock(&inode->i_mutex);
+	__ceph_do_pending_vmtruncate(inode);
+	switch (origin) {
+	case SEEK_END:
+		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+		if (ret < 0) {
+			offset = ret;
+			goto out;
+		}
+		offset += inode->i_size;
+		break;
+	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation.  Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0) {
+			offset = file->f_pos;
+			goto out;
+		}
+		offset += file->f_pos;
+		break;
+	}
+
+	if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
+		offset = -EINVAL;
+		goto out;
+	}
+
+	/* Special lock needed here? */
+	if (offset != file->f_pos) {
+		file->f_pos = offset;
+		file->f_version = 0;
+	}
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	return offset;
+}
+
+const struct file_operations ceph_file_fops = {
+	.open = ceph_open,
+	.release = ceph_release,
+	.llseek = ceph_llseek,
+	.read = do_sync_read,
+	.write = do_sync_write,
+	.aio_read = ceph_aio_read,
+	.aio_write = ceph_aio_write,
+	.mmap = ceph_mmap,
+	.fsync = ceph_fsync,
+	.splice_read = generic_file_splice_read,
+	.splice_write = generic_file_splice_write,
+	.unlocked_ioctl = ceph_ioctl,
+	.compat_ioctl	= ceph_ioctl,
+};
+
-- 
cgit v1.2.2


From 1d3576fd10f0d7a104204267b81cf84a07028dad Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:09 -0700
Subject: ceph: address space operations

The ceph address space methods are concerned primarily with managing
the dirty page accounting in the inode, which (among other things)
must keep track of which snapshot context each page was dirtied in,
and ensure that dirty data is written out to the OSDs in snapshort
order.

A writepage() on a page that is not currently writeable due to
snapshot writeback ordering constraints is ignored (it was presumably
called from kswapd).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 1115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1115 insertions(+)
 create mode 100644 fs/ceph/addr.c

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..c7d673ffe023
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1115 @@
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>	/* generic_writepages */
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "super.h"
+#include "osd_client.h"
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page.  This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode.  In the absense of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail).  The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress.  In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.)  i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER.  So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_.  Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+
+/*
+ * Dirty a page.  Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate.  If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	int undo = 0;
+	struct ceph_snap_context *snapc;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	if (TestSetPageDirty(page)) {
+		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+		     mapping->host, page, page->index);
+		return 0;
+	}
+
+	inode = mapping->host;
+	ci = ceph_inode(inode);
+
+	/*
+	 * Note that we're grabbing a snapc ref here without holding
+	 * any locks!
+	 */
+	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+
+	/* dirty the head */
+	spin_lock(&inode->i_lock);
+	if (ci->i_wrbuffer_ref_head == 0)
+		ci->i_head_snapc = ceph_get_snap_context(snapc);
+	++ci->i_wrbuffer_ref_head;
+	if (ci->i_wrbuffer_ref == 0)
+		igrab(inode);
+	++ci->i_wrbuffer_ref;
+	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+	     "snapc %p seq %lld (%d snaps)\n",
+	     mapping->host, page, page->index,
+	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+	     snapc, snapc->seq, snapc->num_snaps);
+	spin_unlock(&inode->i_lock);
+
+	/* now adjust page */
+	spin_lock_irq(&mapping->tree_lock);
+	if (page->mapping) {	/* Race with truncate? */
+		WARN_ON_ONCE(!PageUptodate(page));
+
+		if (mapping_cap_account_dirty(mapping)) {
+			__inc_zone_page_state(page, NR_FILE_DIRTY);
+			__inc_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
+			task_io_account_write(PAGE_CACHE_SIZE);
+		}
+		radix_tree_tag_set(&mapping->page_tree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+
+		/*
+		 * Reference snap context in page->private.  Also set
+		 * PagePrivate so that we get invalidatepage callback.
+		 */
+		page->private = (unsigned long)snapc;
+		SetPagePrivate(page);
+	} else {
+		dout("ANON set_page_dirty %p (raced truncate?)\n", page);
+		undo = 1;
+	}
+
+	spin_unlock_irq(&mapping->tree_lock);
+
+	if (undo)
+		/* whoops, we failed to dirty the page */
+		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	BUG_ON(!PageDirty(page));
+	return 1;
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately.  Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct inode *inode = page->mapping->host;
+	struct ceph_inode_info *ci;
+	struct ceph_snap_context *snapc = (void *)page->private;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!page->private);
+	BUG_ON(!PagePrivate(page));
+	BUG_ON(!page->mapping);
+
+	/*
+	 * We can get non-dirty pages here due to races between
+	 * set_page_dirty and truncate_complete_page; just spit out a
+	 * warning, in case we end up with accounting problems later.
+	 */
+	if (!PageDirty(page))
+		pr_err("%p invalidatepage %p page not dirty\n", inode, page);
+
+	if (offset == 0)
+		ClearPageChecked(page);
+
+	ci = ceph_inode(inode);
+	if (offset == 0) {
+		dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
+		     inode, page, page->index, offset);
+		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+		ceph_put_snap_context(snapc);
+		page->private = 0;
+		ClearPagePrivate(page);
+	} else {
+		dout("%p invalidatepage %p idx %lu partial dirty page\n",
+		     inode, page, page->index);
+	}
+}
+
+/* just a sanity check */
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+	struct inode *inode = page->mapping ? page->mapping->host : NULL;
+	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+	WARN_ON(PageDirty(page));
+	WARN_ON(page->private);
+	WARN_ON(PagePrivate(page));
+	return 0;
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int readpage_nounlock(struct file *filp, struct page *page)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	int err = 0;
+	u64 len = PAGE_CACHE_SIZE;
+
+	dout("readpage inode %p file %p page %p index %lu\n",
+	     inode, filp, page, page->index);
+	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+				  page->index << PAGE_CACHE_SHIFT, &len,
+				  ci->i_truncate_seq, ci->i_truncate_size,
+				  &page, 1);
+	if (err == -ENOENT)
+		err = 0;
+	if (err < 0) {
+		SetPageError(page);
+		goto out;
+	} else if (err < PAGE_CACHE_SIZE) {
+		/* zero fill remainder of page */
+		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+	}
+	SetPageUptodate(page);
+
+out:
+	return err < 0 ? err : 0;
+}
+
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+	int r = readpage_nounlock(filp, page);
+	unlock_page(page);
+	return r;
+}
+
+/*
+ * Build a vector of contiguous pages from the provided page list.
+ */
+static struct page **page_vector_from_list(struct list_head *page_list,
+					   unsigned *nr_pages)
+{
+	struct page **pages;
+	struct page *page;
+	int next_index, contig_pages = 0;
+
+	/* build page vector */
+	pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	BUG_ON(list_empty(page_list));
+	next_index = list_entry(page_list->prev, struct page, lru)->index;
+	list_for_each_entry_reverse(page, page_list, lru) {
+		if (page->index == next_index) {
+			dout("readpages page %d %p\n", contig_pages, page);
+			pages[contig_pages] = page;
+			contig_pages++;
+			next_index++;
+		} else {
+			break;
+		}
+	}
+	*nr_pages = contig_pages;
+	return pages;
+}
+
+/*
+ * Read multiple pages.  Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *page_list, unsigned nr_pages)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+	int rc = 0;
+	struct page **pages;
+	struct pagevec pvec;
+	loff_t offset;
+	u64 len;
+
+	dout("readpages %p file %p nr_pages %d\n",
+	     inode, file, nr_pages);
+
+	pages = page_vector_from_list(page_list, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	/* guess read extent */
+	offset = pages[0]->index << PAGE_CACHE_SHIFT;
+	len = nr_pages << PAGE_CACHE_SHIFT;
+	rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+				 offset, &len,
+				 ci->i_truncate_seq, ci->i_truncate_size,
+				 pages, nr_pages);
+	if (rc == -ENOENT)
+		rc = 0;
+	if (rc < 0)
+		goto out;
+
+	/* set uptodate and add to lru in pagevec-sized chunks */
+	pagevec_init(&pvec, 0);
+	for (; !list_empty(page_list) && len > 0;
+	     rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
+		struct page *page =
+			list_entry(page_list->prev, struct page, lru);
+
+		list_del(&page->lru);
+
+		if (rc < (int)PAGE_CACHE_SIZE) {
+			/* zero (remainder of) page */
+			int s = rc < 0 ? 0 : rc;
+			zero_user_segment(page, s, PAGE_CACHE_SIZE);
+		}
+
+		if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+			page_cache_release(page);
+			dout("readpages %p add_to_page_cache failed %p\n",
+			     inode, page);
+			continue;
+		}
+		dout("readpages %p adding %p idx %lu\n", inode, page,
+		     page->index);
+		flush_dcache_page(page);
+		SetPageUptodate(page);
+		unlock_page(page);
+		if (pagevec_add(&pvec, page) == 0)
+			pagevec_lru_add_file(&pvec);   /* add to lru */
+	}
+	pagevec_lru_add_file(&pvec);
+	rc = 0;
+
+out:
+	kfree(pages);
+	return rc;
+}
+
+/*
+ * Get ref for the oldest snapc for an inode with dirty data... that is, the
+ * only snap context we are allowed to write back.
+ *
+ * Caller holds i_lock.
+ */
+static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
+						      u64 *snap_size)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_snap_context *snapc = NULL;
+	struct ceph_cap_snap *capsnap = NULL;
+
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
+		     capsnap->context, capsnap->dirty_pages);
+		if (capsnap->dirty_pages) {
+			snapc = ceph_get_snap_context(capsnap->context);
+			if (snap_size)
+				*snap_size = capsnap->size;
+			break;
+		}
+	}
+	if (!snapc && ci->i_snap_realm) {
+		snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+		dout(" head snapc %p has %d dirty pages\n",
+		     snapc, ci->i_wrbuffer_ref_head);
+	}
+	return snapc;
+}
+
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+						    u64 *snap_size)
+{
+	struct ceph_snap_context *snapc = NULL;
+
+	spin_lock(&inode->i_lock);
+	snapc = __get_oldest_context(inode, snap_size);
+	spin_unlock(&inode->i_lock);
+	return snapc;
+}
+
+/*
+ * Write a single page, but leave the page locked.
+ *
+ * If we get a write error, set the page error bit, but still adjust the
+ * dirty page accounting (i.e., page is no longer dirty).
+ */
+static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_osd_client *osdc;
+	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
+	int len = PAGE_CACHE_SIZE;
+	loff_t i_size;
+	int err = 0;
+	struct ceph_snap_context *snapc;
+	u64 snap_size = 0;
+
+	dout("writepage %p idx %lu\n", page, page->index);
+
+	if (!page->mapping || !page->mapping->host) {
+		dout("writepage %p - no mapping\n", page);
+		return -EFAULT;
+	}
+	inode = page->mapping->host;
+	ci = ceph_inode(inode);
+	osdc = &ceph_inode_to_client(inode)->osdc;
+
+	/* verify this is a writeable snap context */
+	snapc = (void *)page->private;
+	if (snapc == NULL) {
+		dout("writepage %p page %p not dirty?\n", inode, page);
+		goto out;
+	}
+	if (snapc != get_oldest_context(inode, &snap_size)) {
+		dout("writepage %p page %p snapc %p not writeable - noop\n",
+		     inode, page, (void *)page->private);
+		/* we should only noop if called by kswapd */
+		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		goto out;
+	}
+
+	/* is this a partial page at end of file? */
+	if (snap_size)
+		i_size = snap_size;
+	else
+		i_size = i_size_read(inode);
+	if (i_size < page_off + len)
+		len = i_size - page_off;
+
+	dout("writepage %p page %p index %lu on %llu~%u\n",
+	     inode, page, page->index, page_off, len);
+
+	set_page_writeback(page);
+	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
+				   &ci->i_layout, snapc,
+				   page_off, len,
+				   ci->i_truncate_seq, ci->i_truncate_size,
+				   &inode->i_mtime,
+				   &page, 1, 0, 0, true);
+	if (err < 0) {
+		dout("writepage setting page/mapping error %d %p\n", err, page);
+		SetPageError(page);
+		mapping_set_error(&inode->i_data, err);
+		if (wbc)
+			wbc->pages_skipped++;
+	} else {
+		dout("writepage cleaned page %p\n", page);
+		err = 0;  /* vfs expects us to return 0 */
+	}
+	page->private = 0;
+	ClearPagePrivate(page);
+	end_page_writeback(page);
+	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+	ceph_put_snap_context(snapc);
+out:
+	return err;
+}
+
+static int ceph_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int err = writepage_nounlock(page, wbc);
+	unlock_page(page);
+	return err;
+}
+
+
+/*
+ * lame release_pages helper.  release_pages() isn't exported to
+ * modules.
+ */
+static void ceph_release_pages(struct page **pages, int num)
+{
+	struct pagevec pvec;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	for (i = 0; i < num; i++) {
+		if (pagevec_add(&pvec, pages[i]) == 0)
+			pagevec_release(&pvec);
+	}
+	pagevec_release(&pvec);
+}
+
+
+/*
+ * async writeback completion handler.
+ *
+ * If we get an error, set the mapping error bit, but not the individual
+ * page error bits.
+ */
+static void writepages_finish(struct ceph_osd_request *req,
+			      struct ceph_msg *msg)
+{
+	struct inode *inode = req->r_inode;
+	struct ceph_osd_reply_head *replyhead;
+	struct ceph_osd_op *op;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned wrote;
+	loff_t offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
+	struct page *page;
+	int i;
+	struct ceph_snap_context *snapc = req->r_snapc;
+	struct address_space *mapping = inode->i_mapping;
+	struct writeback_control *wbc = req->r_wbc;
+	__s32 rc = -EIO;
+	u64 bytes = 0;
+
+	/* parse reply */
+	replyhead = msg->front.iov_base;
+	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+	op = (void *)(replyhead + 1);
+	rc = le32_to_cpu(replyhead->result);
+	bytes = le64_to_cpu(op->extent.length);
+
+	if (rc >= 0) {
+		wrote = (bytes + (offset & ~PAGE_CACHE_MASK) + ~PAGE_CACHE_MASK)
+			>> PAGE_CACHE_SHIFT;
+		WARN_ON(wrote != req->r_num_pages);
+	} else {
+		wrote = 0;
+		mapping_set_error(mapping, rc);
+	}
+	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
+	     inode, rc, bytes, wrote);
+
+	/* clean all pages */
+	for (i = 0; i < req->r_num_pages; i++) {
+		page = req->r_pages[i];
+		BUG_ON(!page);
+		WARN_ON(!PageUptodate(page));
+
+		if (i >= wrote) {
+			dout("inode %p skipping page %p\n", inode, page);
+			wbc->pages_skipped++;
+		}
+		page->private = 0;
+		ClearPagePrivate(page);
+		ceph_put_snap_context(snapc);
+		dout("unlocking %d %p\n", i, page);
+		end_page_writeback(page);
+		unlock_page(page);
+	}
+	dout("%p wrote+cleaned %d pages\n", inode, wrote);
+	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+
+	ceph_release_pages(req->r_pages, req->r_num_pages);
+	if (req->r_pages_from_pool)
+		mempool_free(req->r_pages,
+			     ceph_client(inode->i_sb)->wb_pagevec_pool);
+	else
+		kfree(req->r_pages);
+	ceph_osdc_put_request(req);
+}
+
+/*
+ * allocate a page vec, either directly, or if necessary, via a the
+ * mempool.  we avoid the mempool if we can because req->r_num_pages
+ * may be less than the maximum write size.
+ */
+static void alloc_page_vec(struct ceph_client *client,
+			   struct ceph_osd_request *req)
+{
+	req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
+			       GFP_NOFS);
+	if (!req->r_pages) {
+		req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+		req->r_pages_from_pool = 1;
+		WARN_ON(!req->r_pages);
+	}
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+				 struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	pgoff_t index, start, end;
+	int range_whole = 0;
+	int should_loop = 1;
+	pgoff_t max_pages = 0, max_pages_ever = 0;
+	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+	struct pagevec pvec;
+	int done = 0;
+	int rc = 0;
+	unsigned wsize = 1 << inode->i_blkbits;
+	struct ceph_osd_request *req = NULL;
+	int do_sync;
+	u64 snap_size = 0;
+
+	/*
+	 * Include a 'sync' in the OSD request if this is a data
+	 * integrity write (e.g., O_SYNC write or fsync()), or if our
+	 * cap is being revoked.
+	 */
+	do_sync = wbc->sync_mode == WB_SYNC_ALL;
+	if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+		do_sync = 1;
+	dout("writepages_start %p dosync=%d (mode=%s)\n",
+	     inode, do_sync,
+	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+	client = ceph_inode_to_client(inode);
+	if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+		pr_warning("writepage_start %p on forced umount\n", inode);
+		return -EIO; /* we're in a forced umount, don't write! */
+	}
+	if (client->mount_args.wsize && client->mount_args.wsize < wsize)
+		wsize = client->mount_args.wsize;
+	if (wsize < PAGE_CACHE_SIZE)
+		wsize = PAGE_CACHE_SIZE;
+	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+
+	pagevec_init(&pvec, 0);
+
+	/* ?? */
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		dout(" writepages congested\n");
+		wbc->encountered_congestion = 1;
+		goto out_final;
+	}
+
+	/* where to start/end? */
+	if (wbc->range_cyclic) {
+		start = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+		dout(" cyclic, start at %lu\n", start);
+	} else {
+		start = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		should_loop = 0;
+		dout(" not cyclic, %lu to %lu\n", start, end);
+	}
+	index = start;
+
+retry:
+	/* find oldest snap context with dirty data */
+	ceph_put_snap_context(snapc);
+	snapc = get_oldest_context(inode, &snap_size);
+	if (!snapc) {
+		/* hmm, why does writepages get called when there
+		   is no dirty data? */
+		dout(" no snap context with dirty data?\n");
+		goto out;
+	}
+	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
+	     snapc, snapc->seq, snapc->num_snaps);
+	if (last_snapc && snapc != last_snapc) {
+		/* if we switched to a newer snapc, restart our scan at the
+		 * start of the original file range. */
+		dout("  snapc differs from last pass, restarting at %lu\n",
+		     index);
+		index = start;
+	}
+	last_snapc = snapc;
+
+	while (!done && index <= end) {
+		unsigned i;
+		int first;
+		pgoff_t next;
+		int pvec_pages, locked_pages;
+		struct page *page;
+		int want;
+		u64 offset, len;
+		struct ceph_osd_request_head *reqhead;
+		struct ceph_osd_op *op;
+
+		next = 0;
+		locked_pages = 0;
+		max_pages = max_pages_ever;
+
+get_more_pages:
+		first = -1;
+		want = min(end - index,
+			   min((pgoff_t)PAGEVEC_SIZE,
+			       max_pages - (pgoff_t)locked_pages) - 1)
+			+ 1;
+		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+						PAGECACHE_TAG_DIRTY,
+						want);
+		dout("pagevec_lookup_tag got %d\n", pvec_pages);
+		if (!pvec_pages && !locked_pages)
+			break;
+		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
+			page = pvec.pages[i];
+			dout("? %p idx %lu\n", page, page->index);
+			if (locked_pages == 0)
+				lock_page(page);  /* first page */
+			else if (!trylock_page(page))
+				break;
+
+			/* only dirty pages, or our accounting breaks */
+			if (unlikely(!PageDirty(page)) ||
+			    unlikely(page->mapping != mapping)) {
+				dout("!dirty or !mapping %p\n", page);
+				unlock_page(page);
+				break;
+			}
+			if (!wbc->range_cyclic && page->index > end) {
+				dout("end of range %p\n", page);
+				done = 1;
+				unlock_page(page);
+				break;
+			}
+			if (next && (page->index != next)) {
+				dout("not consecutive %p\n", page);
+				unlock_page(page);
+				break;
+			}
+			if (wbc->sync_mode != WB_SYNC_NONE) {
+				dout("waiting on writeback %p\n", page);
+				wait_on_page_writeback(page);
+			}
+			if ((snap_size && page_offset(page) > snap_size) ||
+			    (!snap_size &&
+			     page_offset(page) > i_size_read(inode))) {
+				dout("%p page eof %llu\n", page, snap_size ?
+				     snap_size : i_size_read(inode));
+				done = 1;
+				unlock_page(page);
+				break;
+			}
+			if (PageWriteback(page)) {
+				dout("%p under writeback\n", page);
+				unlock_page(page);
+				break;
+			}
+
+			/* only if matching snap context */
+			if (snapc != (void *)page->private) {
+				dout("page snapc %p != oldest %p\n",
+				     (void *)page->private, snapc);
+				unlock_page(page);
+				if (!locked_pages)
+					continue; /* keep looking for snap */
+				break;
+			}
+
+			if (!clear_page_dirty_for_io(page)) {
+				dout("%p !clear_page_dirty_for_io\n", page);
+				unlock_page(page);
+				break;
+			}
+
+			/* ok */
+			if (locked_pages == 0) {
+				/* prepare async write request */
+				offset = page->index << PAGE_CACHE_SHIFT;
+				len = wsize;
+				req = ceph_osdc_new_request(&client->osdc,
+					    &ci->i_layout,
+					    ceph_vino(inode),
+					    offset, &len,
+					    CEPH_OSD_OP_WRITE,
+					    CEPH_OSD_FLAG_WRITE |
+						    CEPH_OSD_FLAG_ONDISK,
+					    snapc, do_sync,
+					    ci->i_truncate_seq,
+					    ci->i_truncate_size,
+					    &inode->i_mtime, true, 1);
+				max_pages = req->r_num_pages;
+
+				alloc_page_vec(client, req);
+				req->r_callback = writepages_finish;
+				req->r_inode = inode;
+				req->r_wbc = wbc;
+			}
+
+			/* note position of first page in pvec */
+			if (first < 0)
+				first = i;
+			dout("%p will write page %p idx %lu\n",
+			     inode, page, page->index);
+			set_page_writeback(page);
+			req->r_pages[locked_pages] = page;
+			locked_pages++;
+			next = page->index + 1;
+		}
+
+		/* did we get anything? */
+		if (!locked_pages)
+			goto release_pvec_pages;
+		if (i) {
+			int j;
+			BUG_ON(!locked_pages || first < 0);
+
+			if (pvec_pages && i == pvec_pages &&
+			    locked_pages < max_pages) {
+				dout("reached end pvec, trying for more\n");
+				pagevec_reinit(&pvec);
+				goto get_more_pages;
+			}
+
+			/* shift unused pages over in the pvec...  we
+			 * will need to release them below. */
+			for (j = i; j < pvec_pages; j++) {
+				dout(" pvec leftover page %p\n",
+				     pvec.pages[j]);
+				pvec.pages[j-i+first] = pvec.pages[j];
+			}
+			pvec.nr -= i-first;
+		}
+
+		/* submit the write */
+		offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
+		len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+			  (u64)locked_pages << PAGE_CACHE_SHIFT);
+		dout("writepages got %d pages at %llu~%llu\n",
+		     locked_pages, offset, len);
+
+		/* revise final length, page count */
+		req->r_num_pages = locked_pages;
+		reqhead = req->r_request->front.iov_base;
+		op = (void *)(reqhead + 1);
+		op->extent.length = cpu_to_le64(len);
+		op->payload_len = cpu_to_le32(len);
+		req->r_request->hdr.data_len = cpu_to_le32(len);
+
+		ceph_osdc_start_request(&client->osdc, req, true);
+		req = NULL;
+
+		/* continue? */
+		index = next;
+		wbc->nr_to_write -= locked_pages;
+		if (wbc->nr_to_write <= 0)
+			done = 1;
+
+release_pvec_pages:
+		dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+		     pvec.nr ? pvec.pages[0] : NULL);
+		pagevec_release(&pvec);
+
+		if (locked_pages && !done)
+			goto retry;
+	}
+
+	if (should_loop && !done) {
+		/* more to do; loop back to beginning of file */
+		dout("writepages looping back to beginning of file\n");
+		should_loop = 0;
+		index = 0;
+		goto retry;
+	}
+
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+
+out:
+	if (req)
+		ceph_osdc_put_request(req);
+	if (rc > 0)
+		rc = 0;  /* vfs expects us to return 0 */
+	ceph_put_snap_context(snapc);
+	dout("writepages done, rc = %d\n", rc);
+out_final:
+	return rc;
+}
+
+
+
+/*
+ * See if a given @snapc is either writeable, or already written.
+ */
+static int context_is_writeable_or_written(struct inode *inode,
+					   struct ceph_snap_context *snapc)
+{
+	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+	return !oldest || snapc->seq <= oldest->seq;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	loff_t page_off = pos & PAGE_CACHE_MASK;
+	int pos_in_page = pos & ~PAGE_CACHE_MASK;
+	int end_in_page = pos_in_page + len;
+	loff_t i_size;
+	struct ceph_snap_context *snapc;
+	int r;
+
+	/* get a page*/
+retry:
+	page = grab_cache_page_write_begin(mapping, index, 0);
+	if (!page)
+		return -ENOMEM;
+	*pagep = page;
+
+	dout("write_begin file %p inode %p page %p %d~%d\n", file,
+	     inode, page, (int)pos, (int)len);
+
+retry_locked:
+	/* writepages currently holds page lock, but if we change that later, */
+	wait_on_page_writeback(page);
+
+	/* check snap context */
+	BUG_ON(!ci->i_snap_realm);
+	down_read(&mdsc->snap_rwsem);
+	BUG_ON(!ci->i_snap_realm->cached_context);
+	if (page->private &&
+	    (void *)page->private != ci->i_snap_realm->cached_context) {
+		/*
+		 * this page is already dirty in another (older) snap
+		 * context!  is it writeable now?
+		 */
+		snapc = get_oldest_context(inode, NULL);
+		up_read(&mdsc->snap_rwsem);
+
+		if (snapc != (void *)page->private) {
+			dout(" page %p snapc %p not current or oldest\n",
+			     page, (void *)page->private);
+			/*
+			 * queue for writeback, and wait for snapc to
+			 * be writeable or written
+			 */
+			snapc = ceph_get_snap_context((void *)page->private);
+			unlock_page(page);
+			if (ceph_queue_writeback(inode))
+				igrab(inode);
+			wait_event_interruptible(ci->i_cap_wq,
+			       context_is_writeable_or_written(inode, snapc));
+			ceph_put_snap_context(snapc);
+			goto retry;
+		}
+
+		/* yay, writeable, do it now (without dropping page lock) */
+		dout(" page %p snapc %p not current, but oldest\n",
+		     page, snapc);
+		if (!clear_page_dirty_for_io(page))
+			goto retry_locked;
+		r = writepage_nounlock(page, NULL);
+		if (r < 0)
+			goto fail_nosnap;
+		goto retry_locked;
+	}
+
+	if (PageUptodate(page)) {
+		dout(" page %p already uptodate\n", page);
+		return 0;
+	}
+
+	/* full page? */
+	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+		return 0;
+
+	/* past end of file? */
+	i_size = inode->i_size;   /* caller holds i_mutex */
+
+	if (i_size + len > inode->i_sb->s_maxbytes) {
+		/* file is too big */
+		r = -EINVAL;
+		goto fail;
+	}
+
+	if (page_off >= i_size ||
+	    (pos_in_page == 0 && (pos+len) >= i_size &&
+	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+		dout(" zeroing %p 0 - %d and %d - %d\n",
+		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+		zero_user_segments(page,
+				   0, pos_in_page,
+				   end_in_page, PAGE_CACHE_SIZE);
+		return 0;
+	}
+
+	/* we need to read it. */
+	up_read(&mdsc->snap_rwsem);
+	r = readpage_nounlock(file, page);
+	if (r < 0)
+		goto fail_nosnap;
+	goto retry_locked;
+
+fail:
+	up_read(&mdsc->snap_rwsem);
+fail_nosnap:
+	unlock_page(page);
+	return r;
+}
+
+/*
+ * we don't do anything in here that simple_write_end doesn't do
+ * except adjust dirty page accounting and drop read lock on
+ * mdsc->snap_rwsem.
+ */
+static int ceph_write_end(struct file *file, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	int check_cap = 0;
+
+	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
+	     inode, page, (int)pos, (int)copied, (int)len);
+
+	/* zero the stale part of the page if we did a short copy */
+	if (copied < len)
+		zero_user_segment(page, from+copied, len);
+
+	/* did file size increase? */
+	/* (no need for i_size_read(); we caller holds i_mutex */
+	if (pos+copied > inode->i_size)
+		check_cap = ceph_inode_set_size(inode, pos+copied);
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	set_page_dirty(page);
+
+	unlock_page(page);
+	up_read(&mdsc->snap_rwsem);
+	page_cache_release(page);
+
+	if (check_cap)
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+
+	return copied;
+}
+
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ */
+static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
+			      const struct iovec *iov,
+			      loff_t pos, unsigned long nr_segs)
+{
+	WARN_ON(1);
+	return -EINVAL;
+}
+
+const struct address_space_operations ceph_aops = {
+	.readpage = ceph_readpage,
+	.readpages = ceph_readpages,
+	.writepage = ceph_writepage,
+	.writepages = ceph_writepages_start,
+	.write_begin = ceph_write_begin,
+	.write_end = ceph_write_end,
+	.set_page_dirty = ceph_set_page_dirty,
+	.invalidatepage = ceph_invalidatepage,
+	.releasepage = ceph_releasepage,
+	.direct_IO = ceph_direct_io,
+};
+
+
+/*
+ * vm ops
+ */
+
+/*
+ * Reuse write_begin here for simplicity.
+ */
+static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct page *page = vmf->page;
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	loff_t off = page->index << PAGE_CACHE_SHIFT;
+	loff_t size, len;
+	struct page *locked_page = NULL;
+	void *fsdata = NULL;
+	int ret;
+
+	size = i_size_read(inode);
+	if (off + PAGE_CACHE_SIZE <= size)
+		len = PAGE_CACHE_SIZE;
+	else
+		len = size & ~PAGE_CACHE_MASK;
+
+	dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
+	     off, len, page, page->index);
+	ret = ceph_write_begin(vma->vm_file, inode->i_mapping, off, len, 0,
+			       &locked_page, &fsdata);
+	WARN_ON(page != locked_page);
+	if (!ret) {
+		/*
+		 * doing the following, instead of calling
+		 * ceph_write_end. Note that we keep the
+		 * page locked
+		 */
+		set_page_dirty(page);
+		up_read(&mdsc->snap_rwsem);
+		page_cache_release(page);
+		ret = VM_FAULT_LOCKED;
+	} else {
+		ret = VM_FAULT_SIGBUS;
+	}
+	dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
+	return ret;
+}
+
+static struct vm_operations_struct ceph_vmops = {
+	.fault		= filemap_fault,
+	.page_mkwrite	= ceph_page_mkwrite,
+};
+
+int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+	file_accessed(file);
+	vma->vm_ops = &ceph_vmops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	return 0;
+}
-- 
cgit v1.2.2


From 2f2dc053404febedc9c273452d9d518fb31fde72 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:09 -0700
Subject: ceph: MDS client

The MDS (metadata server) client is responsible for submitting
requests to the MDS cluster and parsing the response.  We decide which
MDS to submit each request to based on cached information about the
current partition of the directory hierarchy across the cluster.  A
stateful session is opened with each MDS before we submit requests to
it, and a mutex is used to control the ordering of messages within
each session.

An MDS request may generate two responses.  The first indicates the
operation was a success and returns any result.  A second reply is
sent when the operation commits to disk.  Note that locking on the MDS
ensures that the results of updates are visible only to the updating
client before the operation commits.  Requests are linked to the
containing directory so that an fsync will wait for them to commit.

If an MDS fails and/or recovers, we resubmit requests as needed.  We
also reconnect existing capabilities to a recovering MDS to
reestablish that shared session state.  Old dentry leases are
invalidated.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 2912 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/mds_client.h |  321 ++++++
 fs/ceph/mdsmap.c     |  166 +++
 fs/ceph/mdsmap.h     |   53 +
 4 files changed, 3452 insertions(+)
 create mode 100644 fs/ceph/mds_client.c
 create mode 100644 fs/ceph/mds_client.h
 create mode 100644 fs/ceph/mdsmap.c
 create mode 100644 fs/ceph/mdsmap.h

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..de8ba4a242ca
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,2912 @@
+#include "ceph_debug.h"
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include "mds_client.h"
+#include "mon_client.h"
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+
+/*
+ * A cluster of MDS (metadata server) daemons is responsible for
+ * managing the file system namespace (the directory hierarchy and
+ * inodes) and for coordinating shared access to storage.  Metadata is
+ * partitioning hierarchically across a number of servers, and that
+ * partition varies over time as the cluster adjusts the distribution
+ * in order to balance load.
+ *
+ * The MDS client is primarily responsible to managing synchronous
+ * metadata requests for operations like open, unlink, and so forth.
+ * If there is a MDS failure, we find out about it when we (possibly
+ * request and) receive a new MDS map, and can resubmit affected
+ * requests.
+ *
+ * For the most part, though, we take advantage of a lossless
+ * communications channel to the MDS, and do not need to worry about
+ * timing out or resubmitting requests.
+ *
+ * We maintain a stateful "session" with each MDS we interact with.
+ * Within each session, we sent periodic heartbeat messages to ensure
+ * any capabilities or leases we have been issues remain valid.  If
+ * the session times out and goes stale, our leases and capabilities
+ * are no longer valid.
+ */
+
+static void __wake_requests(struct ceph_mds_client *mdsc,
+			    struct list_head *head);
+
+const static struct ceph_connection_operations mds_con_ops;
+
+
+/*
+ * mds reply parsing
+ */
+
+/*
+ * parse individual inode info
+ */
+static int parse_reply_info_in(void **p, void *end,
+			       struct ceph_mds_reply_info_in *info)
+{
+	int err = -EIO;
+
+	info->in = *p;
+	*p += sizeof(struct ceph_mds_reply_inode) +
+		sizeof(*info->in->fragtree.splits) *
+		le32_to_cpu(info->in->fragtree.nsplits);
+
+	ceph_decode_32_safe(p, end, info->symlink_len, bad);
+	ceph_decode_need(p, end, info->symlink_len, bad);
+	info->symlink = *p;
+	*p += info->symlink_len;
+
+	ceph_decode_32_safe(p, end, info->xattr_len, bad);
+	ceph_decode_need(p, end, info->xattr_len, bad);
+	info->xattr_data = *p;
+	*p += info->xattr_len;
+	return 0;
+bad:
+	return err;
+}
+
+/*
+ * parse a normal reply, which may contain a (dir+)dentry and/or a
+ * target inode.
+ */
+static int parse_reply_info_trace(void **p, void *end,
+				  struct ceph_mds_reply_info_parsed *info)
+{
+	int err;
+
+	if (info->head->is_dentry) {
+		err = parse_reply_info_in(p, end, &info->diri);
+		if (err < 0)
+			goto out_bad;
+
+		if (unlikely(*p + sizeof(*info->dirfrag) > end))
+			goto bad;
+		info->dirfrag = *p;
+		*p += sizeof(*info->dirfrag) +
+			sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
+		if (unlikely(*p > end))
+			goto bad;
+
+		ceph_decode_32_safe(p, end, info->dname_len, bad);
+		ceph_decode_need(p, end, info->dname_len, bad);
+		info->dname = *p;
+		*p += info->dname_len;
+		info->dlease = *p;
+		*p += sizeof(*info->dlease);
+	}
+
+	if (info->head->is_target) {
+		err = parse_reply_info_in(p, end, &info->targeti);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	if (unlikely(*p != end))
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("problem parsing mds trace %d\n", err);
+	return err;
+}
+
+/*
+ * parse readdir results
+ */
+static int parse_reply_info_dir(void **p, void *end,
+				struct ceph_mds_reply_info_parsed *info)
+{
+	u32 num, i = 0;
+	int err;
+
+	info->dir_dir = *p;
+	if (*p + sizeof(*info->dir_dir) > end)
+		goto bad;
+	*p += sizeof(*info->dir_dir) +
+		sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
+	if (*p > end)
+		goto bad;
+
+	ceph_decode_need(p, end, sizeof(num) + 2, bad);
+	ceph_decode_32(p, num);
+	ceph_decode_8(p, info->dir_end);
+	ceph_decode_8(p, info->dir_complete);
+	if (num == 0)
+		goto done;
+
+	/* alloc large array */
+	info->dir_nr = num;
+	info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
+			       sizeof(*info->dir_dname) +
+			       sizeof(*info->dir_dname_len) +
+			       sizeof(*info->dir_dlease),
+			       GFP_NOFS);
+	if (info->dir_in == NULL) {
+		err = -ENOMEM;
+		goto out_bad;
+	}
+	info->dir_dname = (void *)(info->dir_in + num);
+	info->dir_dname_len = (void *)(info->dir_dname + num);
+	info->dir_dlease = (void *)(info->dir_dname_len + num);
+
+	while (num) {
+		/* dentry */
+		ceph_decode_need(p, end, sizeof(u32)*2, bad);
+		ceph_decode_32(p, info->dir_dname_len[i]);
+		ceph_decode_need(p, end, info->dir_dname_len[i], bad);
+		info->dir_dname[i] = *p;
+		*p += info->dir_dname_len[i];
+		dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
+		     info->dir_dname[i]);
+		info->dir_dlease[i] = *p;
+		*p += sizeof(struct ceph_mds_reply_lease);
+
+		/* inode */
+		err = parse_reply_info_in(p, end, &info->dir_in[i]);
+		if (err < 0)
+			goto out_bad;
+		i++;
+		num--;
+	}
+
+done:
+	if (*p != end)
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("problem parsing dir contents %d\n", err);
+	return err;
+}
+
+/*
+ * parse entire mds reply
+ */
+static int parse_reply_info(struct ceph_msg *msg,
+			    struct ceph_mds_reply_info_parsed *info)
+{
+	void *p, *end;
+	u32 len;
+	int err;
+
+	info->head = msg->front.iov_base;
+	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
+	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
+
+	/* trace */
+	ceph_decode_32_safe(&p, end, len, bad);
+	if (len > 0) {
+		err = parse_reply_info_trace(&p, p+len, info);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	/* dir content */
+	ceph_decode_32_safe(&p, end, len, bad);
+	if (len > 0) {
+		err = parse_reply_info_dir(&p, p+len, info);
+		if (err < 0)
+			goto out_bad;
+	}
+
+	/* snap blob */
+	ceph_decode_32_safe(&p, end, len, bad);
+	info->snapblob_len = len;
+	info->snapblob = p;
+	p += len;
+
+	if (p != end)
+		goto bad;
+	return 0;
+
+bad:
+	err = -EIO;
+out_bad:
+	pr_err("mds parse_reply err %d\n", err);
+	return err;
+}
+
+static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
+{
+	kfree(info->dir_in);
+}
+
+
+/*
+ * sessions
+ */
+static const char *session_state_name(int s)
+{
+	switch (s) {
+	case CEPH_MDS_SESSION_NEW: return "new";
+	case CEPH_MDS_SESSION_OPENING: return "opening";
+	case CEPH_MDS_SESSION_OPEN: return "open";
+	case CEPH_MDS_SESSION_HUNG: return "hung";
+	case CEPH_MDS_SESSION_CLOSING: return "closing";
+	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+	default: return "???";
+	}
+}
+
+static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+{
+	if (atomic_inc_not_zero(&s->s_ref)) {
+		dout("mdsc get_session %p %d -> %d\n", s,
+		     atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+		return s;
+	} else {
+		dout("mdsc get_session %p 0 -- FAIL", s);
+		return NULL;
+	}
+}
+
+void ceph_put_mds_session(struct ceph_mds_session *s)
+{
+	dout("mdsc put_session %p %d -> %d\n", s,
+	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
+	if (atomic_dec_and_test(&s->s_ref)) {
+		ceph_con_shutdown(&s->s_con);
+		kfree(s);
+	}
+}
+
+/*
+ * called under mdsc->mutex
+ */
+struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
+						   int mds)
+{
+	struct ceph_mds_session *session;
+
+	if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
+		return NULL;
+	session = mdsc->sessions[mds];
+	dout("lookup_mds_session %p %d\n", session,
+	     atomic_read(&session->s_ref));
+	get_session(session);
+	return session;
+}
+
+static bool __have_session(struct ceph_mds_client *mdsc, int mds)
+{
+	if (mds >= mdsc->max_sessions)
+		return false;
+	return mdsc->sessions[mds];
+}
+
+/*
+ * create+register a new session for given mds.
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
+						 int mds)
+{
+	struct ceph_mds_session *s;
+
+	s = kzalloc(sizeof(*s), GFP_NOFS);
+	s->s_mdsc = mdsc;
+	s->s_mds = mds;
+	s->s_state = CEPH_MDS_SESSION_NEW;
+	s->s_ttl = 0;
+	s->s_seq = 0;
+	mutex_init(&s->s_mutex);
+
+	ceph_con_init(mdsc->client->msgr, &s->s_con);
+	s->s_con.private = s;
+	s->s_con.ops = &mds_con_ops;
+	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
+	s->s_con.peer_name.num = cpu_to_le64(mds);
+	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+	spin_lock_init(&s->s_cap_lock);
+	s->s_cap_gen = 0;
+	s->s_cap_ttl = 0;
+	s->s_renew_requested = 0;
+	s->s_renew_seq = 0;
+	INIT_LIST_HEAD(&s->s_caps);
+	s->s_nr_caps = 0;
+	atomic_set(&s->s_ref, 1);
+	INIT_LIST_HEAD(&s->s_waiting);
+	INIT_LIST_HEAD(&s->s_unsafe);
+	s->s_num_cap_releases = 0;
+	INIT_LIST_HEAD(&s->s_cap_releases);
+	INIT_LIST_HEAD(&s->s_cap_releases_done);
+	INIT_LIST_HEAD(&s->s_cap_flushing);
+	INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
+
+	dout("register_session mds%d\n", mds);
+	if (mds >= mdsc->max_sessions) {
+		int newmax = 1 << get_count_order(mds+1);
+		struct ceph_mds_session **sa;
+
+		dout("register_session realloc to %d\n", newmax);
+		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+		if (sa == NULL)
+			return ERR_PTR(-ENOMEM);
+		if (mdsc->sessions) {
+			memcpy(sa, mdsc->sessions,
+			       mdsc->max_sessions * sizeof(void *));
+			kfree(mdsc->sessions);
+		}
+		mdsc->sessions = sa;
+		mdsc->max_sessions = newmax;
+	}
+	mdsc->sessions[mds] = s;
+	atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
+	return s;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void unregister_session(struct ceph_mds_client *mdsc, int mds)
+{
+	dout("unregister_session mds%d %p\n", mds, mdsc->sessions[mds]);
+	ceph_put_mds_session(mdsc->sessions[mds]);
+	mdsc->sessions[mds] = NULL;
+}
+
+/*
+ * drop session refs in request.
+ *
+ * should be last request ref, or hold mdsc->mutex
+ */
+static void put_request_session(struct ceph_mds_request *req)
+{
+	if (req->r_session) {
+		ceph_put_mds_session(req->r_session);
+		req->r_session = NULL;
+	}
+}
+
+void ceph_mdsc_put_request(struct ceph_mds_request *req)
+{
+	dout("mdsc put_request %p %d -> %d\n", req,
+	     atomic_read(&req->r_ref), atomic_read(&req->r_ref)-1);
+	if (atomic_dec_and_test(&req->r_ref)) {
+		if (req->r_request)
+			ceph_msg_put(req->r_request);
+		if (req->r_reply) {
+			ceph_msg_put(req->r_reply);
+			destroy_reply_info(&req->r_reply_info);
+		}
+		if (req->r_inode) {
+			ceph_put_cap_refs(ceph_inode(req->r_inode),
+					  CEPH_CAP_PIN);
+			iput(req->r_inode);
+		}
+		if (req->r_locked_dir)
+			ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
+					  CEPH_CAP_PIN);
+		if (req->r_target_inode)
+			iput(req->r_target_inode);
+		if (req->r_dentry)
+			dput(req->r_dentry);
+		if (req->r_old_dentry) {
+			ceph_put_cap_refs(
+			     ceph_inode(req->r_old_dentry->d_parent->d_inode),
+			     CEPH_CAP_PIN);
+			dput(req->r_old_dentry);
+		}
+		kfree(req->r_path1);
+		kfree(req->r_path2);
+		put_request_session(req);
+		ceph_unreserve_caps(&req->r_caps_reservation);
+		kfree(req);
+	}
+}
+
+/*
+ * lookup session, bump ref if found.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
+					     u64 tid)
+{
+	struct ceph_mds_request *req;
+	req = radix_tree_lookup(&mdsc->request_tree, tid);
+	if (req)
+		ceph_mdsc_get_request(req);
+	return req;
+}
+
+/*
+ * Register an in-flight request, and assign a tid.  Link to directory
+ * are modifying (if any).
+ *
+ * Called under mdsc->mutex.
+ */
+static void __register_request(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_request *req,
+			       struct inode *dir)
+{
+	req->r_tid = ++mdsc->last_tid;
+	if (req->r_num_caps)
+		ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
+	dout("__register_request %p tid %lld\n", req, req->r_tid);
+	ceph_mdsc_get_request(req);
+	radix_tree_insert(&mdsc->request_tree, req->r_tid, (void *)req);
+
+	if (dir) {
+		struct ceph_inode_info *ci = ceph_inode(dir);
+
+		spin_lock(&ci->i_unsafe_lock);
+		req->r_unsafe_dir = dir;
+		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
+}
+
+static void __unregister_request(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_request *req)
+{
+	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+	radix_tree_delete(&mdsc->request_tree, req->r_tid);
+	ceph_mdsc_put_request(req);
+
+	if (req->r_unsafe_dir) {
+		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
+
+		spin_lock(&ci->i_unsafe_lock);
+		list_del_init(&req->r_unsafe_dir_item);
+		spin_unlock(&ci->i_unsafe_lock);
+	}
+}
+
+/*
+ * Choose mds to send request to next.  If there is a hint set in the
+ * request (e.g., due to a prior forward hint from the mds), use that.
+ * Otherwise, consult frag tree and/or caps to identify the
+ * appropriate mds.  If all else fails, choose randomly.
+ *
+ * Called under mdsc->mutex.
+ */
+static int __choose_mds(struct ceph_mds_client *mdsc,
+			struct ceph_mds_request *req)
+{
+	struct inode *inode;
+	struct ceph_inode_info *ci;
+	struct ceph_cap *cap;
+	int mode = req->r_direct_mode;
+	int mds = -1;
+	u32 hash = req->r_direct_hash;
+	bool is_hash = req->r_direct_is_hash;
+
+	/*
+	 * is there a specific mds we should try?  ignore hint if we have
+	 * no session and the mds is not up (active or recovering).
+	 */
+	if (req->r_resend_mds >= 0 &&
+	    (__have_session(mdsc, req->r_resend_mds) ||
+	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
+		dout("choose_mds using resend_mds mds%d\n",
+		     req->r_resend_mds);
+		return req->r_resend_mds;
+	}
+
+	if (mode == USE_RANDOM_MDS)
+		goto random;
+
+	inode = NULL;
+	if (req->r_inode) {
+		inode = req->r_inode;
+	} else if (req->r_dentry) {
+		if (req->r_dentry->d_inode) {
+			inode = req->r_dentry->d_inode;
+		} else {
+			inode = req->r_dentry->d_parent->d_inode;
+			hash = req->r_dentry->d_name.hash;
+			is_hash = true;
+		}
+	}
+	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
+	     (int)hash, mode);
+	if (!inode)
+		goto random;
+	ci = ceph_inode(inode);
+
+	if (is_hash && S_ISDIR(inode->i_mode)) {
+		struct ceph_inode_frag frag;
+		int found;
+
+		ceph_choose_frag(ci, hash, &frag, &found);
+		if (found) {
+			if (mode == USE_ANY_MDS && frag.ndist > 0) {
+				u8 r;
+
+				/* choose a random replica */
+				get_random_bytes(&r, 1);
+				r %= frag.ndist;
+				mds = frag.dist[r];
+				dout("choose_mds %p %llx.%llx "
+				     "frag %u mds%d (%d/%d)\n",
+				     inode, ceph_vinop(inode),
+				     frag.frag, frag.mds,
+				     (int)r, frag.ndist);
+				return mds;
+			}
+
+			/* since this file/dir wasn't known to be
+			 * replicated, then we want to look for the
+			 * authoritative mds. */
+			mode = USE_AUTH_MDS;
+			if (frag.mds >= 0) {
+				/* choose auth mds */
+				mds = frag.mds;
+				dout("choose_mds %p %llx.%llx "
+				     "frag %u mds%d (auth)\n",
+				     inode, ceph_vinop(inode), frag.frag, mds);
+				return mds;
+			}
+		}
+	}
+
+	spin_lock(&inode->i_lock);
+	cap = NULL;
+	if (mode == USE_AUTH_MDS)
+		cap = ci->i_auth_cap;
+	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
+		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
+	if (!cap) {
+		spin_unlock(&inode->i_lock);
+		goto random;
+	}
+	mds = cap->session->s_mds;
+	dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+	     inode, ceph_vinop(inode), mds,
+	     cap == ci->i_auth_cap ? "auth " : "", cap);
+	spin_unlock(&inode->i_lock);
+	return mds;
+
+random:
+	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
+	dout("choose_mds chose random mds%d\n", mds);
+	return mds;
+}
+
+
+/*
+ * session messages
+ */
+static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_session_head *h;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
+	if (IS_ERR(msg)) {
+		pr_err("create_session_msg ENOMEM creating msg\n");
+		return ERR_PTR(PTR_ERR(msg));
+	}
+	h = msg->front.iov_base;
+	h->op = cpu_to_le32(op);
+	h->seq = cpu_to_le64(seq);
+	return msg;
+}
+
+/*
+ * send session open request.
+ *
+ * called under mdsc->mutex
+ */
+static int __open_session(struct ceph_mds_client *mdsc,
+			  struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	int mstate;
+	int mds = session->s_mds;
+	int err = 0;
+
+	/* wait for mds to go active? */
+	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
+	dout("open_session to mds%d (%s)\n", mds,
+	     ceph_mds_state_name(mstate));
+	session->s_state = CEPH_MDS_SESSION_OPENING;
+	session->s_renew_requested = jiffies;
+
+	/* send connect message */
+	msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
+	if (IS_ERR(msg)) {
+		err = PTR_ERR(msg);
+		goto out;
+	}
+	ceph_con_send(&session->s_con, msg);
+
+out:
+	return 0;
+}
+
+/*
+ * session caps
+ */
+
+/*
+ * Free preallocated cap messages assigned to this session
+ */
+static void cleanup_cap_releases(struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+
+	spin_lock(&session->s_cap_lock);
+	while (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+	}
+	while (!list_empty(&session->s_cap_releases_done)) {
+		msg = list_first_entry(&session->s_cap_releases_done,
+				       struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+	}
+	spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * Helper to safely iterate over all caps associated with a session.
+ *
+ * caller must hold session s_mutex
+ */
+static int iterate_session_caps(struct ceph_mds_session *session,
+				 int (*cb)(struct inode *, struct ceph_cap *,
+					    void *), void *arg)
+{
+	struct ceph_cap *cap, *ncap;
+	struct inode *inode;
+	int ret;
+
+	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+	spin_lock(&session->s_cap_lock);
+	list_for_each_entry_safe(cap, ncap, &session->s_caps, session_caps) {
+		inode = igrab(&cap->ci->vfs_inode);
+		if (!inode)
+			continue;
+		spin_unlock(&session->s_cap_lock);
+		ret = cb(inode, cap, arg);
+		iput(inode);
+		if (ret < 0)
+			return ret;
+		spin_lock(&session->s_cap_lock);
+	}
+	spin_unlock(&session->s_cap_lock);
+
+	return 0;
+}
+
+static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
+				   void *arg)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	dout("removing cap %p, ci is %p, inode is %p\n",
+	     cap, ci, &ci->vfs_inode);
+	ceph_remove_cap(cap);
+	return 0;
+}
+
+/*
+ * caller must hold session s_mutex
+ */
+static void remove_session_caps(struct ceph_mds_session *session)
+{
+	dout("remove_session_caps on %p\n", session);
+	iterate_session_caps(session, remove_session_caps_cb, NULL);
+	BUG_ON(session->s_nr_caps > 0);
+	cleanup_cap_releases(session);
+}
+
+/*
+ * wake up any threads waiting on this session's caps.  if the cap is
+ * old (didn't get renewed on the client reconnect), remove it now.
+ *
+ * caller must hold s_mutex.
+ */
+static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
+			      void *arg)
+{
+	struct ceph_mds_session *session = arg;
+
+	spin_lock(&inode->i_lock);
+	if (cap->gen != session->s_cap_gen) {
+		pr_err("failed reconnect %p %llx.%llx cap %p "
+		       "(gen %d < session %d)\n", inode, ceph_vinop(inode),
+		       cap, cap->gen, session->s_cap_gen);
+		__ceph_remove_cap(cap, NULL);
+	}
+	wake_up(&ceph_inode(inode)->i_cap_wq);
+	spin_unlock(&inode->i_lock);
+	return 0;
+}
+
+static void wake_up_session_caps(struct ceph_mds_session *session)
+{
+	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+	iterate_session_caps(session, wake_up_session_cb, session);
+}
+
+/*
+ * Send periodic message to MDS renewing all currently held caps.  The
+ * ack will reset the expiration for all caps from this session.
+ *
+ * caller holds s_mutex
+ */
+static int send_renew_caps(struct ceph_mds_client *mdsc,
+			   struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	int state;
+
+	if (time_after_eq(jiffies, session->s_cap_ttl) &&
+	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
+		pr_info("mds%d caps stale\n", session->s_mds);
+
+	/* do not try to renew caps until a recovering mds has reconnected
+	 * with its clients. */
+	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
+	if (state < CEPH_MDS_STATE_RECONNECT) {
+		dout("send_renew_caps ignoring mds%d (%s)\n",
+		     session->s_mds, ceph_mds_state_name(state));
+		return 0;
+	}
+
+	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
+		ceph_mds_state_name(state));
+	session->s_renew_requested = jiffies;
+	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+				 ++session->s_renew_seq);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+/*
+ * Note new cap ttl, and any transition from stale -> not stale (fresh?).
+ */
+static void renewed_caps(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session, int is_renew)
+{
+	int was_stale;
+	int wake = 0;
+
+	spin_lock(&session->s_cap_lock);
+	was_stale = is_renew && (session->s_cap_ttl == 0 ||
+				 time_after_eq(jiffies, session->s_cap_ttl));
+
+	session->s_cap_ttl = session->s_renew_requested +
+		mdsc->mdsmap->m_session_timeout*HZ;
+
+	if (was_stale) {
+		if (time_before(jiffies, session->s_cap_ttl)) {
+			pr_info("mds%d caps renewed\n", session->s_mds);
+			wake = 1;
+		} else {
+			pr_info("mds%d caps still stale\n", session->s_mds);
+		}
+	}
+	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
+	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
+	     time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+	spin_unlock(&session->s_cap_lock);
+
+	if (wake)
+		wake_up_session_caps(session);
+}
+
+/*
+ * send a session close request
+ */
+static int request_close_session(struct ceph_mds_client *mdsc,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+	int err = 0;
+
+	dout("request_close_session mds%d state %s seq %lld\n",
+	     session->s_mds, session_state_name(session->s_state),
+	     session->s_seq);
+	msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+	if (IS_ERR(msg))
+		err = PTR_ERR(msg);
+	else
+		ceph_con_send(&session->s_con, msg);
+	return err;
+}
+
+/*
+ * Called with s_mutex held.
+ */
+static int __close_session(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session)
+{
+	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
+		return 0;
+	session->s_state = CEPH_MDS_SESSION_CLOSING;
+	return request_close_session(mdsc, session);
+}
+
+/*
+ * Trim old(er) caps.
+ *
+ * Because we can't cache an inode without one or more caps, we do
+ * this indirectly: if a cap is unused, we prune its aliases, at which
+ * point the inode will hopefully get dropped to.
+ *
+ * Yes, this is a bit sloppy.  Our only real goal here is to respond to
+ * memory pressure from the MDS, though, so it needn't be perfect.
+ */
+static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+{
+	struct ceph_mds_session *session = arg;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int used, oissued, mine;
+
+	if (session->s_trim_caps <= 0)
+		return -1;
+
+	spin_lock(&inode->i_lock);
+	mine = cap->issued | cap->implemented;
+	used = __ceph_caps_used(ci);
+	oissued = __ceph_caps_issued_other(ci, cap);
+
+	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
+	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
+	     ceph_cap_string(used));
+	if (ci->i_dirty_caps)
+		goto out;   /* dirty caps */
+	if ((used & ~oissued) & mine)
+		goto out;   /* we need these caps */
+
+	session->s_trim_caps--;
+	if (oissued) {
+		/* we aren't the only cap.. just remove us */
+		__ceph_remove_cap(cap, NULL);
+	} else {
+		/* try to drop referring dentries */
+		spin_unlock(&inode->i_lock);
+		d_prune_aliases(inode);
+		dout("trim_caps_cb %p cap %p  pruned, count now %d\n",
+		     inode, cap, atomic_read(&inode->i_count));
+		return 0;
+	}
+
+out:
+	spin_unlock(&inode->i_lock);
+	return 0;
+}
+
+/*
+ * Trim session cap count down to some max number.
+ */
+static int trim_caps(struct ceph_mds_client *mdsc,
+		     struct ceph_mds_session *session,
+		     int max_caps)
+{
+	int trim_caps = session->s_nr_caps - max_caps;
+
+	dout("trim_caps mds%d start: %d / %d, trim %d\n",
+	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+	if (trim_caps > 0) {
+		session->s_trim_caps = trim_caps;
+		iterate_session_caps(session, trim_caps_cb, session);
+		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
+		     session->s_mds, session->s_nr_caps, max_caps,
+			trim_caps - session->s_trim_caps);
+	}
+	return 0;
+}
+
+/*
+ * Allocate cap_release messages.  If there is a partially full message
+ * in the queue, try to allocate enough to cover it's remainder, so that
+ * we can send it immediately.
+ *
+ * Called under s_mutex.
+ */
+static int add_cap_releases(struct ceph_mds_client *mdsc,
+			    struct ceph_mds_session *session,
+			    int extra)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_cap_release *head;
+	int err = -ENOMEM;
+
+	if (extra < 0)
+		extra = mdsc->client->mount_args.cap_release_safety;
+
+	spin_lock(&session->s_cap_lock);
+
+	if (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg,
+				 list_head);
+		head = msg->front.iov_base;
+		extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+	}
+
+	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
+		spin_unlock(&session->s_cap_lock);
+		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
+				   0, 0, NULL);
+		if (!msg)
+			goto out_unlocked;
+		dout("add_cap_releases %p msg %p now %d\n", session, msg,
+		     (int)msg->front.iov_len);
+		head = msg->front.iov_base;
+		head->num = cpu_to_le32(0);
+		msg->front.iov_len = sizeof(*head);
+		spin_lock(&session->s_cap_lock);
+		list_add(&msg->list_head, &session->s_cap_releases);
+		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
+	}
+
+	if (!list_empty(&session->s_cap_releases)) {
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg,
+				       list_head);
+		head = msg->front.iov_base;
+		if (head->num) {
+			dout(" queueing non-full %p (%d)\n", msg,
+			     le32_to_cpu(head->num));
+			list_move_tail(&msg->list_head,
+				      &session->s_cap_releases_done);
+			session->s_num_cap_releases -=
+				CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+		}
+	}
+	err = 0;
+	spin_unlock(&session->s_cap_lock);
+out_unlocked:
+	return err;
+}
+
+/*
+ * flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
+ */
+static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+{
+	int mds, ret = 1;
+
+	dout("check_cap_flush want %lld\n", want_flush_seq);
+	mutex_lock(&mdsc->mutex);
+	for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+		struct ceph_mds_session *session = mdsc->sessions[mds];
+
+		if (!session)
+			continue;
+		get_session(session);
+		mutex_unlock(&mdsc->mutex);
+
+		mutex_lock(&session->s_mutex);
+		if (!list_empty(&session->s_cap_flushing)) {
+			struct ceph_inode_info *ci =
+				list_entry(session->s_cap_flushing.next,
+					   struct ceph_inode_info,
+					   i_flushing_item);
+			struct inode *inode = &ci->vfs_inode;
+
+			spin_lock(&inode->i_lock);
+			if (ci->i_cap_flush_seq <= want_flush_seq) {
+				dout("check_cap_flush still flushing %p "
+				     "seq %lld <= %lld to mds%d\n", inode,
+				     ci->i_cap_flush_seq, want_flush_seq,
+				     session->s_mds);
+				ret = 0;
+			}
+			spin_unlock(&inode->i_lock);
+		}
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+
+		if (!ret)
+			return ret;
+		mutex_lock(&mdsc->mutex);
+	}
+
+	mutex_unlock(&mdsc->mutex);
+	dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+	return ret;
+}
+
+/*
+ * called under s_mutex
+ */
+static void send_cap_releases(struct ceph_mds_client *mdsc,
+		       struct ceph_mds_session *session)
+{
+	struct ceph_msg *msg;
+
+	dout("send_cap_releases mds%d\n", session->s_mds);
+	while (1) {
+		spin_lock(&session->s_cap_lock);
+		if (list_empty(&session->s_cap_releases_done))
+			break;
+		msg = list_first_entry(&session->s_cap_releases_done,
+				 struct ceph_msg, list_head);
+		list_del_init(&msg->list_head);
+		spin_unlock(&session->s_cap_lock);
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+		ceph_con_send(&session->s_con, msg);
+	}
+	spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * requests
+ */
+
+/*
+ * Create an mds request.
+ */
+struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
+{
+	struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+
+	if (!req)
+		return ERR_PTR(-ENOMEM);
+
+	req->r_started = jiffies;
+	req->r_resend_mds = -1;
+	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+	req->r_fmode = -1;
+	atomic_set(&req->r_ref, 1);  /* one for request_tree, one for caller */
+	INIT_LIST_HEAD(&req->r_wait);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+
+	req->r_op = op;
+	req->r_direct_mode = mode;
+	return req;
+}
+
+/*
+ * return oldest (lowest) tid in request tree, 0 if none.
+ *
+ * called under mdsc->mutex.
+ */
+static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_request *first;
+	if (radix_tree_gang_lookup(&mdsc->request_tree,
+				   (void **)&first, 0, 1) <= 0)
+		return 0;
+	return first->r_tid;
+}
+
+/*
+ * Build a dentry's path.  Allocate on heap; caller must kfree.  Based
+ * on build_path_from_dentry in fs/cifs/dir.c.
+ *
+ * If @stop_on_nosnap, generate path relative to the first non-snapped
+ * inode.
+ *
+ * Encode hidden .snap dirs as a double /, i.e.
+ *   foo/.snap/bar -> foo//bar
+ */
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+			   int stop_on_nosnap)
+{
+	struct dentry *temp;
+	char *path;
+	int len, pos;
+
+	if (dentry == NULL)
+		return ERR_PTR(-EINVAL);
+
+retry:
+	len = 0;
+	for (temp = dentry; !IS_ROOT(temp);) {
+		struct inode *inode = temp->d_inode;
+		if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
+			len++;  /* slash only */
+		else if (stop_on_nosnap && inode &&
+			 ceph_snap(inode) == CEPH_NOSNAP)
+			break;
+		else
+			len += 1 + temp->d_name.len;
+		temp = temp->d_parent;
+		if (temp == NULL) {
+			pr_err("build_path_dentry corrupt dentry %p\n", dentry);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+	if (len)
+		len--;  /* no leading '/' */
+
+	path = kmalloc(len+1, GFP_NOFS);
+	if (path == NULL)
+		return ERR_PTR(-ENOMEM);
+	pos = len;
+	path[pos] = 0;	/* trailing null */
+	for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+		struct inode *inode = temp->d_inode;
+
+		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
+			dout("build_path_dentry path+%d: %p SNAPDIR\n",
+			     pos, temp);
+		} else if (stop_on_nosnap && inode &&
+			   ceph_snap(inode) == CEPH_NOSNAP) {
+			break;
+		} else {
+			pos -= temp->d_name.len;
+			if (pos < 0)
+				break;
+			strncpy(path + pos, temp->d_name.name,
+				temp->d_name.len);
+			dout("build_path_dentry path+%d: %p '%.*s'\n",
+			     pos, temp, temp->d_name.len, path + pos);
+		}
+		if (pos)
+			path[--pos] = '/';
+		temp = temp->d_parent;
+		if (temp == NULL) {
+			pr_err("build_path_dentry corrupt dentry\n");
+			kfree(path);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+	if (pos != 0) {
+		pr_err("build_path_dentry did not end path lookup where "
+		       "expected, namelen is %d, pos is %d\n", len, pos);
+		/* presumably this is only possible if racing with a
+		   rename of one of the parent directories (we can not
+		   lock the dentries above us to prevent this, but
+		   retrying should be harmless) */
+		kfree(path);
+		goto retry;
+	}
+
+	*base = ceph_ino(temp->d_inode);
+	*plen = len;
+	dout("build_path_dentry on %p %d built %llx '%.*s'\n",
+	     dentry, atomic_read(&dentry->d_count), *base, len, path);
+	return path;
+}
+
+static int build_dentry_path(struct dentry *dentry,
+			     const char **ppath, int *ppathlen, u64 *pino,
+			     int *pfreepath)
+{
+	char *path;
+
+	if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
+		*pino = ceph_ino(dentry->d_parent->d_inode);
+		*ppath = dentry->d_name.name;
+		*ppathlen = dentry->d_name.len;
+		return 0;
+	}
+	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	*ppath = path;
+	*pfreepath = 1;
+	return 0;
+}
+
+static int build_inode_path(struct inode *inode,
+			    const char **ppath, int *ppathlen, u64 *pino,
+			    int *pfreepath)
+{
+	struct dentry *dentry;
+	char *path;
+
+	if (ceph_snap(inode) == CEPH_NOSNAP) {
+		*pino = ceph_ino(inode);
+		*ppathlen = 0;
+		return 0;
+	}
+	dentry = d_find_alias(inode);
+	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+	dput(dentry);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+	*ppath = path;
+	*pfreepath = 1;
+	return 0;
+}
+
+/*
+ * request arguments may be specified via an inode *, a dentry *, or
+ * an explicit ino+path.
+ */
+static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
+				  const char *rpath, u64 rino,
+				  const char **ppath, int *pathlen,
+				  u64 *ino, int *freepath)
+{
+	int r = 0;
+
+	if (rinode) {
+		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+		     ceph_snap(rinode));
+	} else if (rdentry) {
+		r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
+		     *ppath);
+	} else if (rpath) {
+		*ino = rino;
+		*ppath = rpath;
+		*pathlen = strlen(rpath);
+		dout(" path %.*s\n", *pathlen, rpath);
+	}
+
+	return r;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+					       struct ceph_mds_request *req,
+					       int mds)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_request_head *head;
+	const char *path1 = NULL;
+	const char *path2 = NULL;
+	u64 ino1 = 0, ino2 = 0;
+	int pathlen1 = 0, pathlen2 = 0;
+	int freepath1 = 0, freepath2 = 0;
+	int len;
+	u16 releases;
+	void *p, *end;
+	int ret;
+
+	ret = set_request_path_attr(req->r_inode, req->r_dentry,
+			      req->r_path1, req->r_ino1.ino,
+			      &path1, &pathlen1, &ino1, &freepath1);
+	if (ret < 0) {
+		msg = ERR_PTR(ret);
+		goto out;
+	}
+
+	ret = set_request_path_attr(NULL, req->r_old_dentry,
+			      req->r_path2, req->r_ino2.ino,
+			      &path2, &pathlen2, &ino2, &freepath2);
+	if (ret < 0) {
+		msg = ERR_PTR(ret);
+		goto out_free1;
+	}
+
+	len = sizeof(*head) +
+		pathlen1 + pathlen2 + 2*(sizeof(u32) + sizeof(u64));
+
+	/* calculate (max) length for cap releases */
+	len += sizeof(struct ceph_mds_request_release) *
+		(!!req->r_inode_drop + !!req->r_dentry_drop +
+		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+	if (req->r_dentry_drop)
+		len += req->r_dentry->d_name.len;
+	if (req->r_old_dentry_drop)
+		len += req->r_old_dentry->d_name.len;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
+	if (IS_ERR(msg))
+		goto out_free2;
+
+	head = msg->front.iov_base;
+	p = msg->front.iov_base + sizeof(*head);
+	end = msg->front.iov_base + msg->front.iov_len;
+
+	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+	head->op = cpu_to_le32(req->r_op);
+	head->caller_uid = cpu_to_le32(current_fsuid());
+	head->caller_gid = cpu_to_le32(current_fsgid());
+	head->args = req->r_args;
+
+	ceph_encode_filepath(&p, end, ino1, path1);
+	ceph_encode_filepath(&p, end, ino2, path2);
+
+	/* cap releases */
+	releases = 0;
+	if (req->r_inode_drop)
+		releases += ceph_encode_inode_release(&p,
+		      req->r_inode ? req->r_inode : req->r_dentry->d_inode,
+		      mds, req->r_inode_drop, req->r_inode_unless, 0);
+	if (req->r_dentry_drop)
+		releases += ceph_encode_dentry_release(&p, req->r_dentry,
+		       mds, req->r_dentry_drop, req->r_dentry_unless);
+	if (req->r_old_dentry_drop)
+		releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
+		       mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+	if (req->r_old_inode_drop)
+		releases += ceph_encode_inode_release(&p,
+		      req->r_old_dentry->d_inode,
+		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
+	head->num_releases = cpu_to_le16(releases);
+
+	BUG_ON(p > end);
+	msg->front.iov_len = p - msg->front.iov_base;
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+	msg->pages = req->r_pages;
+	msg->nr_pages = req->r_num_pages;
+	msg->hdr.data_len = cpu_to_le32(req->r_data_len);
+	msg->hdr.data_off = cpu_to_le16(0);
+
+out_free2:
+	if (freepath2)
+		kfree((char *)path2);
+out_free1:
+	if (freepath1)
+		kfree((char *)path1);
+out:
+	return msg;
+}
+
+/*
+ * called under mdsc->mutex if error, under no mutex if
+ * success.
+ */
+static void complete_request(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_request *req)
+{
+	if (req->r_callback)
+		req->r_callback(mdsc, req);
+	else
+		complete(&req->r_completion);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static int __prepare_send_request(struct ceph_mds_client *mdsc,
+				  struct ceph_mds_request *req,
+				  int mds)
+{
+	struct ceph_mds_request_head *rhead;
+	struct ceph_msg *msg;
+	int flags = 0;
+
+	req->r_mds = mds;
+	req->r_attempts++;
+	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+
+	if (req->r_request) {
+		ceph_msg_put(req->r_request);
+		req->r_request = NULL;
+	}
+	msg = create_request_message(mdsc, req, mds);
+	if (IS_ERR(msg)) {
+		req->r_reply = ERR_PTR(PTR_ERR(msg));
+		complete_request(mdsc, req);
+		return -PTR_ERR(msg);
+	}
+	req->r_request = msg;
+
+	rhead = msg->front.iov_base;
+	rhead->tid = cpu_to_le64(req->r_tid);
+	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+	if (req->r_got_unsafe)
+		flags |= CEPH_MDS_FLAG_REPLAY;
+	if (req->r_locked_dir)
+		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
+	rhead->flags = cpu_to_le32(flags);
+	rhead->num_fwd = req->r_num_fwd;
+	rhead->num_retry = req->r_attempts - 1;
+
+	dout(" r_locked_dir = %p\n", req->r_locked_dir);
+
+	if (req->r_target_inode && req->r_got_unsafe)
+		rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+	else
+		rhead->ino = 0;
+	return 0;
+}
+
+/*
+ * send request, or put it on the appropriate wait list.
+ */
+static int __do_request(struct ceph_mds_client *mdsc,
+			struct ceph_mds_request *req)
+{
+	struct ceph_mds_session *session = NULL;
+	int mds = -1;
+	int err = -EAGAIN;
+
+	if (req->r_reply)
+		goto out;
+
+	if (req->r_timeout &&
+	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
+		dout("do_request timed out\n");
+		err = -EIO;
+		goto finish;
+	}
+
+	mds = __choose_mds(mdsc, req);
+	if (mds < 0 ||
+	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+		dout("do_request no mds or not active, waiting for map\n");
+		list_add(&req->r_wait, &mdsc->waiting_for_map);
+		goto out;
+	}
+
+	/* get, open session */
+	session = __ceph_lookup_mds_session(mdsc, mds);
+	if (!session)
+		session = register_session(mdsc, mds);
+	dout("do_request mds%d session %p state %s\n", mds, session,
+	     session_state_name(session->s_state));
+	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
+	    session->s_state != CEPH_MDS_SESSION_HUNG) {
+		if (session->s_state == CEPH_MDS_SESSION_NEW ||
+		    session->s_state == CEPH_MDS_SESSION_CLOSING)
+			__open_session(mdsc, session);
+		list_add(&req->r_wait, &session->s_waiting);
+		goto out_session;
+	}
+
+	/* send request */
+	req->r_session = get_session(session);
+	req->r_resend_mds = -1;   /* forget any previous mds hint */
+
+	if (req->r_request_started == 0)   /* note request start time */
+		req->r_request_started = jiffies;
+
+	err = __prepare_send_request(mdsc, req, mds);
+	if (!err) {
+		ceph_msg_get(req->r_request);
+		ceph_con_send(&session->s_con, req->r_request);
+	}
+
+out_session:
+	ceph_put_mds_session(session);
+out:
+	return err;
+
+finish:
+	req->r_reply = ERR_PTR(err);
+	complete_request(mdsc, req);
+	goto out;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __wake_requests(struct ceph_mds_client *mdsc,
+			    struct list_head *head)
+{
+	struct ceph_mds_request *req, *nreq;
+
+	list_for_each_entry_safe(req, nreq, head, r_wait) {
+		list_del_init(&req->r_wait);
+		__do_request(mdsc, req);
+	}
+}
+
+/*
+ * Wake up threads with requests pending for @mds, so that they can
+ * resubmit their requests to a possibly different mds.  If @all is set,
+ * wake up if their requests has been forwarded to @mds, too.
+ */
+static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
+{
+	struct ceph_mds_request *reqs[10];
+	u64 nexttid = 0;
+	int i, got;
+
+	dout("kick_requests mds%d\n", mds);
+	while (nexttid <= mdsc->last_tid) {
+		got = radix_tree_gang_lookup(&mdsc->request_tree,
+					     (void **)&reqs, nexttid, 10);
+		if (got == 0)
+			break;
+		nexttid = reqs[got-1]->r_tid + 1;
+		for (i = 0; i < got; i++) {
+			if (reqs[i]->r_got_unsafe)
+				continue;
+			if (reqs[i]->r_session &&
+			    reqs[i]->r_session->s_mds == mds) {
+				dout(" kicking tid %llu\n", reqs[i]->r_tid);
+				put_request_session(reqs[i]);
+				__do_request(mdsc, reqs[i]);
+			}
+		}
+	}
+}
+
+void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+			      struct ceph_mds_request *req)
+{
+	dout("submit_request on %p\n", req);
+	mutex_lock(&mdsc->mutex);
+	__register_request(mdsc, req, NULL);
+	__do_request(mdsc, req);
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Synchrously perform an mds request.  Take care of all of the
+ * session setup, forwarding, retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+			 struct inode *dir,
+			 struct ceph_mds_request *req)
+{
+	int err;
+
+	dout("do_request on %p\n", req);
+
+	/* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+	if (req->r_inode)
+		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+	if (req->r_locked_dir)
+		ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+	if (req->r_old_dentry)
+		ceph_get_cap_refs(
+			ceph_inode(req->r_old_dentry->d_parent->d_inode),
+			CEPH_CAP_PIN);
+
+	/* issue */
+	mutex_lock(&mdsc->mutex);
+	__register_request(mdsc, req, dir);
+	__do_request(mdsc, req);
+
+	/* wait */
+	if (!req->r_reply) {
+		mutex_unlock(&mdsc->mutex);
+		if (req->r_timeout) {
+			err = wait_for_completion_timeout(&req->r_completion,
+							  req->r_timeout);
+			if (err > 0)
+				err = 0;
+			else if (err == 0)
+				req->r_reply = ERR_PTR(-EIO);
+		} else {
+			wait_for_completion(&req->r_completion);
+		}
+		mutex_lock(&mdsc->mutex);
+	}
+
+	if (IS_ERR(req->r_reply)) {
+		err = PTR_ERR(req->r_reply);
+		req->r_reply = NULL;
+
+		/* clean up */
+		__unregister_request(mdsc, req);
+		if (!list_empty(&req->r_unsafe_item))
+			list_del_init(&req->r_unsafe_item);
+		complete(&req->r_safe_completion);
+	} else if (req->r_err) {
+		err = req->r_err;
+	} else {
+		err = le32_to_cpu(req->r_reply_info.head->result);
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	dout("do_request %p done, result %d\n", req, err);
+	return err;
+}
+
+/*
+ * Handle mds reply.
+ *
+ * We take the session mutex and parse and process the reply immediately.
+ * This preserves the logical ordering of replies, capabilities, etc., sent
+ * by the MDS as they are applied to our local cache.
+ */
+static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_mds_reply_head *head = msg->front.iov_base;
+	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
+	u64 tid;
+	int err, result;
+	int mds;
+
+	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+		return;
+	if (msg->front.iov_len < sizeof(*head)) {
+		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+		return;
+	}
+
+	/* get request, session */
+	tid = le64_to_cpu(head->tid);
+	mutex_lock(&mdsc->mutex);
+	req = __lookup_request(mdsc, tid);
+	if (!req) {
+		dout("handle_reply on unknown tid %llu\n", tid);
+		mutex_unlock(&mdsc->mutex);
+		return;
+	}
+	dout("handle_reply %p\n", req);
+	mds = le64_to_cpu(msg->hdr.src.name.num);
+
+	/* correct session? */
+	if (!req->r_session && req->r_session != session) {
+		pr_err("mdsc_handle_reply got %llu on session mds%d"
+		       " not mds%d\n", tid, session->s_mds,
+		       req->r_session ? req->r_session->s_mds : -1);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+
+	/* dup? */
+	if ((req->r_got_unsafe && !head->safe) ||
+	    (req->r_got_safe && head->safe)) {
+		pr_warning("got a dup %s reply on %llu from mds%d\n",
+			   head->safe ? "safe" : "unsafe", tid, mds);
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+
+	result = le32_to_cpu(head->result);
+
+	/*
+	 * Tolerate 2 consecutive ESTALEs from the same mds.
+	 * FIXME: we should be looking at the cap migrate_seq.
+	 */
+	if (result == -ESTALE) {
+		req->r_direct_mode = USE_AUTH_MDS;
+		req->r_num_stale++;
+		if (req->r_num_stale <= 2) {
+			__do_request(mdsc, req);
+			mutex_unlock(&mdsc->mutex);
+			goto out;
+		}
+	} else {
+		req->r_num_stale = 0;
+	}
+
+	if (head->safe) {
+		req->r_got_safe = true;
+		__unregister_request(mdsc, req);
+		complete(&req->r_safe_completion);
+
+		if (req->r_got_unsafe) {
+			/*
+			 * We already handled the unsafe response, now do the
+			 * cleanup.  No need to examine the response; the MDS
+			 * doesn't include any result info in the safe
+			 * response.  And even if it did, there is nothing
+			 * useful we could do with a revised return value.
+			 */
+			dout("got safe reply %llu, mds%d\n", tid, mds);
+			list_del_init(&req->r_unsafe_item);
+
+			/* last unsafe request during umount? */
+			if (mdsc->stopping && !__get_oldest_tid(mdsc))
+				complete(&mdsc->safe_umount_waiters);
+			mutex_unlock(&mdsc->mutex);
+			goto out;
+		}
+	}
+
+	BUG_ON(req->r_reply);
+
+	if (!head->safe) {
+		req->r_got_unsafe = true;
+		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+	}
+
+	dout("handle_reply tid %lld result %d\n", tid, result);
+	rinfo = &req->r_reply_info;
+	err = parse_reply_info(msg, rinfo);
+	mutex_unlock(&mdsc->mutex);
+
+	mutex_lock(&session->s_mutex);
+	if (err < 0) {
+		pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+		goto out_err;
+	}
+
+	/* snap trace */
+	if (rinfo->snapblob_len) {
+		down_write(&mdsc->snap_rwsem);
+		ceph_update_snap_trace(mdsc, rinfo->snapblob,
+			       rinfo->snapblob + rinfo->snapblob_len,
+			       le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+		downgrade_write(&mdsc->snap_rwsem);
+	} else {
+		down_read(&mdsc->snap_rwsem);
+	}
+
+	/* insert trace into our cache */
+	err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+	if (err == 0) {
+		if (result == 0 && rinfo->dir_nr)
+			ceph_readdir_prepopulate(req, req->r_session);
+		ceph_unreserve_caps(&req->r_caps_reservation);
+	}
+
+	up_read(&mdsc->snap_rwsem);
+out_err:
+	if (err) {
+		req->r_err = err;
+	} else {
+		req->r_reply = msg;
+		ceph_msg_get(msg);
+	}
+
+	add_cap_releases(mdsc, req->r_session, -1);
+	mutex_unlock(&session->s_mutex);
+
+	/* kick calling process */
+	complete_request(mdsc, req);
+out:
+	ceph_mdsc_put_request(req);
+	return;
+}
+
+
+
+/*
+ * handle mds notification that our request has been forwarded.
+ */
+static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+	struct ceph_mds_request *req;
+	u64 tid;
+	u32 next_mds;
+	u32 fwd_seq;
+	u8 must_resend;
+	int err = -EINVAL;
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+	int from_mds, state;
+
+	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+		goto bad;
+	from_mds = le64_to_cpu(msg->hdr.src.name.num);
+
+	ceph_decode_need(&p, end, sizeof(u64)+2*sizeof(u32), bad);
+	ceph_decode_64(&p, tid);
+	ceph_decode_32(&p, next_mds);
+	ceph_decode_32(&p, fwd_seq);
+	ceph_decode_8(&p, must_resend);
+
+	WARN_ON(must_resend);  /* shouldn't happen. */
+
+	mutex_lock(&mdsc->mutex);
+	req = __lookup_request(mdsc, tid);
+	if (!req) {
+		dout("forward %llu dne\n", tid);
+		goto out;  /* dup reply? */
+	}
+
+	state = mdsc->sessions[next_mds]->s_state;
+	if (fwd_seq <= req->r_num_fwd) {
+		dout("forward %llu to mds%d - old seq %d <= %d\n",
+		     tid, next_mds, req->r_num_fwd, fwd_seq);
+	} else {
+		/* resend. forward race not possible; mds would drop */
+		dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
+		req->r_num_fwd = fwd_seq;
+		req->r_resend_mds = next_mds;
+		put_request_session(req);
+		__do_request(mdsc, req);
+	}
+	ceph_mdsc_put_request(req);
+out:
+	mutex_unlock(&mdsc->mutex);
+	return;
+
+bad:
+	pr_err("mdsc_handle_forward decode error err=%d\n", err);
+}
+
+/*
+ * handle a mds session control message
+ */
+static void handle_session(struct ceph_mds_session *session,
+			   struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	u32 op;
+	u64 seq;
+	int mds;
+	struct ceph_mds_session_head *h = msg->front.iov_base;
+	int wake = 0;
+
+	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+		return;
+	mds = le64_to_cpu(msg->hdr.src.name.num);
+
+	/* decode */
+	if (msg->front.iov_len != sizeof(*h))
+		goto bad;
+	op = le32_to_cpu(h->op);
+	seq = le64_to_cpu(h->seq);
+
+	mutex_lock(&mdsc->mutex);
+	/* FIXME: this ttl calculation is generous */
+	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
+	mutex_unlock(&mdsc->mutex);
+
+	mutex_lock(&session->s_mutex);
+
+	dout("handle_session mds%d %s %p state %s seq %llu\n",
+	     mds, ceph_session_op_name(op), session,
+	     session_state_name(session->s_state), seq);
+
+	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
+		session->s_state = CEPH_MDS_SESSION_OPEN;
+		pr_info("mds%d came back\n", session->s_mds);
+	}
+
+	switch (op) {
+	case CEPH_SESSION_OPEN:
+		session->s_state = CEPH_MDS_SESSION_OPEN;
+		renewed_caps(mdsc, session, 0);
+		wake = 1;
+		if (mdsc->stopping)
+			__close_session(mdsc, session);
+		break;
+
+	case CEPH_SESSION_RENEWCAPS:
+		if (session->s_renew_seq == seq)
+			renewed_caps(mdsc, session, 1);
+		break;
+
+	case CEPH_SESSION_CLOSE:
+		unregister_session(mdsc, mds);
+		remove_session_caps(session);
+		wake = 1; /* for good measure */
+		complete(&mdsc->session_close_waiters);
+		kick_requests(mdsc, mds, 0);      /* cur only */
+		break;
+
+	case CEPH_SESSION_STALE:
+		pr_info("mds%d caps went stale, renewing\n",
+			session->s_mds);
+		spin_lock(&session->s_cap_lock);
+		session->s_cap_gen++;
+		session->s_cap_ttl = 0;
+		spin_unlock(&session->s_cap_lock);
+		send_renew_caps(mdsc, session);
+		break;
+
+	case CEPH_SESSION_RECALL_STATE:
+		trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+		break;
+
+	default:
+		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+		WARN_ON(1);
+	}
+
+	mutex_unlock(&session->s_mutex);
+	if (wake) {
+		mutex_lock(&mdsc->mutex);
+		__wake_requests(mdsc, &session->s_waiting);
+		mutex_unlock(&mdsc->mutex);
+	}
+	return;
+
+bad:
+	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
+	       (int)msg->front.iov_len);
+	return;
+}
+
+
+/*
+ * called under session->mutex.
+ */
+static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
+{
+	struct ceph_mds_request *req, *nreq;
+	int err;
+
+	dout("replay_unsafe_requests mds%d\n", session->s_mds);
+
+	mutex_lock(&mdsc->mutex);
+	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
+		err = __prepare_send_request(mdsc, req, session->s_mds);
+		if (!err) {
+			ceph_msg_get(req->r_request);
+			ceph_con_send(&session->s_con, req->r_request);
+		}
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Encode information about a cap for a reconnect with the MDS.
+ */
+struct encode_caps_data {
+	void **pp;
+	void *end;
+	int *num_caps;
+};
+
+static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+			  void *arg)
+{
+	struct ceph_mds_cap_reconnect *rec;
+	struct ceph_inode_info *ci;
+	struct encode_caps_data *data = (struct encode_caps_data *)arg;
+	void *p = *(data->pp);
+	void *end = data->end;
+	char *path;
+	int pathlen, err;
+	u64 pathbase;
+	struct dentry *dentry;
+
+	ci = cap->ci;
+
+	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+	     inode, ceph_vinop(inode), cap, cap->cap_id,
+	     ceph_cap_string(cap->issued));
+	ceph_decode_need(&p, end, sizeof(u64), needmore);
+	ceph_encode_64(&p, ceph_ino(inode));
+
+	dentry = d_find_alias(inode);
+	if (dentry) {
+		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			BUG_ON(err);
+		}
+	} else {
+		path = NULL;
+		pathlen = 0;
+	}
+	ceph_decode_need(&p, end, pathlen+4, needmore);
+	ceph_encode_string(&p, end, path, pathlen);
+
+	ceph_decode_need(&p, end, sizeof(*rec), needmore);
+	rec = p;
+	p += sizeof(*rec);
+	BUG_ON(p > end);
+	spin_lock(&inode->i_lock);
+	cap->seq = 0;        /* reset cap seq */
+	cap->issue_seq = 0;  /* and issue_seq */
+	rec->cap_id = cpu_to_le64(cap->cap_id);
+	rec->pathbase = cpu_to_le64(pathbase);
+	rec->wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+	rec->issued = cpu_to_le32(cap->issued);
+	rec->size = cpu_to_le64(inode->i_size);
+	ceph_encode_timespec(&rec->mtime, &inode->i_mtime);
+	ceph_encode_timespec(&rec->atime, &inode->i_atime);
+	rec->snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+	spin_unlock(&inode->i_lock);
+
+	kfree(path);
+	dput(dentry);
+	(*data->num_caps)++;
+	*(data->pp) = p;
+	return 0;
+needmore:
+	return -ENOSPC;
+}
+
+
+/*
+ * If an MDS fails and recovers, clients need to reconnect in order to
+ * reestablish shared state.  This includes all caps issued through
+ * this session _and_ the snap_realm hierarchy.  Because it's not
+ * clear which snap realms the mds cares about, we send everything we
+ * know about.. that ensures we'll then get any new info the
+ * recovering MDS might have.
+ *
+ * This is a relatively heavyweight operation, but it's rare.
+ *
+ * called with mdsc->mutex held.
+ */
+static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
+{
+	struct ceph_mds_session *session;
+	struct ceph_msg *reply;
+	int newlen, len = 4 + 1;
+	void *p, *end;
+	int err;
+	int num_caps, num_realms = 0;
+	int got;
+	u64 next_snap_ino = 0;
+	__le32 *pnum_caps, *pnum_realms;
+	struct encode_caps_data iter_args;
+
+	pr_info("reconnect to recovering mds%d\n", mds);
+
+	/* find session */
+	session = __ceph_lookup_mds_session(mdsc, mds);
+	mutex_unlock(&mdsc->mutex);    /* drop lock for duration */
+
+	if (session) {
+		mutex_lock(&session->s_mutex);
+
+		session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+		session->s_seq = 0;
+
+		ceph_con_open(&session->s_con,
+			      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+		/* replay unsafe requests */
+		replay_unsafe_requests(mdsc, session);
+
+		/* estimate needed space */
+		len += session->s_nr_caps *
+			(100+sizeof(struct ceph_mds_cap_reconnect));
+		pr_info("estimating i need %d bytes for %d caps\n",
+		     len, session->s_nr_caps);
+	} else {
+		dout("no session for mds%d, will send short reconnect\n",
+		     mds);
+	}
+
+	down_read(&mdsc->snap_rwsem);
+
+retry:
+	/* build reply */
+	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, len, 0, 0, NULL);
+	if (IS_ERR(reply)) {
+		err = PTR_ERR(reply);
+		pr_err("send_mds_reconnect ENOMEM on %d for mds%d\n",
+		       len, mds);
+		goto out;
+	}
+	p = reply->front.iov_base;
+	end = p + len;
+
+	if (!session) {
+		ceph_encode_8(&p, 1); /* session was closed */
+		ceph_encode_32(&p, 0);
+		goto send;
+	}
+	dout("session %p state %s\n", session,
+	     session_state_name(session->s_state));
+
+	/* traverse this session's caps */
+	ceph_encode_8(&p, 0);
+	pnum_caps = p;
+	ceph_encode_32(&p, session->s_nr_caps);
+	num_caps = 0;
+
+	iter_args.pp = &p;
+	iter_args.end = end;
+	iter_args.num_caps = &num_caps;
+	err = iterate_session_caps(session, encode_caps_cb, &iter_args);
+	if (err == -ENOSPC)
+		goto needmore;
+	if (err < 0)
+		goto out;
+	*pnum_caps = cpu_to_le32(num_caps);
+
+	/*
+	 * snaprealms.  we provide mds with the ino, seq (version), and
+	 * parent for all of our realms.  If the mds has any newer info,
+	 * it will tell us.
+	 */
+	next_snap_ino = 0;
+	/* save some space for the snaprealm count */
+	pnum_realms = p;
+	ceph_decode_need(&p, end, sizeof(*pnum_realms), needmore);
+	p += sizeof(*pnum_realms);
+	num_realms = 0;
+	while (1) {
+		struct ceph_snap_realm *realm;
+		struct ceph_mds_snaprealm_reconnect *sr_rec;
+		got = radix_tree_gang_lookup(&mdsc->snap_realms,
+					     (void **)&realm, next_snap_ino, 1);
+		if (!got)
+			break;
+
+		dout(" adding snap realm %llx seq %lld parent %llx\n",
+		     realm->ino, realm->seq, realm->parent_ino);
+		ceph_decode_need(&p, end, sizeof(*sr_rec), needmore);
+		sr_rec = p;
+		sr_rec->ino = cpu_to_le64(realm->ino);
+		sr_rec->seq = cpu_to_le64(realm->seq);
+		sr_rec->parent = cpu_to_le64(realm->parent_ino);
+		p += sizeof(*sr_rec);
+		num_realms++;
+		next_snap_ino = realm->ino + 1;
+	}
+	*pnum_realms = cpu_to_le32(num_realms);
+
+send:
+	reply->front.iov_len = p - reply->front.iov_base;
+	reply->hdr.front_len = cpu_to_le32(reply->front.iov_len);
+	dout("final len was %u (guessed %d)\n",
+	     (unsigned)reply->front.iov_len, len);
+	ceph_con_send(&session->s_con, reply);
+
+	if (session) {
+		session->s_state = CEPH_MDS_SESSION_OPEN;
+		__wake_requests(mdsc, &session->s_waiting);
+	}
+
+out:
+	up_read(&mdsc->snap_rwsem);
+	if (session) {
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+	}
+	mutex_lock(&mdsc->mutex);
+	return;
+
+needmore:
+	/*
+	 * we need a larger buffer.  this doesn't very accurately
+	 * factor in snap realms, but it's safe.
+	 */
+	num_caps += num_realms;
+	newlen = len * ((100 * (session->s_nr_caps+3)) / (num_caps + 1)) / 100;
+	pr_info("i guessed %d, and did %d of %d caps, retrying with %d\n",
+	     len, num_caps, session->s_nr_caps, newlen);
+	len = newlen;
+	ceph_msg_put(reply);
+	goto retry;
+}
+
+
+/*
+ * compare old and new mdsmaps, kicking requests
+ * and closing out old connections as necessary
+ *
+ * called under mdsc->mutex.
+ */
+static void check_new_map(struct ceph_mds_client *mdsc,
+			  struct ceph_mdsmap *newmap,
+			  struct ceph_mdsmap *oldmap)
+{
+	int i;
+	int oldstate, newstate;
+	struct ceph_mds_session *s;
+
+	dout("check_new_map new %u old %u\n",
+	     newmap->m_epoch, oldmap->m_epoch);
+
+	for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
+		if (mdsc->sessions[i] == NULL)
+			continue;
+		s = mdsc->sessions[i];
+		oldstate = ceph_mdsmap_get_state(oldmap, i);
+		newstate = ceph_mdsmap_get_state(newmap, i);
+
+		dout("check_new_map mds%d state %s -> %s (session %s)\n",
+		     i, ceph_mds_state_name(oldstate),
+		     ceph_mds_state_name(newstate),
+		     session_state_name(s->s_state));
+
+		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+			   ceph_mdsmap_get_addr(newmap, i),
+			   sizeof(struct ceph_entity_addr))) {
+			if (s->s_state == CEPH_MDS_SESSION_OPENING) {
+				/* the session never opened, just close it
+				 * out now */
+				__wake_requests(mdsc, &s->s_waiting);
+				unregister_session(mdsc, i);
+			} else {
+				/* just close it */
+				mutex_unlock(&mdsc->mutex);
+				mutex_lock(&s->s_mutex);
+				mutex_lock(&mdsc->mutex);
+				ceph_con_close(&s->s_con);
+				mutex_unlock(&s->s_mutex);
+				s->s_state = CEPH_MDS_SESSION_RESTARTING;
+			}
+
+			/* kick any requests waiting on the recovering mds */
+			kick_requests(mdsc, i, 1);
+		} else if (oldstate == newstate) {
+			continue;  /* nothing new with this mds */
+		}
+
+		/*
+		 * send reconnect?
+		 */
+		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
+		    newstate >= CEPH_MDS_STATE_RECONNECT)
+			send_mds_reconnect(mdsc, i);
+
+		/*
+		 * kick requests on any mds that has gone active.
+		 *
+		 * kick requests on cur or forwarder: we may have sent
+		 * the request to mds1, mds1 told us it forwarded it
+		 * to mds2, but then we learn mds1 failed and can't be
+		 * sure it successfully forwarded our request before
+		 * it died.
+		 */
+		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
+		    newstate >= CEPH_MDS_STATE_ACTIVE) {
+			kick_requests(mdsc, i, 1);
+			ceph_kick_flushing_caps(mdsc, s);
+		}
+	}
+}
+
+
+
+/*
+ * leases
+ */
+
+/*
+ * caller must hold session s_mutex, dentry->d_lock
+ */
+void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
+{
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+	ceph_put_mds_session(di->lease_session);
+	di->lease_session = NULL;
+}
+
+static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+	struct super_block *sb = mdsc->client->sb;
+	struct inode *inode;
+	struct ceph_mds_session *session;
+	struct ceph_inode_info *ci;
+	struct dentry *parent, *dentry;
+	struct ceph_dentry_info *di;
+	int mds;
+	struct ceph_mds_lease *h = msg->front.iov_base;
+	struct ceph_vino vino;
+	int mask;
+	struct qstr dname;
+	int release = 0;
+
+	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+		return;
+	mds = le64_to_cpu(msg->hdr.src.name.num);
+	dout("handle_lease from mds%d\n", mds);
+
+	/* decode */
+	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
+		goto bad;
+	vino.ino = le64_to_cpu(h->ino);
+	vino.snap = CEPH_NOSNAP;
+	mask = le16_to_cpu(h->mask);
+	dname.name = (void *)h + sizeof(*h) + sizeof(u32);
+	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
+	if (dname.len != get_unaligned_le32(h+1))
+		goto bad;
+
+	/* find session */
+	mutex_lock(&mdsc->mutex);
+	session = __ceph_lookup_mds_session(mdsc, mds);
+	mutex_unlock(&mdsc->mutex);
+	if (!session) {
+		pr_err("handle_lease got lease but no session mds%d\n", mds);
+		return;
+	}
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+
+	/* lookup inode */
+	inode = ceph_find_inode(sb, vino);
+	dout("handle_lease '%s', mask %d, ino %llx %p\n",
+	     ceph_lease_op_name(h->action), mask, vino.ino, inode);
+	if (inode == NULL) {
+		dout("handle_lease no inode %llx\n", vino.ino);
+		goto release;
+	}
+	ci = ceph_inode(inode);
+
+	/* dentry */
+	parent = d_find_alias(inode);
+	if (!parent) {
+		dout("no parent dentry on inode %p\n", inode);
+		WARN_ON(1);
+		goto release;  /* hrm... */
+	}
+	dname.hash = full_name_hash(dname.name, dname.len);
+	dentry = d_lookup(parent, &dname);
+	dput(parent);
+	if (!dentry)
+		goto release;
+
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	switch (h->action) {
+	case CEPH_MDS_LEASE_REVOKE:
+		if (di && di->lease_session == session) {
+			h->seq = cpu_to_le32(di->lease_seq);
+			__ceph_mdsc_drop_dentry_lease(dentry);
+		}
+		release = 1;
+		break;
+
+	case CEPH_MDS_LEASE_RENEW:
+		if (di && di->lease_session == session &&
+		    di->lease_gen == session->s_cap_gen &&
+		    di->lease_renew_from &&
+		    di->lease_renew_after == 0) {
+			unsigned long duration =
+				le32_to_cpu(h->duration_ms) * HZ / 1000;
+
+			di->lease_seq = le32_to_cpu(h->seq);
+			dentry->d_time = di->lease_renew_from + duration;
+			di->lease_renew_after = di->lease_renew_from +
+				(duration >> 1);
+			di->lease_renew_from = 0;
+		}
+		break;
+	}
+	spin_unlock(&dentry->d_lock);
+	dput(dentry);
+
+	if (!release)
+		goto out;
+
+release:
+	/* let's just reuse the same message */
+	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
+	ceph_msg_get(msg);
+	ceph_con_send(&session->s_con, msg);
+
+out:
+	iput(inode);
+	mutex_unlock(&session->s_mutex);
+	ceph_put_mds_session(session);
+	return;
+
+bad:
+	pr_err("corrupt lease message\n");
+}
+
+void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+			      struct inode *inode,
+			      struct dentry *dentry, char action,
+			      u32 seq)
+{
+	struct ceph_msg *msg;
+	struct ceph_mds_lease *lease;
+	int len = sizeof(*lease) + sizeof(u32);
+	int dnamelen = 0;
+
+	dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
+	     inode, dentry, ceph_lease_op_name(action), session->s_mds);
+	dnamelen = dentry->d_name.len;
+	len += dnamelen;
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
+	if (IS_ERR(msg))
+		return;
+	lease = msg->front.iov_base;
+	lease->action = action;
+	lease->mask = cpu_to_le16(CEPH_LOCK_DN);
+	lease->ino = cpu_to_le64(ceph_vino(inode).ino);
+	lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
+	lease->seq = cpu_to_le32(seq);
+	put_unaligned_le32(dnamelen, lease + 1);
+	memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
+
+	/*
+	 * if this is a preemptive lease RELEASE, no need to
+	 * flush request stream, since the actual request will
+	 * soon follow.
+	 */
+	msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
+
+	ceph_con_send(&session->s_con, msg);
+}
+
+/*
+ * Preemptively release a lease we expect to invalidate anyway.
+ * Pass @inode always, @dentry is optional.
+ */
+void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
+			     struct dentry *dentry, int mask)
+{
+	struct ceph_dentry_info *di;
+	struct ceph_mds_session *session;
+	u32 seq;
+
+	BUG_ON(inode == NULL);
+	BUG_ON(dentry == NULL);
+	BUG_ON(mask != CEPH_LOCK_DN);
+
+	/* is dentry lease valid? */
+	spin_lock(&dentry->d_lock);
+	di = ceph_dentry(dentry);
+	if (!di || !di->lease_session ||
+	    di->lease_session->s_mds < 0 ||
+	    di->lease_gen != di->lease_session->s_cap_gen ||
+	    !time_before(jiffies, dentry->d_time)) {
+		dout("lease_release inode %p dentry %p -- "
+		     "no lease on %d\n",
+		     inode, dentry, mask);
+		spin_unlock(&dentry->d_lock);
+		return;
+	}
+
+	/* we do have a lease on this dentry; note mds and seq */
+	session = ceph_get_mds_session(di->lease_session);
+	seq = di->lease_seq;
+	__ceph_mdsc_drop_dentry_lease(dentry);
+	spin_unlock(&dentry->d_lock);
+
+	dout("lease_release inode %p dentry %p mask %d to mds%d\n",
+	     inode, dentry, mask, session->s_mds);
+	ceph_mdsc_lease_send_msg(session, inode, dentry,
+				 CEPH_MDS_LEASE_RELEASE, seq);
+	ceph_put_mds_session(session);
+}
+
+/*
+ * drop all leases (and dentry refs) in preparation for umount
+ */
+static void drop_leases(struct ceph_mds_client *mdsc)
+{
+	int i;
+
+	dout("drop_leases\n");
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		mutex_unlock(&mdsc->mutex);
+		mutex_lock(&s->s_mutex);
+		mutex_unlock(&s->s_mutex);
+		ceph_put_mds_session(s);
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+}
+
+
+
+/*
+ * delayed work -- periodically trim expired leases, renew caps with mds
+ */
+static void schedule_delayed(struct ceph_mds_client *mdsc)
+{
+	int delay = 5;
+	unsigned hz = round_jiffies_relative(HZ * delay);
+	schedule_delayed_work(&mdsc->delayed_work, hz);
+}
+
+static void delayed_work(struct work_struct *work)
+{
+	int i;
+	struct ceph_mds_client *mdsc =
+		container_of(work, struct ceph_mds_client, delayed_work.work);
+	int renew_interval;
+	int renew_caps;
+
+	dout("mdsc delayed_work\n");
+	ceph_check_delayed_caps(mdsc, 0);
+
+	mutex_lock(&mdsc->mutex);
+	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
+	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
+				   mdsc->last_renew_caps);
+	if (renew_caps)
+		mdsc->last_renew_caps = jiffies;
+
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+		if (s == NULL)
+			continue;
+		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+			dout("resending session close request for mds%d\n",
+			     s->s_mds);
+			request_close_session(mdsc, s);
+			ceph_put_mds_session(s);
+			continue;
+		}
+		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+			if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+				s->s_state = CEPH_MDS_SESSION_HUNG;
+				pr_info("mds%d hung\n", s->s_mds);
+			}
+		}
+		if (s->s_state < CEPH_MDS_SESSION_OPEN) {
+			/* this mds is failed or recovering, just wait */
+			ceph_put_mds_session(s);
+			continue;
+		}
+		mutex_unlock(&mdsc->mutex);
+
+		mutex_lock(&s->s_mutex);
+		if (renew_caps)
+			send_renew_caps(mdsc, s);
+		else
+			ceph_con_keepalive(&s->s_con);
+		add_cap_releases(mdsc, s, -1);
+		send_cap_releases(mdsc, s);
+		mutex_unlock(&s->s_mutex);
+		ceph_put_mds_session(s);
+
+		mutex_lock(&mdsc->mutex);
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	schedule_delayed(mdsc);
+}
+
+
+void ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
+{
+	mdsc->client = client;
+	mutex_init(&mdsc->mutex);
+	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+	init_completion(&mdsc->safe_umount_waiters);
+	init_completion(&mdsc->session_close_waiters);
+	INIT_LIST_HEAD(&mdsc->waiting_for_map);
+	mdsc->sessions = NULL;
+	mdsc->max_sessions = 0;
+	mdsc->stopping = 0;
+	init_rwsem(&mdsc->snap_rwsem);
+	INIT_RADIX_TREE(&mdsc->snap_realms, GFP_NOFS);
+	INIT_LIST_HEAD(&mdsc->snap_empty);
+	spin_lock_init(&mdsc->snap_empty_lock);
+	mdsc->last_tid = 0;
+	INIT_RADIX_TREE(&mdsc->request_tree, GFP_NOFS);
+	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
+	mdsc->last_renew_caps = jiffies;
+	INIT_LIST_HEAD(&mdsc->cap_delay_list);
+	spin_lock_init(&mdsc->cap_delay_lock);
+	INIT_LIST_HEAD(&mdsc->snap_flush_list);
+	spin_lock_init(&mdsc->snap_flush_lock);
+	mdsc->cap_flush_seq = 0;
+	INIT_LIST_HEAD(&mdsc->cap_dirty);
+	mdsc->num_cap_flushing = 0;
+	spin_lock_init(&mdsc->cap_dirty_lock);
+	init_waitqueue_head(&mdsc->cap_flushing_wq);
+	spin_lock_init(&mdsc->dentry_lru_lock);
+	INIT_LIST_HEAD(&mdsc->dentry_lru);
+}
+
+/*
+ * Wait for safe replies on open mds requests.  If we time out, drop
+ * all requests from the tree to avoid dangling dentry refs.
+ */
+static void wait_requests(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_request *req;
+	struct ceph_client *client = mdsc->client;
+
+	mutex_lock(&mdsc->mutex);
+	if (__get_oldest_tid(mdsc)) {
+		mutex_unlock(&mdsc->mutex);
+		dout("wait_requests waiting for requests\n");
+		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
+				    client->mount_args.mount_timeout * HZ);
+		mutex_lock(&mdsc->mutex);
+
+		/* tear down remaining requests */
+		while (radix_tree_gang_lookup(&mdsc->request_tree,
+					      (void **)&req, 0, 1)) {
+			dout("wait_requests timed out on tid %llu\n",
+			     req->r_tid);
+			radix_tree_delete(&mdsc->request_tree, req->r_tid);
+			ceph_mdsc_put_request(req);
+		}
+	}
+	mutex_unlock(&mdsc->mutex);
+	dout("wait_requests done\n");
+}
+
+/*
+ * called before mount is ro, and before dentries are torn down.
+ * (hmm, does this still race with new lookups?)
+ */
+void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
+{
+	dout("pre_umount\n");
+	mdsc->stopping = 1;
+
+	drop_leases(mdsc);
+	ceph_check_delayed_caps(mdsc, 1);
+	wait_requests(mdsc);
+}
+
+/*
+ * wait for all write mds requests to flush.
+ */
+static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+	struct ceph_mds_request *req;
+	u64 next_tid = 0;
+	int got;
+
+	mutex_lock(&mdsc->mutex);
+	dout("wait_unsafe_requests want %lld\n", want_tid);
+	while (1) {
+		got = radix_tree_gang_lookup(&mdsc->request_tree, (void **)&req,
+					     next_tid, 1);
+		if (!got)
+			break;
+		if (req->r_tid > want_tid)
+			break;
+
+		next_tid = req->r_tid + 1;
+		if ((req->r_op & CEPH_MDS_OP_WRITE) == 0)
+			continue;  /* not a write op */
+
+		ceph_mdsc_get_request(req);
+		mutex_unlock(&mdsc->mutex);
+		dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
+		     req->r_tid, want_tid);
+		wait_for_completion(&req->r_safe_completion);
+		mutex_lock(&mdsc->mutex);
+		ceph_mdsc_put_request(req);
+	}
+	mutex_unlock(&mdsc->mutex);
+	dout("wait_unsafe_requests done\n");
+}
+
+void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+{
+	u64 want_tid, want_flush;
+
+	dout("sync\n");
+	mutex_lock(&mdsc->mutex);
+	want_tid = mdsc->last_tid;
+	want_flush = mdsc->cap_flush_seq;
+	mutex_unlock(&mdsc->mutex);
+	dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+
+	ceph_check_delayed_caps(mdsc, 1);
+
+	wait_unsafe_requests(mdsc, want_tid);
+	wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+}
+
+
+/*
+ * called after sb is ro.
+ */
+void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
+{
+	struct ceph_mds_session *session;
+	int i;
+	int n;
+	struct ceph_client *client = mdsc->client;
+	unsigned long started, timeout = client->mount_args.mount_timeout * HZ;
+
+	dout("close_sessions\n");
+
+	mutex_lock(&mdsc->mutex);
+
+	/* close sessions */
+	started = jiffies;
+	while (time_before(jiffies, started + timeout)) {
+		dout("closing sessions\n");
+		n = 0;
+		for (i = 0; i < mdsc->max_sessions; i++) {
+			session = __ceph_lookup_mds_session(mdsc, i);
+			if (!session)
+				continue;
+			mutex_unlock(&mdsc->mutex);
+			mutex_lock(&session->s_mutex);
+			__close_session(mdsc, session);
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			mutex_lock(&mdsc->mutex);
+			n++;
+		}
+		if (n == 0)
+			break;
+
+		if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
+			break;
+
+		dout("waiting for sessions to close\n");
+		mutex_unlock(&mdsc->mutex);
+		wait_for_completion_timeout(&mdsc->session_close_waiters,
+					    timeout);
+		mutex_lock(&mdsc->mutex);
+	}
+
+	/* tear down remaining sessions */
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		if (mdsc->sessions[i]) {
+			session = get_session(mdsc->sessions[i]);
+			unregister_session(mdsc, i);
+			mutex_unlock(&mdsc->mutex);
+			mutex_lock(&session->s_mutex);
+			remove_session_caps(session);
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			mutex_lock(&mdsc->mutex);
+		}
+	}
+
+	WARN_ON(!list_empty(&mdsc->cap_delay_list));
+
+	mutex_unlock(&mdsc->mutex);
+
+	ceph_cleanup_empty_realms(mdsc);
+
+	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+
+	dout("stopped\n");
+}
+
+void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+{
+	dout("stop\n");
+	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	if (mdsc->mdsmap)
+		ceph_mdsmap_destroy(mdsc->mdsmap);
+	kfree(mdsc->sessions);
+}
+
+
+/*
+ * handle mds map update.
+ */
+void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+	u32 epoch;
+	u32 maplen;
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+	struct ceph_mdsmap *newmap, *oldmap;
+	struct ceph_fsid fsid;
+	int err = -EINVAL;
+
+	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	if (ceph_fsid_compare(&fsid, &mdsc->client->monc.monmap->fsid)) {
+		pr_err("got mdsmap with wrong fsid\n");
+		return;
+	}
+	ceph_decode_32(&p, epoch);
+	ceph_decode_32(&p, maplen);
+	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+
+	/* do we need it? */
+	ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+	mutex_lock(&mdsc->mutex);
+	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
+		dout("handle_map epoch %u <= our %u\n",
+		     epoch, mdsc->mdsmap->m_epoch);
+		mutex_unlock(&mdsc->mutex);
+		return;
+	}
+
+	newmap = ceph_mdsmap_decode(&p, end);
+	if (IS_ERR(newmap)) {
+		err = PTR_ERR(newmap);
+		goto bad_unlock;
+	}
+
+	/* swap into place */
+	if (mdsc->mdsmap) {
+		oldmap = mdsc->mdsmap;
+		mdsc->mdsmap = newmap;
+		check_new_map(mdsc, newmap, oldmap);
+		ceph_mdsmap_destroy(oldmap);
+	} else {
+		mdsc->mdsmap = newmap;  /* first mds map */
+	}
+	mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+
+	__wake_requests(mdsc, &mdsc->waiting_for_map);
+
+	mutex_unlock(&mdsc->mutex);
+	schedule_delayed(mdsc);
+	return;
+
+bad_unlock:
+	mutex_unlock(&mdsc->mutex);
+bad:
+	pr_err("error decoding mdsmap %d\n", err);
+	return;
+}
+
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+
+	if (get_session(s)) {
+		dout("mdsc con_get %p %d -> %d\n", s,
+		     atomic_read(&s->s_ref) - 1, atomic_read(&s->s_ref));
+		return con;
+	}
+	dout("mdsc con_get %p FAIL\n", s);
+	return NULL;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+
+	dout("mdsc con_put %p %d -> %d\n", s, atomic_read(&s->s_ref),
+	     atomic_read(&s->s_ref) - 1);
+	ceph_put_mds_session(s);
+}
+
+/*
+ * if the client is unresponsive for long enough, the mds will kill
+ * the session entirely.
+ */
+static void peer_reset(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+
+	pr_err("mds%d gave us the boot.  IMPLEMENT RECONNECT.\n",
+	       s->s_mds);
+}
+
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	switch (type) {
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(mdsc, msg);
+		break;
+	case CEPH_MSG_CLIENT_SESSION:
+		handle_session(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_REPLY:
+		handle_reply(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+		handle_forward(mdsc, msg);
+		break;
+	case CEPH_MSG_CLIENT_CAPS:
+		ceph_handle_caps(s, msg);
+		break;
+	case CEPH_MSG_CLIENT_SNAP:
+		ceph_handle_snap(mdsc, msg);
+		break;
+	case CEPH_MSG_CLIENT_LEASE:
+		handle_lease(mdsc, msg);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+	ceph_msg_put(msg);
+}
+
+const static struct ceph_connection_operations mds_con_ops = {
+	.get = con_get,
+	.put = con_put,
+	.dispatch = dispatch,
+	.peer_reset = peer_reset,
+	.alloc_msg = ceph_alloc_msg,
+	.alloc_middle = ceph_alloc_middle,
+};
+
+
+
+
+/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..f566e9c84295
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,321 @@
+#ifndef _FS_CEPH_MDS_CLIENT_H
+#define _FS_CEPH_MDS_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+#include <linux/spinlock.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "mdsmap.h"
+
+/*
+ * Some lock dependencies:
+ *
+ * session->s_mutex
+ *         mdsc->mutex
+ *
+ *         mdsc->snap_rwsem
+ *
+ *         inode->i_lock
+ *                 mdsc->snap_flush_lock
+ *                 mdsc->cap_delay_lock
+ *
+ */
+
+struct ceph_client;
+struct ceph_cap;
+
+/*
+ * parsed info about a single inode.  pointers are into the encoded
+ * on-wire structures within the mds reply message payload.
+ */
+struct ceph_mds_reply_info_in {
+	struct ceph_mds_reply_inode *in;
+	u32 symlink_len;
+	char *symlink;
+	u32 xattr_len;
+	char *xattr_data;
+};
+
+/*
+ * parsed info about an mds reply, including information about the
+ * target inode and/or its parent directory and dentry, and directory
+ * contents (for readdir results).
+ */
+struct ceph_mds_reply_info_parsed {
+	struct ceph_mds_reply_head    *head;
+
+	struct ceph_mds_reply_info_in diri, targeti;
+	struct ceph_mds_reply_dirfrag *dirfrag;
+	char                          *dname;
+	u32                           dname_len;
+	struct ceph_mds_reply_lease   *dlease;
+
+	struct ceph_mds_reply_dirfrag *dir_dir;
+	int                           dir_nr;
+	char                          **dir_dname;
+	u32                           *dir_dname_len;
+	struct ceph_mds_reply_lease   **dir_dlease;
+	struct ceph_mds_reply_info_in *dir_in;
+	u8                            dir_complete, dir_end;
+
+	/* encoded blob describing snapshot contexts for certain
+	   operations (e.g., open) */
+	void *snapblob;
+	int snapblob_len;
+};
+
+
+/*
+ * cap releases are batched and sent to the MDS en masse.
+ */
+#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE -			\
+				sizeof(struct ceph_mds_cap_release)) /	\
+			       sizeof(struct ceph_mds_cap_item))
+
+
+/*
+ * state associated with each MDS<->client session
+ */
+enum {
+	CEPH_MDS_SESSION_NEW = 1,
+	CEPH_MDS_SESSION_OPENING = 2,
+	CEPH_MDS_SESSION_OPEN = 3,
+	CEPH_MDS_SESSION_HUNG = 4,
+	CEPH_MDS_SESSION_CLOSING = 5,
+	CEPH_MDS_SESSION_RESTARTING = 6,
+	CEPH_MDS_SESSION_RECONNECTING = 7,
+};
+
+struct ceph_mds_session {
+	struct ceph_mds_client *s_mdsc;
+	int               s_mds;
+	int               s_state;
+	unsigned long     s_ttl;      /* time until mds kills us */
+	u64               s_seq;      /* incoming msg seq # */
+	struct mutex      s_mutex;    /* serialize session messages */
+
+	struct ceph_connection s_con;
+
+	/* protected by s_cap_lock */
+	spinlock_t        s_cap_lock;
+	u32               s_cap_gen;  /* inc each time we get mds stale msg */
+	unsigned long     s_cap_ttl;  /* when session caps expire */
+	struct list_head  s_caps;     /* all caps issued by this session */
+	int               s_nr_caps, s_trim_caps;
+	int               s_num_cap_releases;
+	struct list_head  s_cap_releases; /* waiting cap_release messages */
+	struct list_head  s_cap_releases_done; /* ready to send */
+
+	/* protected by mutex */
+	struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
+	struct list_head  s_cap_snaps_flushing;
+	unsigned long     s_renew_requested; /* last time we sent a renew req */
+	u64               s_renew_seq;
+
+	atomic_t          s_ref;
+	struct list_head  s_waiting;  /* waiting requests */
+	struct list_head  s_unsafe;   /* unsafe requests */
+};
+
+/*
+ * modes of choosing which MDS to send a request to
+ */
+enum {
+	USE_ANY_MDS,
+	USE_RANDOM_MDS,
+	USE_AUTH_MDS,   /* prefer authoritative mds for this metadata item */
+};
+
+struct ceph_mds_request;
+struct ceph_mds_client;
+
+/*
+ * request completion callback
+ */
+typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
+					     struct ceph_mds_request *req);
+
+/*
+ * an in-flight mds request
+ */
+struct ceph_mds_request {
+	u64 r_tid;                   /* transaction id */
+
+	int r_op;                    /* mds op code */
+	int r_mds;
+
+	/* operation on what? */
+	struct inode *r_inode;              /* arg1 */
+	struct dentry *r_dentry;            /* arg1 */
+	struct dentry *r_old_dentry;        /* arg2: rename from or link from */
+	char *r_path1, *r_path2;
+	struct ceph_vino r_ino1, r_ino2;
+
+	struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+	struct inode *r_target_inode;       /* resulting inode */
+
+	union ceph_mds_request_args r_args;
+	int r_fmode;        /* file mode, if expecting cap */
+
+	/* for choosing which mds to send this request to */
+	int r_direct_mode;
+	u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
+	bool r_direct_is_hash;  /* true if r_direct_hash is valid */
+
+	/* data payload is used for xattr ops */
+	struct page **r_pages;
+	int r_num_pages;
+	int r_data_len;
+
+	/* what caps shall we drop? */
+	int r_inode_drop, r_inode_unless;
+	int r_dentry_drop, r_dentry_unless;
+	int r_old_dentry_drop, r_old_dentry_unless;
+	struct inode *r_old_inode;
+	int r_old_inode_drop, r_old_inode_unless;
+
+	struct ceph_msg  *r_request;  /* original request */
+	struct ceph_msg  *r_reply;
+	struct ceph_mds_reply_info_parsed r_reply_info;
+	int r_err;
+
+	unsigned long r_timeout;  /* optional.  jiffies */
+	unsigned long r_started;  /* start time to measure timeout against */
+	unsigned long r_request_started; /* start time for mds request only,
+					    used to measure lease durations */
+
+	/* link unsafe requests to parent directory, for fsync */
+	struct inode	*r_unsafe_dir;
+	struct list_head r_unsafe_dir_item;
+
+	struct ceph_mds_session *r_session;
+
+	int               r_attempts;   /* resend attempts */
+	int               r_num_fwd;    /* number of forward attempts */
+	int               r_num_stale;
+	int               r_resend_mds; /* mds to resend to next, if any*/
+
+	atomic_t          r_ref;
+	struct list_head  r_wait;
+	struct completion r_completion;
+	struct completion r_safe_completion;
+	ceph_mds_request_callback_t r_callback;
+	struct list_head  r_unsafe_item;  /* per-session unsafe list item */
+	bool		  r_got_unsafe, r_got_safe;
+
+	bool              r_did_prepopulate;
+	u32               r_readdir_offset;
+
+	struct ceph_cap_reservation r_caps_reservation;
+	int r_num_caps;
+};
+
+/*
+ * mds client state
+ */
+struct ceph_mds_client {
+	struct ceph_client      *client;
+	struct mutex            mutex;         /* all nested structures */
+
+	struct ceph_mdsmap      *mdsmap;
+	struct completion       safe_umount_waiters, session_close_waiters;
+	struct list_head        waiting_for_map;
+
+	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
+	int                     max_sessions;  /* len of s_mds_sessions */
+	int                     stopping;      /* true if shutting down */
+
+	/*
+	 * snap_rwsem will cover cap linkage into snaprealms, and
+	 * realm snap contexts.  (later, we can do per-realm snap
+	 * contexts locks..)  the empty list contains realms with no
+	 * references (implying they contain no inodes with caps) that
+	 * should be destroyed.
+	 */
+	struct rw_semaphore     snap_rwsem;
+	struct radix_tree_root  snap_realms;
+	struct list_head        snap_empty;
+	spinlock_t              snap_empty_lock;  /* protect snap_empty */
+
+	u64                    last_tid;      /* most recent mds request */
+	struct radix_tree_root request_tree;  /* pending mds requests */
+	struct delayed_work    delayed_work;  /* delayed work */
+	unsigned long    last_renew_caps;  /* last time we renewed our caps */
+	struct list_head cap_delay_list;   /* caps with delayed release */
+	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
+	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
+	spinlock_t       snap_flush_lock;
+
+	u64               cap_flush_seq;
+	struct list_head  cap_dirty;        /* inodes with dirty caps */
+	int               num_cap_flushing; /* # caps we are flushing */
+	spinlock_t        cap_dirty_lock;   /* protects above items */
+	wait_queue_head_t cap_flushing_wq;
+
+	struct dentry 	  *debugfs_file;
+
+	spinlock_t	  dentry_lru_lock;
+	struct list_head  dentry_lru;
+	int		  num_dentry;
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+extern struct ceph_mds_session *
+__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
+
+static inline struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s)
+{
+	atomic_inc(&s->s_ref);
+	return s;
+}
+
+extern void ceph_put_mds_session(struct ceph_mds_session *s);
+
+extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
+			     struct ceph_msg *msg, int mds);
+
+extern void ceph_mdsc_init(struct ceph_mds_client *mdsc,
+			   struct ceph_client *client);
+extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
+				    struct inode *inode,
+				    struct dentry *dn, int mask);
+
+extern struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
+extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+				     struct ceph_mds_request *req);
+extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+				struct inode *dir,
+				struct ceph_mds_request *req);
+static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
+{
+	atomic_inc(&req->r_ref);
+}
+extern void ceph_mdsc_put_request(struct ceph_mds_request *req);
+
+extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
+
+extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+				  int stop_on_nosnap);
+
+extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
+extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+				     struct inode *inode,
+				     struct dentry *dentry, char action,
+				     u32 seq);
+
+extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
+				 struct ceph_msg *msg);
+
+#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..15913cbeb289
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,166 @@
+#include "ceph_debug.h"
+
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "mdsmap.h"
+#include "messenger.h"
+#include "decode.h"
+
+#include "super.h"
+
+
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+	int n = 0;
+	int i;
+	char r;
+
+	/* count */
+	for (i = 0; i < m->m_max_mds; i++)
+		if (m->m_info[i].state > 0)
+			n++;
+	if (n == 0)
+		return -1;
+
+	/* pick */
+	get_random_bytes(&r, 1);
+	n = r % n;
+	i = 0;
+	for (i = 0; n > 0; i++, n--)
+		while (m->m_info[i].state <= 0)
+			i++;
+
+	return i;
+}
+
+/*
+ * Decode an MDS map
+ *
+ * Ignore any fields we don't care about (there are quite a few of
+ * them).
+ */
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+{
+	struct ceph_mdsmap *m;
+	int i, j, n;
+	int err = -EINVAL;
+	u16 version;
+
+	m = kzalloc(sizeof(*m), GFP_NOFS);
+	if (m == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ceph_decode_16_safe(p, end, version, bad);
+
+	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
+	ceph_decode_32(p, m->m_epoch);
+	ceph_decode_32(p, m->m_client_epoch);
+	ceph_decode_32(p, m->m_last_failure);
+	ceph_decode_32(p, m->m_root);
+	ceph_decode_32(p, m->m_session_timeout);
+	ceph_decode_32(p, m->m_session_autoclose);
+	ceph_decode_64(p, m->m_max_file_size);
+	ceph_decode_32(p, m->m_max_mds);
+
+	m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
+	if (m->m_info == NULL)
+		goto badmem;
+
+	/* pick out active nodes from mds_info (state > 0) */
+	ceph_decode_32(p, n);
+	for (i = 0; i < n; i++) {
+		u32 namelen;
+		s32 mds, inc, state;
+		u64 state_seq;
+		u8 infoversion;
+		struct ceph_entity_addr addr;
+		u32 num_export_targets;
+		void *pexport_targets = NULL;
+
+		ceph_decode_need(p, end, sizeof(addr) + 1 + sizeof(u32), bad);
+		*p += sizeof(addr);          /* skip addr key */
+		ceph_decode_8(p, infoversion);
+		ceph_decode_32(p, namelen);  /* skip mds name */
+		*p += namelen;
+
+		ceph_decode_need(p, end,
+				 5*sizeof(u32) + sizeof(u64) +
+				 sizeof(addr) + sizeof(struct ceph_timespec),
+				 bad);
+		ceph_decode_32(p, mds);
+		ceph_decode_32(p, inc);
+		ceph_decode_32(p, state);
+		ceph_decode_64(p, state_seq);
+		ceph_decode_copy(p, &addr, sizeof(addr));
+		*p += sizeof(struct ceph_timespec);
+		*p += sizeof(u32);
+		ceph_decode_32_safe(p, end, namelen, bad);
+		*p += sizeof(namelen);
+		if (infoversion >= 2) {
+			ceph_decode_32_safe(p, end, num_export_targets, bad);
+			pexport_targets = *p;
+			*p += sizeof(num_export_targets * sizeof(u32));
+		} else {
+			num_export_targets = 0;
+		}
+
+		dout("mdsmap_decode %d/%d mds%d.%d %s %s\n",
+		     i+1, n, mds, inc, pr_addr(&addr.in_addr),
+		     ceph_mds_state_name(state));
+		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
+			m->m_info[mds].state = state;
+			m->m_info[mds].addr = addr;
+			m->m_info[mds].num_export_targets = num_export_targets;
+			if (num_export_targets) {
+				m->m_info[mds].export_targets =
+					kcalloc(num_export_targets, sizeof(u32),
+						GFP_NOFS);
+				for (j = 0; j < num_export_targets; j++)
+					ceph_decode_32(&pexport_targets,
+					      m->m_info[mds].export_targets[j]);
+			} else {
+				m->m_info[mds].export_targets = NULL;
+			}
+		}
+	}
+
+	/* pg_pools */
+	ceph_decode_32_safe(p, end, n, bad);
+	m->m_num_data_pg_pools = n;
+	m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+	if (!m->m_data_pg_pools)
+		goto badmem;
+	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+	for (i = 0; i < n; i++)
+		ceph_decode_32(p, m->m_data_pg_pools[i]);
+	ceph_decode_32(p, m->m_cas_pg_pool);
+
+	/* ok, we don't care about the rest. */
+	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+	return m;
+
+badmem:
+	err = -ENOMEM;
+bad:
+	pr_err("corrupt mdsmap\n");
+	ceph_mdsmap_destroy(m);
+	return ERR_PTR(-EINVAL);
+}
+
+void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
+{
+	int i;
+
+	for (i = 0; i < m->m_max_mds; i++)
+		kfree(m->m_info[i].export_targets);
+	kfree(m->m_info);
+	kfree(m->m_data_pg_pools);
+	kfree(m);
+}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..d317308648fb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,53 @@
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include "types.h"
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually xcares about
+ */
+struct ceph_mds_info {
+	struct ceph_entity_addr addr;
+	s32 state;
+	int num_export_targets;
+	u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+	u32 m_epoch, m_client_epoch, m_last_failure;
+	u32 m_root;
+	u32 m_session_timeout;          /* seconds */
+	u32 m_session_autoclose;        /* seconds */
+	u64 m_max_file_size;
+	u32 m_max_mds;                  /* size of m_addr, m_state arrays */
+	struct ceph_mds_info *m_info;
+
+	/* which object pools file data can be stored in */
+	int m_num_data_pg_pools;
+	u32 *m_data_pg_pools;
+	u32 m_cas_pg_pool;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+	if (w >= m->m_max_mds)
+		return NULL;
+	return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+	BUG_ON(w < 0);
+	if (w >= m->m_max_mds)
+		return CEPH_MDS_STATE_DNE;
+	return m->m_info[w].state;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+
+#endif
-- 
cgit v1.2.2


From f24e9980eb860d8600cbe5ef3d2fd9295320d229 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:10 -0700
Subject: ceph: OSD client

The OSD client is responsible for reading and writing data from/to the
object storage pool.  This includes determining where objects are
stored in the cluster, and ensuring that requests are retried or
redirected in the event of a node failure or data migration.

If an OSD does not respond before a timeout expires, keepalive
messages are sent across the lossless, ordered communications channel
to ensure that any break in the TCP is discovered.  If the session
does reset, a reconnection is attempted and affected requests are
resent (by the message transport layer).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 1294 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/osd_client.h |  144 ++++++
 fs/ceph/osdmap.c     |  875 ++++++++++++++++++++++++++++++++++
 fs/ceph/osdmap.h     |  123 +++++
 4 files changed, 2436 insertions(+)
 create mode 100644 fs/ceph/osd_client.c
 create mode 100644 fs/ceph/osd_client.h
 create mode 100644 fs/ceph/osdmap.c
 create mode 100644 fs/ceph/osdmap.h

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..978593a4f466
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1294 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "super.h"
+#include "osd_client.h"
+#include "messenger.h"
+#include "decode.h"
+
+const static struct ceph_connection_operations osd_con_ops;
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices."  (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.)  Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly.  shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+			struct ceph_vino vino, struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	struct ceph_osd_op *op = (void *)(reqhead + 1);
+	u64 orig_len = *plen;
+	u64 objoff, objlen;    /* extent in object */
+	u64 bno;
+
+	reqhead->snapid = cpu_to_le64(vino.snap);
+
+	/* object extent? */
+	ceph_calc_file_object_mapping(layout, off, plen, &bno,
+				      &objoff, &objlen);
+	if (*plen < orig_len)
+		dout(" skipping last %llu, final file extent %llu~%llu\n",
+		     orig_len - *plen, off, *plen);
+
+	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
+	req->r_oid_len = strlen(req->r_oid);
+
+	op->extent.offset = cpu_to_le64(objoff);
+	op->extent.length = cpu_to_le64(objlen);
+	req->r_num_pages = calc_pages_for(off, *plen);
+
+	dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
+	     req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
+}
+
+
+/*
+ * requests
+ */
+void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+	dout("osdc put_request %p %d -> %d\n", req, atomic_read(&req->r_ref),
+	     atomic_read(&req->r_ref)-1);
+	BUG_ON(atomic_read(&req->r_ref) <= 0);
+	if (atomic_dec_and_test(&req->r_ref)) {
+		if (req->r_request)
+			ceph_msg_put(req->r_request);
+		if (req->r_reply)
+			ceph_msg_put(req->r_reply);
+		if (req->r_own_pages)
+			ceph_release_page_vector(req->r_pages,
+						 req->r_num_pages);
+		ceph_put_snap_context(req->r_snapc);
+		if (req->r_mempool)
+			mempool_free(req, req->r_osdc->req_mempool);
+		else
+			kfree(req);
+	}
+}
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately.  (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+					       struct ceph_file_layout *layout,
+					       struct ceph_vino vino,
+					       u64 off, u64 *plen,
+					       int opcode, int flags,
+					       struct ceph_snap_context *snapc,
+					       int do_sync,
+					       u32 truncate_seq,
+					       u64 truncate_size,
+					       struct timespec *mtime,
+					       bool use_mempool, int num_reply)
+{
+	struct ceph_osd_request *req;
+	struct ceph_msg *msg;
+	struct ceph_osd_request_head *head;
+	struct ceph_osd_op *op;
+	void *p;
+	int do_trunc = truncate_seq && (off + *plen > truncate_size);
+	int num_op = 1 + do_sync + do_trunc;
+	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+	int err, i;
+	u64 prevofs;
+
+	if (use_mempool) {
+		req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
+		memset(req, 0, sizeof(*req));
+	} else {
+		req = kzalloc(sizeof(*req), GFP_NOFS);
+	}
+	if (req == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	err = ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
+	if (err) {
+		ceph_osdc_put_request(req);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	req->r_osdc = osdc;
+	req->r_mempool = use_mempool;
+	atomic_set(&req->r_ref, 1);
+	init_completion(&req->r_completion);
+	init_completion(&req->r_safe_completion);
+	INIT_LIST_HEAD(&req->r_unsafe_item);
+	req->r_flags = flags;
+
+	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+	/* create message; allow space for oid */
+	msg_size += 40;
+	if (snapc)
+		msg_size += sizeof(u64) * snapc->num_snaps;
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
+	if (IS_ERR(msg)) {
+		ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
+		ceph_osdc_put_request(req);
+		return ERR_PTR(PTR_ERR(msg));
+	}
+	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+	memset(msg->front.iov_base, 0, msg->front.iov_len);
+	head = msg->front.iov_base;
+	op = (void *)(head + 1);
+	p = (void *)(op + num_op);
+
+	req->r_request = msg;
+	req->r_snapc = ceph_get_snap_context(snapc);
+
+	head->client_inc = cpu_to_le32(1); /* always, for now. */
+	head->flags = cpu_to_le32(flags);
+	if (flags & CEPH_OSD_FLAG_WRITE)
+		ceph_encode_timespec(&head->mtime, mtime);
+	head->num_ops = cpu_to_le16(num_op);
+	op->op = cpu_to_le16(opcode);
+
+	/* calculate max write size */
+	calc_layout(osdc, vino, layout, off, plen, req);
+	req->r_file_layout = *layout;  /* keep a copy */
+
+	if (flags & CEPH_OSD_FLAG_WRITE) {
+		req->r_request->hdr.data_off = cpu_to_le16(off);
+		req->r_request->hdr.data_len = cpu_to_le32(*plen);
+		op->payload_len = cpu_to_le32(*plen);
+	}
+
+	/* fill in oid */
+	head->object_len = cpu_to_le32(req->r_oid_len);
+	memcpy(p, req->r_oid, req->r_oid_len);
+	p += req->r_oid_len;
+
+	/* additional ops */
+	if (do_trunc) {
+		op++;
+		op->op = cpu_to_le16(opcode == CEPH_OSD_OP_READ ?
+			     CEPH_OSD_OP_MASKTRUNC : CEPH_OSD_OP_SETTRUNC);
+		op->trunc.truncate_seq = cpu_to_le32(truncate_seq);
+		prevofs = le64_to_cpu((op-1)->extent.offset);
+		op->trunc.truncate_size = cpu_to_le64(truncate_size -
+						      (off-prevofs));
+	}
+	if (do_sync) {
+		op++;
+		op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
+	}
+	if (snapc) {
+		head->snap_seq = cpu_to_le64(snapc->seq);
+		head->num_snaps = cpu_to_le32(snapc->num_snaps);
+		for (i = 0; i < snapc->num_snaps; i++) {
+			put_unaligned_le64(snapc->snaps[i], p);
+			p += sizeof(u64);
+		}
+	}
+
+	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+	return req;
+}
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *new)
+{
+	struct rb_node **p = &osdc->requests.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_osd_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_osd_request, r_node);
+		if (new->r_tid < req->r_tid)
+			p = &(*p)->rb_left;
+		else if (new->r_tid > req->r_tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+						 u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid)
+			n = n->rb_left;
+		else if (tid > req->r_tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+		    u64 tid)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *n = osdc->requests.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_osd_request, r_node);
+		if (tid < req->r_tid) {
+			if (!n->rb_left)
+				return req;
+			n = n->rb_left;
+		} else if (tid > req->r_tid) {
+			n = n->rb_right;
+		} else {
+			return req;
+		}
+	}
+	return NULL;
+}
+
+
+/*
+ * The messaging layer will reconnect to the osd as needed.  If the
+ * session has dropped, the OSD will have dropped the session state,
+ * and we'll get notified by the messaging layer.  If that happens, we
+ * need to resubmit all requests for that osd.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+
+	if (!osd)
+		return;
+	dout("osd_reset osd%d\n", osd->o_osd);
+	osdc = osd->o_osdc;
+	osd->o_incarnation++;
+	down_read(&osdc->map_sem);
+	kick_requests(osdc, osd);
+	up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd *osd;
+
+	osd = kzalloc(sizeof(*osd), GFP_NOFS);
+	if (!osd)
+		return NULL;
+
+	atomic_set(&osd->o_ref, 1);
+	osd->o_osdc = osdc;
+	INIT_LIST_HEAD(&osd->o_requests);
+	osd->o_incarnation = 1;
+
+	ceph_con_init(osdc->client->msgr, &osd->o_con);
+	osd->o_con.private = osd;
+	osd->o_con.ops = &osd_con_ops;
+	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+	return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+	if (atomic_inc_not_zero(&osd->o_ref)) {
+		dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+		     atomic_read(&osd->o_ref));
+		return osd;
+	} else {
+		dout("get_osd %p FAIL\n", osd);
+		return NULL;
+	}
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+	     atomic_read(&osd->o_ref) - 1);
+	if (atomic_dec_and_test(&osd->o_ref)) {
+		ceph_con_shutdown(&osd->o_con);
+		kfree(osd);
+	}
+}
+
+/*
+ * remove an osd from our map
+ */
+static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	dout("remove_osd %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_requests));
+	rb_erase(&osd->o_node, &osdc->osds);
+	ceph_con_close(&osd->o_con);
+	put_osd(osd);
+}
+
+/*
+ * reset osd connect
+ */
+static int reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	int ret = 0;
+
+	dout("reset_osd %p osd%d\n", osd, osd->o_osd);
+	if (list_empty(&osd->o_requests)) {
+		remove_osd(osdc, osd);
+	} else {
+		ceph_con_close(&osd->o_con);
+		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+		osd->o_incarnation++;
+	}
+	return ret;
+}
+
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+	struct rb_node **p = &osdc->osds.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_osd *osd = NULL;
+
+	while (*p) {
+		parent = *p;
+		osd = rb_entry(parent, struct ceph_osd, o_node);
+		if (new->o_osd < osd->o_osd)
+			p = &(*p)->rb_left;
+		else if (new->o_osd > osd->o_osd)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->o_node, parent, p);
+	rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+	struct ceph_osd *osd;
+	struct rb_node *n = osdc->osds.rb_node;
+
+	while (n) {
+		osd = rb_entry(n, struct ceph_osd, o_node);
+		if (o < osd->o_osd)
+			n = n->rb_left;
+		else if (o > osd->o_osd)
+			n = n->rb_right;
+		else
+			return osd;
+	}
+	return NULL;
+}
+
+
+/*
+ * Register request, assign tid.  If this is the first request, set up
+ * the timeout event.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+			     struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *head = req->r_request->front.iov_base;
+
+	mutex_lock(&osdc->request_mutex);
+	req->r_tid = ++osdc->last_tid;
+	head->tid = cpu_to_le64(req->r_tid);
+
+	dout("register_request %p tid %lld\n", req, req->r_tid);
+	__insert_request(osdc, req);
+	ceph_osdc_get_request(req);
+	osdc->num_requests++;
+
+	req->r_timeout_stamp =
+		jiffies + osdc->client->mount_args.osd_timeout*HZ;
+
+	if (osdc->num_requests == 1) {
+		osdc->timeout_tid = req->r_tid;
+		dout("  timeout on tid %llu at %lu\n", req->r_tid,
+		     req->r_timeout_stamp);
+		schedule_delayed_work(&osdc->timeout_work,
+		      round_jiffies_relative(req->r_timeout_stamp - jiffies));
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+				 struct ceph_osd_request *req)
+{
+	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+	rb_erase(&req->r_node, &osdc->requests);
+	osdc->num_requests--;
+
+	list_del_init(&req->r_osd_item);
+	if (list_empty(&req->r_osd->o_requests))
+		remove_osd(osdc, req->r_osd);
+	req->r_osd = NULL;
+
+	ceph_osdc_put_request(req);
+
+	if (req->r_tid == osdc->timeout_tid) {
+		if (osdc->num_requests == 0) {
+			dout("no requests, canceling timeout\n");
+			osdc->timeout_tid = 0;
+			cancel_delayed_work(&osdc->timeout_work);
+		} else {
+			req = rb_entry(rb_first(&osdc->requests),
+				       struct ceph_osd_request, r_node);
+			osdc->timeout_tid = req->r_tid;
+			dout("rescheduled timeout on tid %llu at %lu\n",
+			     req->r_tid, req->r_timeout_stamp);
+			schedule_delayed_work(&osdc->timeout_work,
+			      round_jiffies_relative(req->r_timeout_stamp -
+						     jiffies));
+		}
+	}
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+	if (req->r_sent) {
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+		req->r_sent = 0;
+	}
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately.  If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,
+		      struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+	union ceph_pg pgid;
+	int o = -1;
+	int err;
+	struct ceph_osd *newosd = NULL;
+
+	dout("map_osds %p tid %lld\n", req, req->r_tid);
+	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+				      &req->r_file_layout, osdc->osdmap);
+	if (err)
+		return err;
+	pgid.pg64 = le64_to_cpu(reqhead->layout.ol_pgid);
+	o = ceph_calc_pg_primary(osdc->osdmap, pgid);
+
+	if ((req->r_osd && req->r_osd->o_osd == o &&
+	     req->r_sent >= req->r_osd->o_incarnation) ||
+	    (req->r_osd == NULL && o == -1))
+		return 0;  /* no change */
+
+	dout("map_osds tid %llu pgid %llx pool %d osd%d (was osd%d)\n",
+	     req->r_tid, pgid.pg64, pgid.pg.pool, o,
+	     req->r_osd ? req->r_osd->o_osd : -1);
+
+	if (req->r_osd) {
+		__cancel_request(req);
+		list_del_init(&req->r_osd_item);
+		if (list_empty(&req->r_osd->o_requests)) {
+			/* try to re-use r_osd if possible */
+			newosd = get_osd(req->r_osd);
+			remove_osd(osdc, newosd);
+		}
+		req->r_osd = NULL;
+	}
+
+	req->r_osd = __lookup_osd(osdc, o);
+	if (!req->r_osd && o >= 0) {
+		if (newosd) {
+			req->r_osd = newosd;
+			newosd = NULL;
+		} else {
+			err = -ENOMEM;
+			req->r_osd = create_osd(osdc);
+			if (!req->r_osd)
+				goto out;
+		}
+
+		dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+		req->r_osd->o_osd = o;
+		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+		__insert_osd(osdc, req->r_osd);
+
+		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+	}
+
+	if (req->r_osd)
+		list_add(&req->r_osd_item, &req->r_osd->o_requests);
+	err = 1;   /* osd changed */
+
+out:
+	if (newosd)
+		put_osd(newosd);
+	return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+			  struct ceph_osd_request *req)
+{
+	struct ceph_osd_request_head *reqhead;
+	int err;
+
+	err = __map_osds(osdc, req);
+	if (err < 0)
+		return err;
+	if (req->r_osd == NULL) {
+		dout("send_request %p no up osds in pg\n", req);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+		return 0;
+	}
+
+	dout("send_request %p tid %llu to osd%d flags %d\n",
+	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+
+	reqhead = req->r_request->front.iov_base;
+	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
+	reqhead->reassert_version = req->r_reassert_version;
+
+	req->r_timeout_stamp = jiffies+osdc->client->mount_args.osd_timeout*HZ;
+
+	ceph_msg_get(req->r_request); /* send consumes a ref */
+	ceph_con_send(&req->r_osd->o_con, req->r_request);
+	req->r_sent = req->r_osd->o_incarnation;
+	return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds.  When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected.  Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client, timeout_work.work);
+	struct ceph_osd_request *req;
+	struct ceph_osd *osd;
+	unsigned long timeout = osdc->client->mount_args.osd_timeout * HZ;
+	unsigned long next_timeout = timeout + jiffies;
+	struct rb_node *p;
+
+	dout("timeout\n");
+	down_read(&osdc->map_sem);
+
+	ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		if (req->r_resend) {
+			int err;
+
+			dout("osdc resending prev failed %lld\n", req->r_tid);
+			err = __send_request(osdc, req);
+			if (err)
+				dout("osdc failed again on %lld\n", req->r_tid);
+			else
+				req->r_resend = false;
+			continue;
+		}
+	}
+	for (p = rb_first(&osdc->osds); p; p = rb_next(p)) {
+		osd = rb_entry(p, struct ceph_osd, o_node);
+		if (list_empty(&osd->o_requests))
+			continue;
+		req = list_first_entry(&osd->o_requests,
+				       struct ceph_osd_request, r_osd_item);
+		if (time_before(jiffies, req->r_timeout_stamp))
+			continue;
+
+		dout(" tid %llu (at least) timed out on osd%d\n",
+		     req->r_tid, osd->o_osd);
+		req->r_timeout_stamp = next_timeout;
+		ceph_con_keepalive(&osd->o_con);
+	}
+
+	if (osdc->timeout_tid)
+		schedule_delayed_work(&osdc->timeout_work,
+				      round_jiffies_relative(timeout));
+
+	mutex_unlock(&osdc->request_mutex);
+
+	up_read(&osdc->map_sem);
+}
+
+/*
+ * handle osd op reply.  either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+	struct ceph_osd_request *req;
+	u64 tid;
+	int numops, object_len, flags;
+
+	if (msg->front.iov_len < sizeof(*rhead))
+		goto bad;
+	tid = le64_to_cpu(rhead->tid);
+	numops = le32_to_cpu(rhead->num_ops);
+	object_len = le32_to_cpu(rhead->object_len);
+	if (msg->front.iov_len != sizeof(*rhead) + object_len +
+	    numops * sizeof(struct ceph_osd_op))
+		goto bad;
+	dout("handle_reply %p tid %llu\n", msg, tid);
+
+	/* lookup */
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (req == NULL) {
+		dout("handle_reply tid %llu dne\n", tid);
+		mutex_unlock(&osdc->request_mutex);
+		return;
+	}
+	ceph_osdc_get_request(req);
+	flags = le32_to_cpu(rhead->flags);
+
+	if (req->r_reply) {
+		/*
+		 * once we see the message has been received, we don't
+		 * need a ref (which is only needed for revoking
+		 * pages)
+		 */
+		ceph_msg_put(req->r_reply);
+		req->r_reply = NULL;
+	}
+
+	if (!req->r_got_reply) {
+		unsigned bytes;
+
+		req->r_result = le32_to_cpu(rhead->result);
+		bytes = le32_to_cpu(msg->hdr.data_len);
+		dout("handle_reply result %d bytes %d\n", req->r_result,
+		     bytes);
+		if (req->r_result == 0)
+			req->r_result = bytes;
+
+		/* in case this is a write and we need to replay, */
+		req->r_reassert_version = rhead->reassert_version;
+
+		req->r_got_reply = 1;
+	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+		dout("handle_reply tid %llu dup ack\n", tid);
+		goto done;
+	}
+
+	dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+	/* either this is a read, or we got the safe response */
+	if ((flags & CEPH_OSD_FLAG_ONDISK) ||
+	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+		__unregister_request(osdc, req);
+
+	mutex_unlock(&osdc->request_mutex);
+
+	if (req->r_callback)
+		req->r_callback(req, msg);
+	else
+		complete(&req->r_completion);
+
+	if (flags & CEPH_OSD_FLAG_ONDISK) {
+		if (req->r_safe_callback)
+			req->r_safe_callback(req, msg);
+		complete(&req->r_safe_completion);  /* fsync waiter */
+	}
+
+done:
+	ceph_osdc_put_request(req);
+	return;
+
+bad:
+	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+	       (int)sizeof(*rhead));
+}
+
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed.  Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd)
+{
+	struct ceph_osd_request *req;
+	struct rb_node *p, *n;
+	int needmap = 0;
+	int err;
+
+	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+	mutex_lock(&osdc->request_mutex);
+	if (!kickosd) {
+		for (p = rb_first(&osdc->osds); p; p = n) {
+			struct ceph_osd *osd =
+				rb_entry(p, struct ceph_osd, o_node);
+
+			n = rb_next(p);
+			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+			    !ceph_entity_addr_equal(&osd->o_con.peer_addr,
+					    ceph_osd_addr(osdc->osdmap,
+							  osd->o_osd)))
+				reset_osd(osdc, osd);
+		}
+	}
+
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		if (req->r_resend) {
+			dout(" r_resend set on tid %llu\n", req->r_tid);
+			goto kick;
+		}
+		if (req->r_osd && kickosd == req->r_osd)
+			goto kick;
+
+		err = __map_osds(osdc, req);
+		if (err == 0)
+			continue;  /* no change */
+		if (err < 0) {
+			/*
+			 * FIXME: really, we should set the request
+			 * error and fail if this isn't a 'nofail'
+			 * request, but that's a fair bit more
+			 * complicated to do.  So retry!
+			 */
+			dout(" setting r_resend on %llu\n", req->r_tid);
+			req->r_resend = true;
+			continue;
+		}
+		if (req->r_osd == NULL) {
+			dout("tid %llu maps to no valid osd\n", req->r_tid);
+			needmap++;  /* request a newer map */
+			continue;
+		}
+
+kick:
+		dout("kicking tid %llu osd%d\n", req->r_tid, req->r_osd->o_osd);
+		req->r_flags |= CEPH_OSD_FLAG_RETRY;
+		err = __send_request(osdc, req);
+		if (err) {
+			dout(" setting r_resend on %llu\n", req->r_tid);
+			req->r_resend = true;
+		}
+	}
+	mutex_unlock(&osdc->request_mutex);
+
+	if (needmap) {
+		dout("%d requests for down osds, need new map\n", needmap);
+		ceph_monc_request_next_osdmap(&osdc->client->monc);
+	}
+}
+
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster.  Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+	void *p, *end, *next;
+	u32 nr_maps, maplen;
+	u32 epoch;
+	struct ceph_osdmap *newmap = NULL, *oldmap;
+	int err;
+	struct ceph_fsid fsid;
+
+	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	/* verify fsid */
+	ceph_decode_need(&p, end, sizeof(fsid), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	if (ceph_fsid_compare(&fsid, &osdc->client->monc.monmap->fsid)) {
+		pr_err("got osdmap with wrong fsid, ignoring\n");
+		return;
+	}
+
+	down_write(&osdc->map_sem);
+
+	/* incremental maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d inc maps\n", nr_maps);
+	while (nr_maps > 0) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		ceph_decode_32(&p, epoch);
+		ceph_decode_32(&p, maplen);
+		ceph_decode_need(&p, end, maplen, bad);
+		next = p + maplen;
+		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+			dout("applying incremental map %u len %d\n",
+			     epoch, maplen);
+			newmap = osdmap_apply_incremental(&p, next,
+							  osdc->osdmap,
+							  osdc->client->msgr);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			if (newmap != osdc->osdmap) {
+				ceph_osdmap_destroy(osdc->osdmap);
+				osdc->osdmap = newmap;
+			}
+		} else {
+			dout("ignoring incremental map %u len %d\n",
+			     epoch, maplen);
+		}
+		p = next;
+		nr_maps--;
+	}
+	if (newmap)
+		goto done;
+
+	/* full maps */
+	ceph_decode_32_safe(&p, end, nr_maps, bad);
+	dout(" %d full maps\n", nr_maps);
+	while (nr_maps) {
+		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+		ceph_decode_32(&p, epoch);
+		ceph_decode_32(&p, maplen);
+		ceph_decode_need(&p, end, maplen, bad);
+		if (nr_maps > 1) {
+			dout("skipping non-latest full map %u len %d\n",
+			     epoch, maplen);
+		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+			dout("skipping full map %u len %d, "
+			     "older than our %u\n", epoch, maplen,
+			     osdc->osdmap->epoch);
+		} else {
+			dout("taking full map %u len %d\n", epoch, maplen);
+			newmap = osdmap_decode(&p, p+maplen);
+			if (IS_ERR(newmap)) {
+				err = PTR_ERR(newmap);
+				goto bad;
+			}
+			oldmap = osdc->osdmap;
+			osdc->osdmap = newmap;
+			if (oldmap)
+				ceph_osdmap_destroy(oldmap);
+		}
+		p += maplen;
+		nr_maps--;
+	}
+
+done:
+	downgrade_write(&osdc->map_sem);
+	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+	if (newmap)
+		kick_requests(osdc, NULL);
+	up_read(&osdc->map_sem);
+	return;
+
+bad:
+	pr_err("osdc handle_map corrupt msg\n");
+	up_write(&osdc->map_sem);
+	return;
+}
+
+
+/*
+ * A read request prepares specific pages that data is to be read into.
+ * When a message is being read off the wire, we call prepare_pages to
+ * find those pages.
+ *  0 = success, -1 failure.
+ */
+static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m,
+			 int want)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc;
+	struct ceph_osd_reply_head *rhead = m->front.iov_base;
+	struct ceph_osd_request *req;
+	u64 tid;
+	int ret = -1;
+	int type = le16_to_cpu(m->hdr.type);
+
+	if (!osd)
+		return -1;
+	osdc = osd->o_osdc;
+
+	dout("prepare_pages on msg %p want %d\n", m, want);
+	if (unlikely(type != CEPH_MSG_OSD_OPREPLY))
+		return -1;  /* hmm! */
+
+	tid = le64_to_cpu(rhead->tid);
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (!req) {
+		dout("prepare_pages unknown tid %llu\n", tid);
+		goto out;
+	}
+	dout("prepare_pages tid %llu has %d pages, want %d\n",
+	     tid, req->r_num_pages, want);
+	if (likely(req->r_num_pages >= want && !req->r_prepared_pages)) {
+		m->pages = req->r_pages;
+		m->nr_pages = req->r_num_pages;
+		req->r_reply = m;  /* only for duration of read over socket */
+		ceph_msg_get(m);
+		req->r_prepared_pages = 1;
+		ret = 0; /* success */
+	}
+out:
+	mutex_unlock(&osdc->request_mutex);
+	return ret;
+}
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+			    struct ceph_osd_request *req,
+			    bool nofail)
+{
+	int rc;
+
+	req->r_request->pages = req->r_pages;
+	req->r_request->nr_pages = req->r_num_pages;
+
+	register_request(osdc, req);
+
+	down_read(&osdc->map_sem);
+	mutex_lock(&osdc->request_mutex);
+	rc = __send_request(osdc, req);
+	if (rc) {
+		if (nofail) {
+			dout("osdc_start_request failed send, marking %lld\n",
+			     req->r_tid);
+			req->r_resend = true;
+			rc = 0;
+		} else {
+			__unregister_request(osdc, req);
+		}
+	}
+	mutex_unlock(&osdc->request_mutex);
+	up_read(&osdc->map_sem);
+	return rc;
+}
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+			   struct ceph_osd_request *req)
+{
+	int rc;
+
+	rc = wait_for_completion_interruptible(&req->r_completion);
+	if (rc < 0) {
+		mutex_lock(&osdc->request_mutex);
+		__cancel_request(req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("wait_request tid %llu timed out\n", req->r_tid);
+		return rc;
+	}
+
+	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+	return req->r_result;
+}
+
+/*
+ * sync - wait for all in-flight requests to flush.  avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+	struct ceph_osd_request *req;
+	u64 last_tid, next_tid = 0;
+
+	mutex_lock(&osdc->request_mutex);
+	last_tid = osdc->last_tid;
+	while (1) {
+		req = __lookup_request_ge(osdc, next_tid);
+		if (!req)
+			break;
+		if (req->r_tid > last_tid)
+			break;
+
+		next_tid = req->r_tid + 1;
+		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+			continue;
+
+		ceph_osdc_get_request(req);
+		mutex_unlock(&osdc->request_mutex);
+		dout("sync waiting on tid %llu (last is %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		mutex_lock(&osdc->request_mutex);
+		ceph_osdc_put_request(req);
+	}
+	mutex_unlock(&osdc->request_mutex);
+	dout("sync done (thru tid %llu)\n", last_tid);
+}
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+	int err;
+
+	dout("init\n");
+	osdc->client = client;
+	osdc->osdmap = NULL;
+	init_rwsem(&osdc->map_sem);
+	init_completion(&osdc->map_waiters);
+	osdc->last_requested_map = 0;
+	mutex_init(&osdc->request_mutex);
+	osdc->timeout_tid = 0;
+	osdc->last_tid = 0;
+	osdc->osds = RB_ROOT;
+	osdc->requests = RB_ROOT;
+	osdc->num_requests = 0;
+	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+
+	osdc->req_mempool = mempool_create_kmalloc_pool(10,
+					sizeof(struct ceph_osd_request));
+	if (!osdc->req_mempool)
+		return -ENOMEM;
+
+	err = ceph_msgpool_init(&osdc->msgpool_op, 4096, 10, true);
+	if (err < 0)
+		return -ENOMEM;
+	err = ceph_msgpool_init(&osdc->msgpool_op_reply, 512, 0, false);
+	if (err < 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+	cancel_delayed_work_sync(&osdc->timeout_work);
+	if (osdc->osdmap) {
+		ceph_osdmap_destroy(osdc->osdmap);
+		osdc->osdmap = NULL;
+	}
+	mempool_destroy(osdc->req_mempool);
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+
+/*
+ * Read some contiguous pages.  If we cross a stripe boundary, shorten
+ * *plen.  Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+			struct ceph_vino vino, struct ceph_file_layout *layout,
+			u64 off, u64 *plen,
+			u32 truncate_seq, u64 truncate_size,
+			struct page **pages, int num_pages)
+{
+	struct ceph_osd_request *req;
+	int rc = 0;
+
+	dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+	     vino.snap, off, *plen);
+	req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+				    NULL, 0, truncate_seq, truncate_size, NULL,
+				    false, 1);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	/* it may be a short read due to an object boundary */
+	req->r_pages = pages;
+	num_pages = calc_pages_for(off, *plen);
+	req->r_num_pages = num_pages;
+
+	dout("readpages  final extent is %llu~%llu (%d pages)\n",
+	     off, *plen, req->r_num_pages);
+
+	rc = ceph_osdc_start_request(osdc, req, false);
+	if (!rc)
+		rc = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	dout("readpages result %d\n", rc);
+	return rc;
+}
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+			 struct ceph_file_layout *layout,
+			 struct ceph_snap_context *snapc,
+			 u64 off, u64 len,
+			 u32 truncate_seq, u64 truncate_size,
+			 struct timespec *mtime,
+			 struct page **pages, int num_pages,
+			 int flags, int do_sync, bool nofail)
+{
+	struct ceph_osd_request *req;
+	int rc = 0;
+
+	BUG_ON(vino.snap != CEPH_NOSNAP);
+	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+				    CEPH_OSD_OP_WRITE,
+				    flags | CEPH_OSD_FLAG_ONDISK |
+					    CEPH_OSD_FLAG_WRITE,
+				    snapc, do_sync,
+				    truncate_seq, truncate_size, mtime,
+				    nofail, 1);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	/* it may be a short write due to an object boundary */
+	req->r_pages = pages;
+	req->r_num_pages = calc_pages_for(off, len);
+	dout("writepages %llu~%llu (%d pages)\n", off, len,
+	     req->r_num_pages);
+
+	rc = ceph_osdc_start_request(osdc, req, nofail);
+	if (!rc)
+		rc = ceph_osdc_wait_request(osdc, req);
+
+	ceph_osdc_put_request(req);
+	if (rc == 0)
+		rc = len;
+	dout("writepages result %d\n", rc);
+	return rc;
+}
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!osd)
+		return;
+
+	switch (type) {
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(osdc, msg);
+		break;
+	case CEPH_MSG_OSD_OPREPLY:
+		handle_reply(osdc, msg);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+	ceph_msg_put(msg);
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr)
+{
+	struct ceph_osd *osd = con->private;
+	struct ceph_osd_client *osdc = osd->o_osdc;
+	int type = le16_to_cpu(hdr->type);
+
+	switch (type) {
+	case CEPH_MSG_OSD_OPREPLY:
+		return ceph_msgpool_get(&osdc->msgpool_op_reply);
+	}
+	return ceph_alloc_msg(con, hdr);
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	if (get_osd(osd))
+		return con;
+	return NULL;
+}
+
+static void put_osd_con(struct ceph_connection *con)
+{
+	struct ceph_osd *osd = con->private;
+	put_osd(osd);
+}
+
+const static struct ceph_connection_operations osd_con_ops = {
+	.get = get_osd_con,
+	.put = put_osd_con,
+	.dispatch = dispatch,
+	.alloc_msg = alloc_msg,
+	.peer_reset = osd_reset,
+	.alloc_middle = ceph_alloc_middle,
+	.prepare_pages = prepare_pages,
+};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..9a4addf7d651
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,144 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+
+#include "types.h"
+#include "osdmap.h"
+#include "messenger.h"
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
+				     struct ceph_msg *);
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+	atomic_t o_ref;
+	struct ceph_osd_client *o_osdc;
+	int o_osd;
+	int o_incarnation;
+	struct rb_node o_node;
+	struct ceph_connection o_con;
+	struct list_head o_requests;
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+	u64             r_tid;              /* unique for this client */
+	struct rb_node  r_node;
+	struct list_head r_osd_item;
+	struct ceph_osd *r_osd;
+
+	struct ceph_msg  *r_request, *r_reply;
+	int               r_result;
+	int               r_flags;     /* any additional flags for the osd */
+	u32               r_sent;      /* >0 if r_request is sending/sent */
+	int r_prepared_pages, r_got_reply;
+
+	struct ceph_osd_client *r_osdc;
+	atomic_t          r_ref;
+	bool              r_mempool;
+	struct completion r_completion, r_safe_completion;
+	ceph_osdc_callback_t r_callback, r_safe_callback;
+	struct ceph_eversion r_reassert_version;
+	struct list_head  r_unsafe_item;
+
+	struct inode *r_inode;         	      /* for use by callbacks */
+	struct writeback_control *r_wbc;      /* ditto */
+
+	char              r_oid[40];          /* object name */
+	int               r_oid_len;
+	unsigned long     r_timeout_stamp;
+	bool              r_resend;           /* msg send failed, needs retry */
+
+	struct ceph_file_layout r_file_layout;
+	struct ceph_snap_context *r_snapc;    /* snap context for writes */
+	unsigned          r_num_pages;        /* size of page array (follows) */
+	struct page     **r_pages;            /* pages for data payload */
+	int               r_pages_from_pool;
+	int               r_own_pages;        /* if true, i own page list */
+};
+
+struct ceph_osd_client {
+	struct ceph_client     *client;
+
+	struct ceph_osdmap     *osdmap;       /* current map */
+	struct rw_semaphore    map_sem;
+	struct completion      map_waiters;
+	u64                    last_requested_map;
+
+	struct mutex           request_mutex;
+	struct rb_root         osds;          /* osds */
+	u64                    timeout_tid;   /* tid of timeout triggering rq */
+	u64                    last_tid;      /* tid of last request */
+	struct rb_root         requests;      /* pending requests */
+	int                    num_requests;
+	struct delayed_work    timeout_work;
+	struct dentry 	       *debugfs_file;
+
+	mempool_t              *req_mempool;
+
+	struct ceph_msgpool   msgpool_op;
+	struct ceph_msgpool   msgpool_op_reply;
+};
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+			  struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+				   struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
+				 struct ceph_msg *msg);
+
+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+				      struct ceph_file_layout *layout,
+				      struct ceph_vino vino,
+				      u64 offset, u64 *len, int op, int flags,
+				      struct ceph_snap_context *snapc,
+				      int do_sync, u32 truncate_seq,
+				      u64 truncate_size,
+				      struct timespec *mtime,
+				      bool use_mempool, int num_reply);
+
+static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+	atomic_inc(&req->r_ref);
+}
+extern void ceph_osdc_put_request(struct ceph_osd_request *req);
+
+extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+				   struct ceph_osd_request *req,
+				   bool nofail);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+				  struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+			       struct ceph_vino vino,
+			       struct ceph_file_layout *layout,
+			       u64 off, u64 *plen,
+			       u32 truncate_seq, u64 truncate_size,
+			       struct page **pages, int nr_pages);
+
+extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
+				struct ceph_vino vino,
+				struct ceph_file_layout *layout,
+				struct ceph_snap_context *sc,
+				u64 off, u64 len,
+				u32 truncate_seq, u64 truncate_size,
+				struct timespec *mtime,
+				struct page **pages, int nr_pages,
+				int flags, int do_sync, bool nofail);
+
+#endif
+
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..e38fe6309b1c
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,875 @@
+
+#include <asm/div64.h>
+
+#include "super.h"
+#include "osdmap.h"
+#include "crush/hash.h"
+#include "crush/mapper.h"
+#include "decode.h"
+#include "ceph_debug.h"
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+	int flag = 0;
+
+	if (!len)
+		goto done;
+
+	*str = '\0';
+	if (state) {
+		if (state & CEPH_OSD_EXISTS) {
+			snprintf(str, len, "exists");
+			flag = 1;
+		}
+		if (state & CEPH_OSD_UP) {
+			snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
+				 "up");
+			flag = 1;
+		}
+	} else {
+		snprintf(str, len, "doesn't exist");
+	}
+done:
+	return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned t)
+{
+	int b = 0;
+	while (t) {
+		t = t >> 1;
+		b++;
+	}
+	return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+	pi->pgp_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+	pi->lpg_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+	pi->lpgp_num_mask =
+		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+				       struct crush_bucket_uniform *b)
+{
+	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+	ceph_decode_32(p, b->item_weight);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+				    struct crush_bucket_list *b)
+{
+	int j;
+	dout("crush_decode_list_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->sum_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++) {
+		ceph_decode_32(p, b->item_weights[j]);
+		ceph_decode_32(p, b->sum_weights[j]);
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+				    struct crush_bucket_tree *b)
+{
+	int j;
+	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+	ceph_decode_32_safe(p, end, b->num_nodes, bad);
+	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+	if (b->node_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+	for (j = 0; j < b->num_nodes; j++)
+		ceph_decode_32(p, b->node_weights[j]);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+				     struct crush_bucket_straw *b)
+{
+	int j;
+	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->straws == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++) {
+		ceph_decode_32(p, b->item_weights[j]);
+		ceph_decode_32(p, b->straws[j]);
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+	struct crush_map *c;
+	int err = -EINVAL;
+	int i, j;
+	void **p = &pbyval;
+	void *start = pbyval;
+	u32 magic;
+
+	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	c = kzalloc(sizeof(*c), GFP_NOFS);
+	if (c == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ceph_decode_need(p, end, 4*sizeof(u32), bad);
+	ceph_decode_32(p, magic);
+	if (magic != CRUSH_MAGIC) {
+		pr_err("crush_decode magic %x != current %x\n",
+		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
+		goto bad;
+	}
+	ceph_decode_32(p, c->max_buckets);
+	ceph_decode_32(p, c->max_rules);
+	ceph_decode_32(p, c->max_devices);
+
+	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+	if (c->device_parents == NULL)
+		goto badmem;
+	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+	if (c->bucket_parents == NULL)
+		goto badmem;
+
+	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+	if (c->buckets == NULL)
+		goto badmem;
+	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+	if (c->rules == NULL)
+		goto badmem;
+
+	/* buckets */
+	for (i = 0; i < c->max_buckets; i++) {
+		int size = 0;
+		u32 alg;
+		struct crush_bucket *b;
+
+		ceph_decode_32_safe(p, end, alg, bad);
+		if (alg == 0) {
+			c->buckets[i] = NULL;
+			continue;
+		}
+		dout("crush_decode bucket %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		switch (alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			size = sizeof(struct crush_bucket_uniform);
+			break;
+		case CRUSH_BUCKET_LIST:
+			size = sizeof(struct crush_bucket_list);
+			break;
+		case CRUSH_BUCKET_TREE:
+			size = sizeof(struct crush_bucket_tree);
+			break;
+		case CRUSH_BUCKET_STRAW:
+			size = sizeof(struct crush_bucket_straw);
+			break;
+		default:
+			goto bad;
+		}
+		BUG_ON(size == 0);
+		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+		if (b == NULL)
+			goto badmem;
+
+		ceph_decode_need(p, end, 4*sizeof(u32), bad);
+		ceph_decode_32(p, b->id);
+		ceph_decode_16(p, b->type);
+		ceph_decode_16(p, b->alg);
+		ceph_decode_32(p, b->weight);
+		ceph_decode_32(p, b->size);
+
+		dout("crush_decode bucket size %d off %x %p to %p\n",
+		     b->size, (int)(*p-start), *p, end);
+
+		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+		if (b->items == NULL)
+			goto badmem;
+		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+		if (b->perm == NULL)
+			goto badmem;
+		b->perm_n = 0;
+
+		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+		for (j = 0; j < b->size; j++)
+			ceph_decode_32(p, b->items[j]);
+
+		switch (b->alg) {
+		case CRUSH_BUCKET_UNIFORM:
+			err = crush_decode_uniform_bucket(p, end,
+				  (struct crush_bucket_uniform *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_LIST:
+			err = crush_decode_list_bucket(p, end,
+			       (struct crush_bucket_list *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_TREE:
+			err = crush_decode_tree_bucket(p, end,
+				(struct crush_bucket_tree *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		case CRUSH_BUCKET_STRAW:
+			err = crush_decode_straw_bucket(p, end,
+				(struct crush_bucket_straw *)b);
+			if (err < 0)
+				goto bad;
+			break;
+		}
+	}
+
+	/* rules */
+	dout("rule vec is %p\n", c->rules);
+	for (i = 0; i < c->max_rules; i++) {
+		u32 yes;
+		struct crush_rule *r;
+
+		ceph_decode_32_safe(p, end, yes, bad);
+		if (!yes) {
+			dout("crush_decode NO rule %d off %x %p to %p\n",
+			     i, (int)(*p-start), *p, end);
+			c->rules[i] = NULL;
+			continue;
+		}
+
+		dout("crush_decode rule %d off %x %p to %p\n",
+		     i, (int)(*p-start), *p, end);
+
+		/* len */
+		ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+			goto bad;
+#endif
+		r = c->rules[i] = kmalloc(sizeof(*r) +
+					  yes*sizeof(struct crush_rule_step),
+					  GFP_NOFS);
+		if (r == NULL)
+			goto badmem;
+		dout(" rule %d is at %p\n", i, r);
+		r->len = yes;
+		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+		for (j = 0; j < r->len; j++) {
+			ceph_decode_32(p, r->steps[j].op);
+			ceph_decode_32(p, r->steps[j].arg1);
+			ceph_decode_32(p, r->steps[j].arg2);
+		}
+	}
+
+	/* ignore trailing name maps. */
+
+	dout("crush_decode success\n");
+	return c;
+
+badmem:
+	err = -ENOMEM;
+bad:
+	dout("crush_decode fail %d\n", err);
+	crush_destroy(c);
+	return ERR_PTR(err);
+}
+
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+	dout("osdmap_destroy %p\n", map);
+	if (map->crush)
+		crush_destroy(map->crush);
+	while (!RB_EMPTY_ROOT(&map->pg_temp))
+		rb_erase(rb_first(&map->pg_temp), &map->pg_temp);
+	kfree(map->osd_state);
+	kfree(map->osd_weight);
+	kfree(map->pg_pool);
+	kfree(map->osd_addr);
+	kfree(map);
+}
+
+/*
+ * adjust max osd value.  reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+	u8 *state;
+	struct ceph_entity_addr *addr;
+	u32 *weight;
+
+	state = kcalloc(max, sizeof(*state), GFP_NOFS);
+	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+	if (state == NULL || addr == NULL || weight == NULL) {
+		kfree(state);
+		kfree(addr);
+		kfree(weight);
+		return -ENOMEM;
+	}
+
+	/* copy old? */
+	if (map->osd_state) {
+		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+		kfree(map->osd_state);
+		kfree(map->osd_addr);
+		kfree(map->osd_weight);
+	}
+
+	map->osd_state = state;
+	map->osd_weight = weight;
+	map->osd_addr = addr;
+	map->max_osd = max;
+	return 0;
+}
+
+/*
+ * Insert a new pg_temp mapping
+ */
+static void __insert_pg_mapping(struct ceph_pg_mapping *new,
+				struct rb_root *root)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_pg_mapping *pg = NULL;
+
+	while (*p) {
+		parent = *p;
+		pg = rb_entry(parent, struct ceph_pg_mapping, node);
+		if (new->pgid < pg->pgid)
+			p = &(*p)->rb_left;
+		else if (new->pgid > pg->pgid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+}
+
+/*
+ * decode a full map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+	struct ceph_osdmap *map;
+	u16 version;
+	u32 len, max, i;
+	int err = -EINVAL;
+	void *start = *p;
+
+	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+	map = kzalloc(sizeof(*map), GFP_NOFS);
+	if (map == NULL)
+		return ERR_PTR(-ENOMEM);
+	map->pg_temp = RB_ROOT;
+
+	ceph_decode_16_safe(p, end, version, bad);
+
+	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+	ceph_decode_32(p, map->epoch);
+	ceph_decode_copy(p, &map->created, sizeof(map->created));
+	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+	ceph_decode_32(p, map->num_pools);
+	map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
+			       GFP_NOFS);
+	if (!map->pg_pool) {
+		err = -ENOMEM;
+		goto bad;
+	}
+	ceph_decode_32_safe(p, end, max, bad);
+	while (max--) {
+		ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad);
+		ceph_decode_32(p, i);
+		if (i >= map->num_pools)
+			goto bad;
+		ceph_decode_copy(p, &map->pg_pool[i].v,
+				 sizeof(map->pg_pool->v));
+		calc_pg_masks(&map->pg_pool[i]);
+		p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64);
+		p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals)
+			* sizeof(u64) * 2;
+	}
+
+	ceph_decode_32_safe(p, end, map->flags, bad);
+
+	ceph_decode_32(p, max);
+
+	/* (re)alloc osd arrays */
+	err = osdmap_set_max_osd(map, max);
+	if (err < 0)
+		goto bad;
+	dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+	/* osds */
+	err = -EINVAL;
+	ceph_decode_need(p, end, 3*sizeof(u32) +
+			 map->max_osd*(1 + sizeof(*map->osd_weight) +
+				       sizeof(*map->osd_addr)), bad);
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+	*p += 4; /* skip length field (should match max) */
+	for (i = 0; i < map->max_osd; i++)
+		ceph_decode_32(p, map->osd_weight[i]);
+
+	*p += 4; /* skip length field (should match max) */
+	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+
+	/* pg_temp */
+	ceph_decode_32_safe(p, end, len, bad);
+	for (i = 0; i < len; i++) {
+		int n, j;
+		u64 pgid;
+		struct ceph_pg_mapping *pg;
+
+		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+		ceph_decode_64(p, pgid);
+		ceph_decode_32(p, n);
+		ceph_decode_need(p, end, n * sizeof(u32), bad);
+		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+		if (!pg) {
+			err = -ENOMEM;
+			goto bad;
+		}
+		pg->pgid = pgid;
+		pg->len = n;
+		for (j = 0; j < n; j++)
+			ceph_decode_32(p, pg->osds[j]);
+
+		__insert_pg_mapping(pg, &map->pg_temp);
+		dout(" added pg_temp %llx len %d\n", pgid, len);
+	}
+
+	/* crush */
+	ceph_decode_32_safe(p, end, len, bad);
+	dout("osdmap_decode crush len %d from off 0x%x\n", len,
+	     (int)(*p - start));
+	ceph_decode_need(p, end, len, bad);
+	map->crush = crush_decode(*p, end);
+	*p += len;
+	if (IS_ERR(map->crush)) {
+		err = PTR_ERR(map->crush);
+		map->crush = NULL;
+		goto bad;
+	}
+
+	/* ignore the rest of the map */
+	*p = end;
+
+	dout("osdmap_decode done %p %p\n", *p, end);
+	return map;
+
+bad:
+	dout("osdmap_decode fail\n");
+	ceph_osdmap_destroy(map);
+	return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					     struct ceph_osdmap *map,
+					     struct ceph_messenger *msgr)
+{
+	struct ceph_osdmap *newmap = map;
+	struct crush_map *newcrush = NULL;
+	struct ceph_fsid fsid;
+	u32 epoch = 0;
+	struct ceph_timespec modified;
+	u32 len, pool;
+	__s32 new_flags, max;
+	void *start = *p;
+	int err = -EINVAL;
+	u16 version;
+	struct rb_node *rbp;
+
+	ceph_decode_16_safe(p, end, version, bad);
+
+	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+			 bad);
+	ceph_decode_copy(p, &fsid, sizeof(fsid));
+	ceph_decode_32(p, epoch);
+	BUG_ON(epoch != map->epoch+1);
+	ceph_decode_copy(p, &modified, sizeof(modified));
+	ceph_decode_32(p, new_flags);
+
+	/* full map? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental full map len %d, %p to %p\n",
+		     len, *p, end);
+		newmap = osdmap_decode(p, min(*p+len, end));
+		return newmap;  /* error or not */
+	}
+
+	/* new crush? */
+	ceph_decode_32_safe(p, end, len, bad);
+	if (len > 0) {
+		dout("apply_incremental new crush map len %d, %p to %p\n",
+		     len, *p, end);
+		newcrush = crush_decode(*p, min(*p+len, end));
+		if (IS_ERR(newcrush))
+			return ERR_PTR(PTR_ERR(newcrush));
+	}
+
+	/* new flags? */
+	if (new_flags >= 0)
+		map->flags = new_flags;
+
+	ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+	/* new max? */
+	ceph_decode_32(p, max);
+	if (max >= 0) {
+		err = osdmap_set_max_osd(map, max);
+		if (err < 0)
+			goto bad;
+	}
+
+	map->epoch++;
+	map->modified = map->modified;
+	if (newcrush) {
+		if (map->crush)
+			crush_destroy(map->crush);
+		map->crush = newcrush;
+		newcrush = NULL;
+	}
+
+	/* new_pool */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		ceph_decode_32_safe(p, end, pool, bad);
+		if (pool >= map->num_pools) {
+			void *pg_pool = kcalloc(pool + 1,
+						sizeof(*map->pg_pool),
+						GFP_NOFS);
+			if (!pg_pool) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			memcpy(pg_pool, map->pg_pool,
+			       map->num_pools * sizeof(*map->pg_pool));
+			kfree(map->pg_pool);
+			map->pg_pool = pg_pool;
+			map->num_pools = pool+1;
+		}
+		ceph_decode_copy(p, &map->pg_pool[pool].v,
+				 sizeof(map->pg_pool->v));
+		calc_pg_masks(&map->pg_pool[pool]);
+	}
+
+	/* old_pool (ignore) */
+	ceph_decode_32_safe(p, end, len, bad);
+	*p += len * sizeof(u32);
+
+	/* new_up */
+	err = -EINVAL;
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		struct ceph_entity_addr addr;
+		ceph_decode_32_safe(p, end, osd, bad);
+		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+		pr_info("osd%d up\n", osd);
+		BUG_ON(osd >= map->max_osd);
+		map->osd_state[osd] |= CEPH_OSD_UP;
+		map->osd_addr[osd] = addr;
+	}
+
+	/* new_down */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd;
+		ceph_decode_32_safe(p, end, osd, bad);
+		(*p)++;  /* clean flag */
+		pr_info("ceph osd%d down\n", osd);
+		if (osd < map->max_osd)
+			map->osd_state[osd] &= ~CEPH_OSD_UP;
+	}
+
+	/* new_weight */
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		u32 osd, off;
+		ceph_decode_need(p, end, sizeof(u32)*2, bad);
+		ceph_decode_32(p, osd);
+		ceph_decode_32(p, off);
+		pr_info("osd%d weight 0x%x %s\n", osd, off,
+		     off == CEPH_OSD_IN ? "(in)" :
+		     (off == CEPH_OSD_OUT ? "(out)" : ""));
+		if (osd < map->max_osd)
+			map->osd_weight[osd] = off;
+	}
+
+	/* new_pg_temp */
+	rbp = rb_first(&map->pg_temp);
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		struct ceph_pg_mapping *pg;
+		int j;
+		u64 pgid;
+		u32 pglen;
+		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+		ceph_decode_64(p, pgid);
+		ceph_decode_32(p, pglen);
+
+		/* remove any? */
+		while (rbp && rb_entry(rbp, struct ceph_pg_mapping,
+				       node)->pgid <= pgid) {
+			struct rb_node *cur = rbp;
+			rbp = rb_next(rbp);
+			dout(" removed pg_temp %llx\n",
+			     rb_entry(cur, struct ceph_pg_mapping, node)->pgid);
+			rb_erase(cur, &map->pg_temp);
+		}
+
+		if (pglen) {
+			/* insert */
+			ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+			if (!pg) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pg->pgid = pgid;
+			pg->len = pglen;
+			for (j = 0; j < len; j++)
+				ceph_decode_32(p, pg->osds[j]);
+			__insert_pg_mapping(pg, &map->pg_temp);
+			dout(" added pg_temp %llx len %d\n", pgid, pglen);
+		}
+	}
+	while (rbp) {
+		struct rb_node *cur = rbp;
+		rbp = rb_next(rbp);
+		dout(" removed pg_temp %llx\n",
+		     rb_entry(cur, struct ceph_pg_mapping, node)->pgid);
+		rb_erase(cur, &map->pg_temp);
+	}
+
+	/* ignore the rest */
+	*p = end;
+	return map;
+
+bad:
+	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+	       epoch, (int)(*p - start), *p, start, end);
+	if (newcrush)
+		crush_destroy(newcrush);
+	return ERR_PTR(err);
+}
+
+
+
+
+/*
+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+				   u64 off, u64 *plen,
+				   u64 *bno,
+				   u64 *oxoff, u64 *oxlen)
+{
+	u32 osize = le32_to_cpu(layout->fl_object_size);
+	u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	u32 bl, stripeno, stripepos, objsetno;
+	u32 su_per_object;
+	u64 t;
+
+	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+	     osize, su);
+	su_per_object = osize / le32_to_cpu(layout->fl_stripe_unit);
+	dout("osize %u / su %u = su_per_object %u\n", osize, su,
+	     su_per_object);
+
+	BUG_ON((su & ~PAGE_MASK) != 0);
+	/* bl = *off / su; */
+	t = off;
+	do_div(t, su);
+	bl = t;
+	dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+	stripeno = bl / sc;
+	stripepos = bl % sc;
+	objsetno = stripeno / su_per_object;
+
+	*bno = objsetno * sc + stripepos;
+	dout("objset %u * sc %u = bno %u\n", objsetno, sc, (unsigned)*bno);
+	/* *oxoff = *off / layout->fl_stripe_unit; */
+	t = off;
+	*oxoff = do_div(t, su);
+	*oxlen = min_t(u64, *plen, su - *oxoff);
+	*plen = *oxlen;
+
+	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+			    const char *oid,
+			    struct ceph_file_layout *fl,
+			    struct ceph_osdmap *osdmap)
+{
+	unsigned num, num_mask;
+	union ceph_pg pgid;
+	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+	int poolid = le32_to_cpu(fl->fl_pg_pool);
+	struct ceph_pg_pool_info *pool;
+
+	if (poolid >= osdmap->num_pools)
+		return -EIO;
+	pool = &osdmap->pg_pool[poolid];
+
+	if (preferred >= 0) {
+		num = le32_to_cpu(pool->v.lpg_num);
+		num_mask = pool->lpg_num_mask;
+	} else {
+		num = le32_to_cpu(pool->v.pg_num);
+		num_mask = pool->pg_num_mask;
+	}
+
+	pgid.pg64 = 0;   /* start with it zeroed out */
+	pgid.pg.ps = ceph_full_name_hash(oid, strlen(oid));
+	pgid.pg.preferred = preferred;
+	pgid.pg.pool = le32_to_cpu(fl->fl_pg_pool);
+	if (preferred >= 0)
+		dout("calc_object_layout '%s' pgid %d.%xp%d (%llx)\n", oid,
+		     pgid.pg.pool, pgid.pg.ps, (int)preferred, pgid.pg64);
+	else
+		dout("calc_object_layout '%s' pgid %d.%x (%llx)\n", oid,
+		     pgid.pg.pool, pgid.pg.ps, pgid.pg64);
+
+	ol->ol_pgid = cpu_to_le64(pgid.pg64);
+	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+
+	return 0;
+}
+
+/*
+ * Calculate raw osd vector for the given pgid.  Return pointer to osd
+ * array, or NULL on failure.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, union ceph_pg pgid,
+			int *osds, int *num)
+{
+	struct rb_node *n = osdmap->pg_temp.rb_node;
+	struct ceph_pg_mapping *pg;
+	struct ceph_pg_pool_info *pool;
+	int ruleno;
+	unsigned pps; /* placement ps */
+
+	/* pg_temp? */
+	while (n) {
+		pg = rb_entry(n, struct ceph_pg_mapping, node);
+		if (pgid.pg64 < pg->pgid)
+			n = n->rb_left;
+		else if (pgid.pg64 > pg->pgid)
+			n = n->rb_right;
+		else {
+			*num = pg->len;
+			return pg->osds;
+		}
+	}
+
+	/* crush */
+	if (pgid.pg.pool >= osdmap->num_pools)
+		return NULL;
+	pool = &osdmap->pg_pool[pgid.pg.pool];
+	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+				 pool->v.type, pool->v.size);
+	if (ruleno < 0) {
+		pr_err("no crush rule pool %d type %d size %d\n",
+		       pgid.pg.pool, pool->v.type, pool->v.size);
+		return NULL;
+	}
+
+	if (pgid.pg.preferred >= 0)
+		pps = ceph_stable_mod(pgid.pg.ps,
+				      le32_to_cpu(pool->v.lpgp_num),
+				      pool->lpgp_num_mask);
+	else
+		pps = ceph_stable_mod(pgid.pg.ps,
+				      le32_to_cpu(pool->v.pgp_num),
+				      pool->pgp_num_mask);
+	pps += pgid.pg.pool;
+	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+			     min_t(int, pool->v.size, *num),
+			     pgid.pg.preferred, osdmap->osd_weight);
+	return osds;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid)
+{
+	int rawosds[10], *osds;
+	int i, num = ARRAY_SIZE(rawosds);
+
+	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+	if (!osds)
+		return -1;
+
+	/* primary is first up osd */
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i])) {
+			return osds[i];
+			break;
+		}
+	return -1;
+}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..07127c6fb134
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,123 @@
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include "types.h"
+#include "ceph_fs.h"
+#include "crush/crush.h"
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds.  That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg_pool_info {
+	struct ceph_pg_pool v;
+	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+};
+
+struct ceph_pg_mapping {
+	struct rb_node node;
+	u64 pgid;
+	int len;
+	int osds[];
+};
+
+struct ceph_osdmap {
+	struct ceph_fsid fsid;
+	u32 epoch;
+	u32 mkfs_epoch;
+	struct ceph_timespec created, modified;
+
+	u32 flags;         /* CEPH_OSDMAP_* */
+
+	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
+	u8 *osd_state;     /* CEPH_OSD_* */
+	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
+	struct ceph_entity_addr *osd_addr;
+
+	struct rb_root pg_temp;
+
+	u32 num_pools;
+	struct ceph_pg_pool_info *pg_pool;
+
+	/* the CRUSH map specifies the mapping of placement groups to
+	 * the list of osds that store+replicate them. */
+	struct crush_map *crush;
+};
+
+/*
+ * file layout helpers
+ */
+#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
+#define ceph_file_layout_stripe_count(l) \
+	((__s32)le32_to_cpu((l).fl_stripe_count))
+#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
+#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
+#define ceph_file_layout_object_su(l) \
+	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
+#define ceph_file_layout_pg_preferred(l) \
+	((__s32)le32_to_cpu((l).fl_pg_preferred))
+#define ceph_file_layout_pg_pool(l) \
+	((__s32)le32_to_cpu((l).fl_pg_pool))
+
+static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
+{
+	return le32_to_cpu(l->fl_stripe_unit) *
+		le32_to_cpu(l->fl_stripe_count);
+}
+
+/* "period" == bytes before i start on a new set of objects */
+static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+{
+	return le32_to_cpu(l->fl_object_size) *
+		le32_to_cpu(l->fl_stripe_count);
+}
+
+
+static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+	return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+{
+	return map && (map->flags & flag);
+}
+
+extern char *ceph_osdmap_state_str(char *str, int len, int state);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+						     int osd)
+{
+	if (osd >= map->max_osd)
+		return NULL;
+	return &map->osd_addr[osd];
+}
+
+extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
+extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+					    struct ceph_osdmap *map,
+					    struct ceph_messenger *msgr);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+/* calculate mapping of a file extent to an object */
+extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+					  u64 off, u64 *plen,
+					  u64 *bno, u64 *oxoff, u64 *oxlen);
+
+/* calculate mapping of object to a placement group */
+extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+				   const char *oid,
+				   struct ceph_file_layout *fl,
+				   struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid);
+
+#endif
-- 
cgit v1.2.2


From 5ecc0a0f8128b1876e8614638deaed49cc8b174c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:11 -0700
Subject: ceph: CRUSH mapping algorithm

CRUSH is a pseudorandom data distribution function designed to map
inputs onto a dynamic hierarchy of devices, while minimizing the
extent to which inputs are remapped when the devices are added or
removed.  It includes some features that are specifically useful for
storage, most notably the ability to map each input onto a set of N
devices that are separated across administrator-defined failure
domains.  CRUSH is used to distribute data across the cluster of Ceph
storage nodes.

More information about CRUSH can be found in this paper:

    http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/crush/crush.c  | 140 ++++++++++++
 fs/ceph/crush/crush.h  | 188 ++++++++++++++++
 fs/ceph/crush/hash.h   |  90 ++++++++
 fs/ceph/crush/mapper.c | 589 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/crush/mapper.h |  20 ++
 5 files changed, 1027 insertions(+)
 create mode 100644 fs/ceph/crush/crush.c
 create mode 100644 fs/ceph/crush/crush.h
 create mode 100644 fs/ceph/crush/hash.h
 create mode 100644 fs/ceph/crush/mapper.c
 create mode 100644 fs/ceph/crush/mapper.h

(limited to 'fs')

diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..13755cdc4fb3
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,140 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include "crush.h"
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+	if (p >= b->size)
+		return 0;
+
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		return ((struct crush_bucket_uniform *)b)->item_weight;
+	case CRUSH_BUCKET_LIST:
+		return ((struct crush_bucket_list *)b)->item_weights[p];
+	case CRUSH_BUCKET_TREE:
+		if (p & 1)
+			return ((struct crush_bucket_tree *)b)->node_weights[p];
+		return 0;
+	case CRUSH_BUCKET_STRAW:
+		return ((struct crush_bucket_straw *)b)->item_weights[p];
+	}
+	return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+	int i, b, c;
+
+	for (b = 0; b < map->max_buckets; b++) {
+		if (map->buckets[b] == NULL)
+			continue;
+		for (i = 0; i < map->buckets[b]->size; i++) {
+			c = map->buckets[b]->items[i];
+			BUG_ON(c >= map->max_devices ||
+			       c < -map->max_buckets);
+			if (c >= 0)
+				map->device_parents[c] = map->buckets[b]->id;
+			else
+				map->bucket_parents[-1-c] = map->buckets[b]->id;
+		}
+	}
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+	kfree(b->item_weights);
+	kfree(b->sum_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+	kfree(b->node_weights);
+	kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+	kfree(b->straws);
+	kfree(b->item_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+	switch (b->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+		break;
+	case CRUSH_BUCKET_LIST:
+		crush_destroy_bucket_list((struct crush_bucket_list *)b);
+		break;
+	case CRUSH_BUCKET_TREE:
+		crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+		break;
+	case CRUSH_BUCKET_STRAW:
+		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+		break;
+	}
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+	int b;
+
+	/* buckets */
+	if (map->buckets) {
+		for (b = 0; b < map->max_buckets; b++) {
+			if (map->buckets[b] == NULL)
+				continue;
+			crush_destroy_bucket(map->buckets[b]);
+		}
+		kfree(map->buckets);
+	}
+
+	/* rules */
+	if (map->rules) {
+		for (b = 0; b < map->max_rules; b++)
+			kfree(map->rules[b]);
+		kfree(map->rules);
+	}
+
+	kfree(map->bucket_parents);
+	kfree(map->device_parents);
+	kfree(map);
+}
+
+
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..9ac7e091126f
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,188 @@
+#ifndef _CRUSH_CRUSH_H
+#define _CRUSH_CRUSH_H
+
+#include <linux/types.h>
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ *     http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL2
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
+
+
+#define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_SET   10  /* max size of a mapping result */
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices.  A rule consists of sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+	__u32 op;
+	__s32 arg1;
+	__s32 arg2;
+};
+
+/* step op codes */
+enum {
+	CRUSH_RULE_NOOP = 0,
+	CRUSH_RULE_TAKE = 1,          /* arg1 = value to start with */
+	CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+				      /* arg2 = type */
+	CRUSH_RULE_CHOOSE_INDEP = 3,  /* same */
+	CRUSH_RULE_EMIT = 4,          /* no args */
+	CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
+	CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N            0
+#define CRUSH_CHOOSE_N_MINUS(x)   (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+	__u8 ruleset;
+	__u8 type;
+	__u8 min_size;
+	__u8 max_size;
+};
+
+struct crush_rule {
+	__u32 len;
+	struct crush_rule_mask mask;
+	struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+			      (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets).  Items within a bucket are chosen using one of a
+ * few different algorithms.  The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ *  Bucket Alg     Speed       Additions    Removals
+ *  ------------------------------------------------
+ *  uniform         O(1)       poor         poor
+ *  list            O(n)       optimal      poor
+ *  tree            O(log n)   good         good
+ *  straw           O(n)       optimal      optimal
+ */
+enum {
+	CRUSH_BUCKET_UNIFORM = 1,
+	CRUSH_BUCKET_LIST = 2,
+	CRUSH_BUCKET_TREE = 3,
+	CRUSH_BUCKET_STRAW = 4
+};
+static inline const char *crush_bucket_alg_name(int alg)
+{
+	switch (alg) {
+	case CRUSH_BUCKET_UNIFORM: return "uniform";
+	case CRUSH_BUCKET_LIST: return "list";
+	case CRUSH_BUCKET_TREE: return "tree";
+	case CRUSH_BUCKET_STRAW: return "straw";
+	default: return "unknown";
+	}
+}
+
+struct crush_bucket {
+	__s32 id;        /* this'll be negative */
+	__u16 type;      /* non-zero; type=0 is reserved for devices */
+	__u16 alg;       /* one of CRUSH_BUCKET_* */
+	__u32 weight;    /* 16-bit fixed point */
+	__u32 size;      /* num items */
+	__s32 *items;
+
+	/*
+	 * cached random permutation: used for uniform bucket and for
+	 * the linear search fallback for the other bucket types.
+	 */
+	__u32 perm_x;  /* @x for which *perm is defined */
+	__u32 perm_n;  /* num elements of *perm that are permuted/defined */
+	__u32 *perm;
+};
+
+struct crush_bucket_uniform {
+	struct crush_bucket h;
+	__u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
+};
+
+struct crush_bucket_list {
+	struct crush_bucket h;
+	__u32 *item_weights;  /* 16-bit fixed point */
+	__u32 *sum_weights;   /* 16-bit fixed point.  element i is sum
+				 of weights 0..i, inclusive */
+};
+
+struct crush_bucket_tree {
+	struct crush_bucket h;  /* note: h.size is _tree_ size, not number of
+				   actual items */
+	__u8 num_nodes;
+	__u32 *node_weights;
+};
+
+struct crush_bucket_straw {
+	struct crush_bucket h;
+	__u32 *item_weights;   /* 16-bit fixed point */
+	__u32 *straws;         /* 16-bit fixed point */
+};
+
+
+
+/*
+ * CRUSH map includes all buckets, rules, etc.
+ */
+struct crush_map {
+	struct crush_bucket **buckets;
+	struct crush_rule **rules;
+
+	/*
+	 * Parent pointers to identify the parent bucket a device or
+	 * bucket in the hierarchy.  If an item appears more than
+	 * once, this is the _last_ time it appeared (where buckets
+	 * are processed in bucket id order, from -1 on down to
+	 * -max_buckets.
+	 */
+	__u32 *bucket_parents;
+	__u32 *device_parents;
+
+	__s32 max_buckets;
+	__u32 max_rules;
+	__s32 max_devices;
+};
+
+
+/* crush.c */
+extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy(struct crush_map *map);
+
+#endif
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..42f3312783a1
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,90 @@
+#ifndef _CRUSH_HASH_H
+#define _CRUSH_HASH_H
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do {			\
+		a = a-b;  a = a-c;  a = a^(c>>13);	\
+		b = b-c;  b = b-a;  b = b^(a<<8);	\
+		c = c-a;  c = c-b;  c = c^(b>>13);	\
+		a = a-b;  a = a-c;  a = a^(c>>12);	\
+		b = b-c;  b = b-a;  b = b^(a<<16);	\
+		c = c-a;  c = c-b;  c = c^(b>>5);	\
+		a = a-b;  a = a-c;  a = a^(c>>3);	\
+		b = b-c;  b = b-a;  b = b^(a<<10);	\
+		c = c-a;  c = c-b;  c = c^(b>>15);	\
+	} while (0)
+
+#define crush_hash_seed 1315423911
+
+static inline __u32 crush_hash32(__u32 a)
+{
+	__u32 hash = crush_hash_seed ^ a;
+	__u32 b = a;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, a, hash);
+	return hash;
+}
+
+static inline __u32 crush_hash32_2(__u32 a, __u32 b)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(x, a, hash);
+	crush_hashmix(b, y, hash);
+	return hash;
+}
+
+static inline __u32 crush_hash32_3(__u32 a, __u32 b, __u32 c)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	return hash;
+}
+
+static inline __u32 crush_hash32_4(__u32 a, __u32 b, __u32 c,
+				   __u32 d)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(a, x, hash);
+	crush_hashmix(y, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, d, hash);
+	return hash;
+}
+
+static inline __u32 crush_hash32_5(__u32 a, __u32 b, __u32 c,
+				   __u32 d, __u32 e)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(e, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	crush_hashmix(d, x, hash);
+	crush_hashmix(y, e, hash);
+	return hash;
+}
+
+#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..0f0730c62695
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,589 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+#  define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include "crush.h"
+#include "hash.h"
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+	int i;
+
+	for (i = 0; i < map->max_rules; i++) {
+		if (map->rules[i] &&
+		    map->rules[i]->mask.ruleset == ruleset &&
+		    map->rules[i]->mask.type == type &&
+		    map->rules[i]->mask.min_size <= size &&
+		    map->rules[i]->mask.max_size >= size)
+			return i;
+	}
+	return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors.  Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+			      int x, int r)
+{
+	unsigned pr = r % bucket->size;
+	unsigned i, s;
+
+	/* start a new permutation if @x has changed */
+	if (bucket->perm_x != x || bucket->perm_n == 0) {
+		dprintk("bucket %d new x=%d\n", bucket->id, x);
+		bucket->perm_x = x;
+
+		/* optimize common r=0 case */
+		if (pr == 0) {
+			s = crush_hash32_3(x, bucket->id, 0) %
+				bucket->size;
+			bucket->perm[0] = s;
+			bucket->perm_n = 0xffff;   /* magic value, see below */
+			goto out;
+		}
+
+		for (i = 0; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm_n = 0;
+	} else if (bucket->perm_n == 0xffff) {
+		/* clean up after the r=0 case above */
+		for (i = 1; i < bucket->size; i++)
+			bucket->perm[i] = i;
+		bucket->perm[bucket->perm[0]] = 0;
+		bucket->perm_n = 1;
+	}
+
+	/* calculate permutation up to pr */
+	for (i = 0; i < bucket->perm_n; i++)
+		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+	while (bucket->perm_n <= pr) {
+		unsigned p = bucket->perm_n;
+		/* no point in swapping the final entry */
+		if (p < bucket->size - 1) {
+			i = crush_hash32_3(x, bucket->id, p) %
+				(bucket->size - p);
+			if (i) {
+				unsigned t = bucket->perm[p + i];
+				bucket->perm[p + i] = bucket->perm[p];
+				bucket->perm[p] = t;
+			}
+			dprintk(" perm_choose swap %d with %d\n", p, p+i);
+		}
+		bucket->perm_n++;
+	}
+	for (i = 0; i < bucket->size; i++)
+		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);
+
+	s = bucket->perm[pr];
+out:
+	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+		bucket->size, x, r, pr, s);
+	return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+				 int x, int r)
+{
+	return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+			      int x, int r)
+{
+	int i;
+
+	for (i = bucket->h.size-1; i >= 0; i--) {
+		__u64 w = crush_hash32_4(x, bucket->h.items[i], r,
+					 bucket->h.id);
+		w &= 0xffff;
+		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+			"sw %x rand %llx",
+			i, x, r, bucket->h.items[i], bucket->item_weights[i],
+			bucket->sum_weights[i], w);
+		w *= bucket->sum_weights[i];
+		w = w >> 16;
+		/*dprintk(" scaled %llx\n", w);*/
+		if (w < bucket->item_weights[i])
+			return bucket->h.items[i];
+	}
+
+	BUG_ON(1);
+	return 0;
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+	int h = 0;
+	while ((n & 1) == 0) {
+		h++;
+		n = n >> 1;
+	}
+	return h;
+}
+
+static int left(int x)
+{
+	int h = height(x);
+	return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+	int h = height(x);
+	return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+	return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+			      int x, int r)
+{
+	int n, l;
+	__u32 w;
+	__u64 t;
+
+	/* start at root */
+	n = bucket->num_nodes >> 1;
+
+	while (!terminal(n)) {
+		/* pick point in [0, w) */
+		w = bucket->node_weights[n];
+		t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
+		t = t >> 32;
+
+		/* descend to the left or right? */
+		l = left(n);
+		if (t < bucket->node_weights[l])
+			n = l;
+		else
+			n = right(n);
+	}
+
+	return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+			       int x, int r)
+{
+	int i;
+	int high = 0;
+	__u64 high_draw = 0;
+	__u64 draw;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		draw = crush_hash32_3(x, bucket->h.items[i], r);
+		draw &= 0xffff;
+		draw *= bucket->straws[i];
+		if (i == 0 || draw > high_draw) {
+			high = i;
+			high_draw = draw;
+		}
+	}
+	return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+	dprintk("choose %d x=%d r=%d\n", in->id, x, r);
+	switch (in->alg) {
+	case CRUSH_BUCKET_UNIFORM:
+		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+					  x, r);
+	case CRUSH_BUCKET_LIST:
+		return bucket_list_choose((struct crush_bucket_list *)in,
+					  x, r);
+	case CRUSH_BUCKET_TREE:
+		return bucket_tree_choose((struct crush_bucket_tree *)in,
+					  x, r);
+	case CRUSH_BUCKET_STRAW:
+		return bucket_straw_choose((struct crush_bucket_straw *)in,
+					   x, r);
+	default:
+		BUG_ON(1);
+/* 		return in->items[0] */;
+	}
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+	if (weight[item] >= 0x1000)
+		return 0;
+	if (weight[item] == 0)
+		return 1;
+	if ((crush_hash32_2(x, item) & 0xffff) < weight[item])
+		return 0;
+	return 1;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+			struct crush_bucket *bucket,
+			__u32 *weight,
+			int x, int numrep, int type,
+			int *out, int outpos,
+			int firstn, int recurse_to_leaf,
+			int *out2)
+{
+	int rep;
+	int ftotal, flocal;
+	int retry_descent, retry_bucket, skip_rep;
+	struct crush_bucket *in = bucket;
+	int r;
+	int i;
+	int item;
+	int itemtype;
+	int collide, reject;
+	const int orig_tries = 5; /* attempts before we fall back to search */
+	dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
+
+	for (rep = outpos; rep < numrep; rep++) {
+		/* keep trying until we get a non-out, non-colliding item */
+		ftotal = 0;
+		skip_rep = 0;
+		do {
+			retry_descent = 0;
+			in = bucket;               /* initial bucket */
+
+			/* choose through intervening buckets */
+			flocal = 0;
+			do {
+				retry_bucket = 0;
+				r = rep;
+				if (in->alg == CRUSH_BUCKET_UNIFORM) {
+					/* be careful */
+					if (firstn || numrep >= in->size)
+						/* r' = r + f_total */
+						r += ftotal;
+					else if (in->size % numrep == 0)
+						/* r'=r+(n+1)*f_local */
+						r += (numrep+1) *
+							(flocal+ftotal);
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				} else {
+					if (firstn)
+						/* r' = r + f_total */
+						r += ftotal;
+					else
+						/* r' = r + n*f_local */
+						r += numrep * (flocal+ftotal);
+				}
+
+				/* bucket choose */
+				if (flocal >= (in->size>>1) &&
+				    flocal > orig_tries)
+					item = bucket_perm_choose(in, x, r);
+				else
+					item = crush_bucket_choose(in, x, r);
+				BUG_ON(item >= map->max_devices);
+
+				/* desired type? */
+				if (item < 0)
+					itemtype = map->buckets[-1-item]->type;
+				else
+					itemtype = 0;
+				dprintk("  item %d type %d\n", item, itemtype);
+
+				/* keep going? */
+				if (itemtype != type) {
+					BUG_ON(item >= 0 ||
+					       (-1-item) >= map->max_buckets);
+					in = map->buckets[-1-item];
+					continue;
+				}
+
+				/* collision? */
+				collide = 0;
+				for (i = 0; i < outpos; i++) {
+					if (out[i] == item) {
+						collide = 1;
+						break;
+					}
+				}
+
+				if (recurse_to_leaf &&
+				    item < 0 &&
+				    crush_choose(map, map->buckets[-1-item],
+						 weight,
+						 x, outpos+1, 0,
+						 out2, outpos,
+						 firstn, 0, NULL) <= outpos) {
+					reject = 1;
+				} else {
+					/* out? */
+					if (itemtype == 0)
+						reject = is_out(map, weight,
+								item, x);
+					else
+						reject = 0;
+				}
+
+				if (reject || collide) {
+					ftotal++;
+					flocal++;
+
+					if (collide && flocal < 3)
+						/* retry locally a few times */
+						retry_bucket = 1;
+					else if (flocal < in->size + orig_tries)
+						/* exhaustive bucket search */
+						retry_bucket = 1;
+					else if (ftotal < 20)
+						/* then retry descent */
+						retry_descent = 1;
+					else
+						/* else give up */
+						skip_rep = 1;
+					dprintk("  reject %d  collide %d  "
+						"ftotal %d  flocal %d\n",
+						reject, collide, ftotal,
+						flocal);
+				}
+			} while (retry_bucket);
+		} while (retry_descent);
+
+		if (skip_rep) {
+			dprintk("skip rep\n");
+			continue;
+		}
+
+		dprintk("choose got %d\n", item);
+		out[outpos] = item;
+		outpos++;
+	}
+
+	dprintk("choose returns %d\n", outpos);
+	return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ */
+int crush_do_rule(struct crush_map *map,
+		  int ruleno, int x, int *result, int result_max,
+		  int force, __u32 *weight)
+{
+	int result_len;
+	int force_context[CRUSH_MAX_DEPTH];
+	int force_pos = -1;
+	int a[CRUSH_MAX_SET];
+	int b[CRUSH_MAX_SET];
+	int c[CRUSH_MAX_SET];
+	int recurse_to_leaf;
+	int *w;
+	int wsize = 0;
+	int *o;
+	int osize;
+	int *tmp;
+	struct crush_rule *rule;
+	int step;
+	int i, j;
+	int numrep;
+	int firstn;
+	int rc = -1;
+
+	BUG_ON(ruleno >= map->max_rules);
+
+	rule = map->rules[ruleno];
+	result_len = 0;
+	w = a;
+	o = b;
+
+	/*
+	 * determine hierarchical context of force, if any.  note
+	 * that this may or may not correspond to the specific types
+	 * referenced by the crush rule.
+	 */
+	if (force >= 0) {
+		if (force >= map->max_devices ||
+		    map->device_parents[force] == 0) {
+			/*dprintk("CRUSH: forcefed device dne\n");*/
+			rc = -1;  /* force fed device dne */
+			goto out;
+		}
+		if (!is_out(map, weight, force, x)) {
+			while (1) {
+				force_context[++force_pos] = force;
+				if (force >= 0)
+					force = map->device_parents[force];
+				else
+					force = map->bucket_parents[-1-force];
+				if (force == 0)
+					break;
+			}
+		}
+	}
+
+	for (step = 0; step < rule->len; step++) {
+		firstn = 0;
+		switch (rule->steps[step].op) {
+		case CRUSH_RULE_TAKE:
+			w[0] = rule->steps[step].arg1;
+			if (force_pos >= 0) {
+				BUG_ON(force_context[force_pos] != w[0]);
+				force_pos--;
+			}
+			wsize = 1;
+			break;
+
+		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+		case CRUSH_RULE_CHOOSE_FIRSTN:
+			firstn = 1;
+		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+		case CRUSH_RULE_CHOOSE_INDEP:
+			BUG_ON(wsize == 0);
+
+			recurse_to_leaf =
+				rule->steps[step].op ==
+				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+				rule->steps[step].op ==
+				CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+			/* reset output */
+			osize = 0;
+
+			for (i = 0; i < wsize; i++) {
+				/*
+				 * see CRUSH_N, CRUSH_N_MINUS macros.
+				 * basically, numrep <= 0 means relative to
+				 * the provided result_max
+				 */
+				numrep = rule->steps[step].arg1;
+				if (numrep <= 0) {
+					numrep += result_max;
+					if (numrep <= 0)
+						continue;
+				}
+				j = 0;
+				if (osize == 0 && force_pos >= 0) {
+					/* skip any intermediate types */
+					while (force_pos &&
+					       force_context[force_pos] < 0 &&
+					       rule->steps[step].arg2 !=
+					       map->buckets[-1 -
+					       force_context[force_pos]]->type)
+						force_pos--;
+					o[osize] = force_context[force_pos];
+					if (recurse_to_leaf)
+						c[osize] = force_context[0];
+					j++;
+					force_pos--;
+				}
+				osize += crush_choose(map,
+						      map->buckets[-1-w[i]],
+						      weight,
+						      x, numrep,
+						      rule->steps[step].arg2,
+						      o+osize, j,
+						      firstn,
+						      recurse_to_leaf, c+osize);
+			}
+
+			if (recurse_to_leaf)
+				/* copy final _leaf_ values to output set */
+				memcpy(o, c, osize*sizeof(*o));
+
+			/* swap t and w arrays */
+			tmp = o;
+			o = w;
+			w = tmp;
+			wsize = osize;
+			break;
+
+
+		case CRUSH_RULE_EMIT:
+			for (i = 0; i < wsize && result_len < result_max; i++) {
+				result[result_len] = w[i];
+				result_len++;
+			}
+			wsize = 0;
+			break;
+
+		default:
+			BUG_ON(1);
+		}
+	}
+	rc = result_len;
+
+out:
+	return rc;
+}
+
+
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
+#ifndef _CRUSH_MAPPER_H
+#define _CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for find rules and then mapping an input to an
+ * output set.
+ *
+ * LGPL2
+ */
+
+#include "crush.h"
+
+extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
+extern int crush_do_rule(struct crush_map *map,
+			 int ruleno,
+			 int x, int *result, int result_max,
+			 int forcefeed,    /* -1 for none */
+			 __u32 *weights);
+
+#endif
-- 
cgit v1.2.2


From ba75bb98cfb93b62c54af25bf67ff90857264bbe Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:11 -0700
Subject: ceph: monitor client

The monitor cluster is responsible for managing cluster membership
and state.  The monitor client handles what minimal interaction
the Ceph client has with it: checking for updated versions of the
MDS and OSD maps, getting statfs() information, and unmounting.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 694 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/mon_client.h | 109 ++++++++
 2 files changed, 803 insertions(+)
 create mode 100644 fs/ceph/mon_client.c
 create mode 100644 fs/ceph/mon_client.h

(limited to 'fs')

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..b0c95cec5df8
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,694 @@
+#include "ceph_debug.h"
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include "mon_client.h"
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Interact with Ceph monitor cluster.  Handle requests for new map
+ * versions, and periodically resend as needed.  Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information.  An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates.  We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure.  If the connection does break, we
+ * randomly hunt for a new monitor.  Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+const static struct ceph_connection_operations mon_con_ops;
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+	struct ceph_monmap *m = NULL;
+	int i, err = -EINVAL;
+	struct ceph_fsid fsid;
+	u32 epoch, num_mon;
+	u16 version;
+
+	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+	ceph_decode_16_safe(&p, end, version, bad);
+
+	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+	ceph_decode_copy(&p, &fsid, sizeof(fsid));
+	ceph_decode_32(&p, epoch);
+
+	ceph_decode_32(&p, num_mon);
+	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+	if (num_mon >= CEPH_MAX_MON)
+		goto bad;
+	m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+	if (m == NULL)
+		return ERR_PTR(-ENOMEM);
+	m->fsid = fsid;
+	m->epoch = epoch;
+	m->num_mon = num_mon;
+	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+
+	if (p != end)
+		goto bad;
+
+	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+	     m->num_mon);
+	for (i = 0; i < m->num_mon; i++)
+		dout("monmap_decode  mon%d is %s\n", i,
+		     pr_addr(&m->mon_inst[i].addr.in_addr));
+	return m;
+
+bad:
+	dout("monmap_decode failed with %d\n", err);
+	kfree(m);
+	return ERR_PTR(err);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+	int i;
+
+	for (i = 0; i < m->num_mon; i++)
+		if (ceph_entity_addr_equal(addr, &m->mon_inst[i].addr))
+			return 1;
+	return 0;
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+	if (monc->con) {
+		dout("__close_session closing mon%d\n", monc->cur_mon);
+		ceph_con_close(monc->con);
+		monc->cur_mon = -1;
+	}
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+	char r;
+
+	if (monc->cur_mon < 0) {
+		get_random_bytes(&r, 1);
+		monc->cur_mon = r % monc->monmap->num_mon;
+		dout("open_session num=%d r=%d -> mon%d\n",
+		     monc->monmap->num_mon, r, monc->cur_mon);
+		monc->sub_sent = 0;
+		monc->sub_renew_after = jiffies;  /* i.e., expired */
+		monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+		dout("open_session mon%d opening\n", monc->cur_mon);
+		monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+		ceph_con_open(monc->con,
+			      &monc->monmap->mon_inst[monc->cur_mon].addr);
+	} else {
+		dout("open_session mon%d already open\n", monc->cur_mon);
+	}
+	return 0;
+}
+
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+	return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+	unsigned delay;
+
+	if (monc->cur_mon < 0 || monc->want_mount || __sub_expired(monc))
+		delay = 10 * HZ;
+	else
+		delay = 20 * HZ;
+	dout("__schedule_delayed after %u\n", delay);
+	schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+	     (unsigned)monc->sub_sent, __sub_expired(monc),
+	     monc->want_next_osdmap);
+	if ((__sub_expired(monc) && !monc->sub_sent) ||
+	    monc->want_next_osdmap == 1) {
+		struct ceph_msg *msg;
+		struct ceph_mon_subscribe_item *i;
+		void *p, *end;
+
+		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 64, 0, 0, NULL);
+		if (!msg)
+			return;
+
+		p = msg->front.iov_base;
+		end = p + msg->front.iov_len;
+
+		dout("__send_subscribe to 'mdsmap' %u+\n",
+		     (unsigned)monc->have_mdsmap);
+		if (monc->want_next_osdmap) {
+			dout("__send_subscribe to 'osdmap' %u\n",
+			     (unsigned)monc->have_osdmap);
+			ceph_encode_32(&p, 2);
+			ceph_encode_string(&p, end, "osdmap", 6);
+			i = p;
+			i->have = cpu_to_le64(monc->have_osdmap);
+			i->onetime = 1;
+			p += sizeof(*i);
+			monc->want_next_osdmap = 2;  /* requested */
+		} else {
+			ceph_encode_32(&p, 1);
+		}
+		ceph_encode_string(&p, end, "mdsmap", 6);
+		i = p;
+		i->have = cpu_to_le64(monc->have_mdsmap);
+		i->onetime = 0;
+		p += sizeof(*i);
+
+		msg->front.iov_len = p - msg->front.iov_base;
+		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+		ceph_con_send(monc->con, msg);
+
+		monc->sub_sent = jiffies | 1;  /* never 0 */
+	}
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	unsigned seconds;
+	void *p = msg->front.iov_base;
+	void *end = p + msg->front.iov_len;
+
+	ceph_decode_32_safe(&p, end, seconds, bad);
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		pr_info("mon%d %s session established\n",
+			monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
+		monc->hunting = false;
+	}
+	dout("handle_subscribe_ack after %d seconds\n", seconds);
+	monc->sub_renew_after = monc->sub_sent + seconds*HZ - 1;
+	monc->sub_sent = 0;
+	mutex_unlock(&monc->mutex);
+	return;
+bad:
+	pr_err("got corrupt subscribe-ack msg\n");
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_mdsmap = got;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+	mutex_lock(&monc->mutex);
+	monc->have_osdmap = got;
+	monc->want_next_osdmap = 0;
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+	dout("request_next_osdmap have %u\n", monc->have_osdmap);
+	mutex_lock(&monc->mutex);
+	if (!monc->want_next_osdmap)
+		monc->want_next_osdmap = 1;
+	if (monc->want_next_osdmap < 2)
+		__send_subscribe(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+
+/*
+ * mount
+ */
+static void __request_mount(struct ceph_mon_client *monc)
+{
+	struct ceph_msg *msg;
+	struct ceph_client_mount *h;
+	int err;
+
+	dout("__request_mount\n");
+	err = __open_session(monc);
+	if (err)
+		return;
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_MOUNT, sizeof(*h), 0, 0, NULL);
+	if (IS_ERR(msg))
+		return;
+	h = msg->front.iov_base;
+	h->have_version = 0;
+	ceph_con_send(monc->con, msg);
+}
+
+int ceph_monc_request_mount(struct ceph_mon_client *monc)
+{
+	if (!monc->con) {
+		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+		if (!monc->con)
+			return -ENOMEM;
+		ceph_con_init(monc->client->msgr, monc->con);
+		monc->con->private = monc;
+		monc->con->ops = &mon_con_ops;
+	}
+
+	mutex_lock(&monc->mutex);
+	__request_mount(monc);
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+/*
+ * The monitor responds with mount ack indicate mount success.  The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void handle_mount_ack(struct ceph_mon_client *monc, struct ceph_msg *msg)
+{
+	struct ceph_client *client = monc->client;
+	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+	void *p, *end;
+	s32 result;
+	u32 len;
+	s64 cnum;
+	int err = -EINVAL;
+
+	if (client->whoami >= 0) {
+		dout("handle_mount_ack - already mounted\n");
+		return;
+	}
+
+	mutex_lock(&monc->mutex);
+
+	dout("handle_mount_ack\n");
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	ceph_decode_64_safe(&p, end, cnum, bad);
+	ceph_decode_32_safe(&p, end, result, bad);
+	ceph_decode_32_safe(&p, end, len, bad);
+	if (result) {
+		pr_err("mount denied: %.*s (%d)\n", len, (char *)p,
+		       result);
+		err = result;
+		goto out;
+	}
+	p += len;
+
+	ceph_decode_32_safe(&p, end, len, bad);
+	ceph_decode_need(&p, end, len, bad);
+	monmap = ceph_monmap_decode(p, p + len);
+	if (IS_ERR(monmap)) {
+		pr_err("problem decoding monmap, %d\n",
+		       (int)PTR_ERR(monmap));
+		err = -EINVAL;
+		goto out;
+	}
+	p += len;
+
+	client->monc.monmap = monmap;
+	kfree(old);
+
+	client->signed_ticket = NULL;
+	client->signed_ticket_len = 0;
+
+	monc->want_mount = false;
+
+	client->whoami = cnum;
+	client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+	client->msgr->inst.name.num = cpu_to_le64(cnum);
+	pr_info("client%lld fsid " FSID_FORMAT "\n",
+		client->whoami, PR_FSID(&client->monc.monmap->fsid));
+
+	ceph_debugfs_client_init(client);
+	__send_subscribe(monc);
+
+	err = 0;
+	goto out;
+
+bad:
+	pr_err("error decoding mount_ack message\n");
+out:
+	client->mount_err = err;
+	mutex_unlock(&monc->mutex);
+	wake_up(&client->mount_wq);
+}
+
+
+
+
+/*
+ * statfs
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_statfs_request *req;
+	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+	u64 tid;
+
+	if (msg->front.iov_len != sizeof(*reply))
+		goto bad;
+	tid = le64_to_cpu(reply->tid);
+	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = radix_tree_lookup(&monc->statfs_request_tree, tid);
+	if (req) {
+		*req->buf = reply->st;
+		req->result = 0;
+	}
+	mutex_unlock(&monc->mutex);
+	if (req)
+		complete(&req->completion);
+	return;
+
+bad:
+	pr_err("corrupt statfs reply, no tid\n");
+}
+
+/*
+ * (re)send a statfs request
+ */
+static int send_statfs(struct ceph_mon_client *monc,
+		       struct ceph_mon_statfs_request *req)
+{
+	struct ceph_msg *msg;
+	struct ceph_mon_statfs *h;
+	int err;
+
+	dout("send_statfs tid %llu\n", req->tid);
+	err = __open_session(monc);
+	if (err)
+		return err;
+	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+	req->request = msg;
+	h = msg->front.iov_base;
+	h->have_version = 0;
+	h->fsid = monc->monmap->fsid;
+	h->tid = cpu_to_le64(req->tid);
+	ceph_con_send(monc->con, msg);
+	return 0;
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+	struct ceph_mon_statfs_request req;
+	int err;
+
+	req.buf = buf;
+	init_completion(&req.completion);
+
+	/* allocate memory for reply */
+	err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
+	if (err)
+		return err;
+
+	/* register request */
+	mutex_lock(&monc->mutex);
+	req.tid = ++monc->last_tid;
+	req.last_attempt = jiffies;
+	req.delay = BASE_DELAY_INTERVAL;
+	if (radix_tree_insert(&monc->statfs_request_tree, req.tid, &req) < 0) {
+		mutex_unlock(&monc->mutex);
+		pr_err("ENOMEM in do_statfs\n");
+		return -ENOMEM;
+	}
+	monc->num_statfs_requests++;
+	mutex_unlock(&monc->mutex);
+
+	/* send request and wait */
+	err = send_statfs(monc, &req);
+	if (!err)
+		err = wait_for_completion_interruptible(&req.completion);
+
+	mutex_lock(&monc->mutex);
+	radix_tree_delete(&monc->statfs_request_tree, req.tid);
+	monc->num_statfs_requests--;
+	ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
+	mutex_unlock(&monc->mutex);
+
+	if (!err)
+		err = req.result;
+	return err;
+}
+
+/*
+ * Resend pending statfs requests.
+ */
+static void __resend_statfs(struct ceph_mon_client *monc)
+{
+	u64 next_tid = 0;
+	int got;
+	int did = 0;
+	struct ceph_mon_statfs_request *req;
+
+	while (1) {
+		got = radix_tree_gang_lookup(&monc->statfs_request_tree,
+					     (void **)&req,
+					     next_tid, 1);
+		if (got == 0)
+			break;
+		did++;
+		next_tid = req->tid + 1;
+
+		send_statfs(monc, req);
+	}
+}
+
+/*
+ * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM).  And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+	struct ceph_mon_client *monc =
+		container_of(work, struct ceph_mon_client, delayed_work.work);
+
+	dout("monc delayed_work\n");
+	mutex_lock(&monc->mutex);
+	if (monc->want_mount) {
+		__request_mount(monc);
+	} else {
+		if (__sub_expired(monc)) {
+			__close_session(monc);
+			__open_session(monc);  /* continue hunting */
+		} else {
+			ceph_con_keepalive(monc->con);
+		}
+	}
+	__send_subscribe(monc);
+	__schedule_delayed(monc);
+	mutex_unlock(&monc->mutex);
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+	int err = 0;
+
+	dout("init\n");
+	memset(monc, 0, sizeof(*monc));
+	monc->client = cl;
+	monc->monmap = NULL;
+	mutex_init(&monc->mutex);
+
+	monc->con = NULL;
+
+	/* msg pools */
+	err = ceph_msgpool_init(&monc->msgpool_mount_ack, 4096, 1, false);
+	if (err < 0)
+		goto out;
+	err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 8, 1, false);
+	if (err < 0)
+		goto out;
+	err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
+				sizeof(struct ceph_mon_statfs_reply), 0, false);
+	if (err < 0)
+		goto out;
+
+	monc->cur_mon = -1;
+	monc->hunting = false;  /* not really */
+	monc->sub_renew_after = jiffies;
+	monc->sub_sent = 0;
+
+	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+	INIT_RADIX_TREE(&monc->statfs_request_tree, GFP_NOFS);
+	monc->num_statfs_requests = 0;
+	monc->last_tid = 0;
+
+	monc->have_mdsmap = 0;
+	monc->have_osdmap = 0;
+	monc->want_next_osdmap = 1;
+	monc->want_mount = true;
+out:
+	return err;
+}
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+	dout("stop\n");
+	cancel_delayed_work_sync(&monc->delayed_work);
+
+	mutex_lock(&monc->mutex);
+	__close_session(monc);
+	if (monc->con) {
+		monc->con->private = NULL;
+		monc->con->ops->put(monc->con);
+		monc->con = NULL;
+	}
+	mutex_unlock(&monc->mutex);
+
+	ceph_msgpool_destroy(&monc->msgpool_mount_ack);
+	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+
+	kfree(monc->monmap);
+}
+
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!monc)
+		return;
+
+	switch (type) {
+	case CEPH_MSG_CLIENT_MOUNT_ACK:
+		handle_mount_ack(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		handle_subscribe_ack(monc, msg);
+		break;
+
+	case CEPH_MSG_STATFS_REPLY:
+		handle_statfs_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MDS_MAP:
+		ceph_mdsc_handle_map(&monc->client->mdsc, msg);
+		break;
+
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(&monc->client->osdc, msg);
+		break;
+
+	default:
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+	ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+				      struct ceph_msg_header *hdr)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(hdr->type);
+
+	switch (type) {
+	case CEPH_MSG_CLIENT_MOUNT_ACK:
+		return ceph_msgpool_get(&monc->msgpool_mount_ack);
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		return ceph_msgpool_get(&monc->msgpool_subscribe_ack);
+	case CEPH_MSG_STATFS_REPLY:
+		return ceph_msgpool_get(&monc->msgpool_statfs_reply);
+	}
+	return ceph_alloc_msg(con, hdr);
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+	struct ceph_mon_client *monc = con->private;
+
+	if (!monc)
+		return;
+
+	dout("mon_fault\n");
+	mutex_lock(&monc->mutex);
+	if (!con->private)
+		goto out;
+
+	if (monc->con && !monc->hunting)
+		pr_info("mon%d %s session lost, "
+			"hunting for new mon\n", monc->cur_mon,
+			pr_addr(&monc->con->peer_addr.in_addr));
+
+	__close_session(monc);
+	if (!monc->hunting) {
+		/* start hunting */
+		monc->hunting = true;
+		if (__open_session(monc) == 0) {
+			__send_subscribe(monc);
+			__resend_statfs(monc);
+		}
+	} else {
+		/* already hunting, let's wait a bit */
+		__schedule_delayed(monc);
+	}
+out:
+	mutex_unlock(&monc->mutex);
+}
+
+const static struct ceph_connection_operations mon_con_ops = {
+	.get = ceph_con_get,
+	.put = ceph_con_put,
+	.dispatch = dispatch,
+	.fault = mon_fault,
+	.alloc_msg = mon_alloc_msg,
+	.alloc_middle = ceph_alloc_middle,
+};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..5258c5693b03
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,109 @@
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/radix-tree.h>
+
+#include "messenger.h"
+#include "msgpool.h"
+
+struct ceph_client;
+struct ceph_mount_args;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {
+	struct ceph_fsid fsid;
+	u32 epoch;
+	u32 num_mon;
+	struct ceph_entity_inst mon_inst[0];
+};
+
+struct ceph_mon_client;
+struct ceph_mon_statfs_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+					 int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+	struct ceph_mon_client *monc;
+	struct delayed_work delayed_work;
+	unsigned long delay;
+	ceph_monc_request_func_t do_request;
+};
+
+/*
+ * statfs() is done a bit differently because we need to get data back
+ * to the caller
+ */
+struct ceph_mon_statfs_request {
+	u64 tid;
+	int result;
+	struct ceph_statfs *buf;
+	struct completion completion;
+	unsigned long last_attempt, delay; /* jiffies */
+	struct ceph_msg *request;  /* original request */
+};
+
+struct ceph_mon_client {
+	struct ceph_client *client;
+	struct ceph_monmap *monmap;
+
+	struct mutex mutex;
+	struct delayed_work delayed_work;
+
+	bool hunting;
+	int cur_mon;                       /* last monitor i contacted */
+	unsigned long sub_sent, sub_renew_after;
+	struct ceph_connection *con;
+
+	/* msg pools */
+	struct ceph_msgpool msgpool_mount_ack;
+	struct ceph_msgpool msgpool_subscribe_ack;
+	struct ceph_msgpool msgpool_statfs_reply;
+
+	/* pending statfs requests */
+	struct radix_tree_root statfs_request_tree;
+	int num_statfs_requests;
+	u64 last_tid;
+
+	/* mds/osd map or mount requests */
+	bool want_mount;
+	int want_next_osdmap; /* 1 = want, 2 = want+asked */
+	u32 have_osdmap, have_mdsmap;
+
+	struct dentry *debugfs_file;
+};
+
+extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+				struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @want, and also call in when we receive a map.  We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
+extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+
+extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+
+extern int ceph_monc_request_mount(struct ceph_mon_client *monc);
+
+extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
+			       struct ceph_statfs *buf);
+
+
+
+#endif
-- 
cgit v1.2.2


From a8599bd821d084d04a3290fffae1071624ec00ea Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:12 -0700
Subject: ceph: capability management

The Ceph metadata servers control client access to inode metadata and
file data by issuing capabilities, granting clients permission to read
and/or write both inode field and file data to OSDs (storage nodes).
Each capability consists of a set of bits indicating which operations
are allowed.

If the client holds a *_SHARED cap, the client has a coherent value
that can be safely read from the cached inode.

In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the client
is allowed to change inode attributes (e.g., file size, mtime), note
its dirty state in the ceph_cap, and asynchronously flush that
metadata change to the MDS.

In the event of a conflicting operation (perhaps by another client),
the MDS will revoke the conflicting client capabilities.

In order for a client to cache an inode, it must hold a capability
with at least one MDS server.  When inodes are released, release
notifications are batched and periodically sent en masse to the MDS
cluster to release server state.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 2830 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 2830 insertions(+)
 create mode 100644 fs/ceph/caps.c

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..5c7d0e9bbb7b
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2830 @@
+#include "ceph_debug.h"
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+
+#include "super.h"
+#include "decode.h"
+#include "messenger.h"
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode field and file data to OSDs
+ * (storage nodes).  Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server.  When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
+static char *gcap_string(char *s, int c)
+{
+	if (c & CEPH_CAP_GSHARED)
+		*s++ = 's';
+	if (c & CEPH_CAP_GEXCL)
+		*s++ = 'x';
+	if (c & CEPH_CAP_GCACHE)
+		*s++ = 'c';
+	if (c & CEPH_CAP_GRD)
+		*s++ = 'r';
+	if (c & CEPH_CAP_GWR)
+		*s++ = 'w';
+	if (c & CEPH_CAP_GBUFFER)
+		*s++ = 'b';
+	if (c & CEPH_CAP_GLAZYIO)
+		*s++ = 'l';
+	return s;
+}
+
+const char *ceph_cap_string(int caps)
+{
+	int i;
+	char *s;
+	int c;
+
+	spin_lock(&cap_str_lock);
+	i = last_cap_str++;
+	if (last_cap_str == MAX_CAP_STR)
+		last_cap_str = 0;
+	spin_unlock(&cap_str_lock);
+
+	s = cap_str[i];
+
+	if (caps & CEPH_CAP_PIN)
+		*s++ = 'p';
+
+	c = (caps >> CEPH_CAP_SAUTH) & 3;
+	if (c) {
+		*s++ = 'A';
+		s = gcap_string(s, c);
+	}
+
+	c = (caps >> CEPH_CAP_SLINK) & 3;
+	if (c) {
+		*s++ = 'L';
+		s = gcap_string(s, c);
+	}
+
+	c = (caps >> CEPH_CAP_SXATTR) & 3;
+	if (c) {
+		*s++ = 'X';
+		s = gcap_string(s, c);
+	}
+
+	c = caps >> CEPH_CAP_SFILE;
+	if (c) {
+		*s++ = 'F';
+		s = gcap_string(s, c);
+	}
+
+	if (s == cap_str[i])
+		*s++ = '-';
+	*s = 0;
+	return cap_str[i];
+}
+
+/*
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_caps, referenced
+ * by struct ceph_caps_reservations.  This ensures that we preallocate
+ * memory needed to successfully process an MDS response.  (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
+ *
+ * Reservations are 'owned' by a ceph_cap_reservation context.
+ */
+static spinlock_t caps_list_lock;
+static struct list_head caps_list;  /* unused (reserved or unreserved) */
+static int caps_total_count;        /* total caps allocated */
+static int caps_use_count;          /* in use */
+static int caps_reserve_count;      /* unused, reserved */
+static int caps_avail_count;        /* unused, unreserved */
+
+void __init ceph_caps_init(void)
+{
+	INIT_LIST_HEAD(&caps_list);
+	spin_lock_init(&caps_list_lock);
+}
+
+void ceph_caps_finalize(void)
+{
+	struct ceph_cap *cap;
+
+	spin_lock(&caps_list_lock);
+	while (!list_empty(&caps_list)) {
+		cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+		list_del(&cap->caps_item);
+		kmem_cache_free(ceph_cap_cachep, cap);
+	}
+	caps_total_count = 0;
+	caps_avail_count = 0;
+	caps_use_count = 0;
+	caps_reserve_count = 0;
+	spin_unlock(&caps_list_lock);
+}
+
+int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+{
+	int i;
+	struct ceph_cap *cap;
+	int have;
+	int alloc = 0;
+	LIST_HEAD(newcaps);
+	int ret = 0;
+
+	dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+	/* first reserve any caps that are already allocated */
+	spin_lock(&caps_list_lock);
+	if (caps_avail_count >= need)
+		have = need;
+	else
+		have = caps_avail_count;
+	caps_avail_count -= have;
+	caps_reserve_count += have;
+	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+	       caps_avail_count);
+	spin_unlock(&caps_list_lock);
+
+	for (i = have; i < need; i++) {
+		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+		if (!cap) {
+			ret = -ENOMEM;
+			goto out_alloc_count;
+		}
+		list_add(&cap->caps_item, &newcaps);
+		alloc++;
+	}
+	BUG_ON(have + alloc != need);
+
+	spin_lock(&caps_list_lock);
+	caps_total_count += alloc;
+	caps_reserve_count += alloc;
+	list_splice(&newcaps, &caps_list);
+
+	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+	       caps_avail_count);
+	spin_unlock(&caps_list_lock);
+
+	ctx->count = need;
+	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+	     ctx, caps_total_count, caps_use_count, caps_reserve_count,
+	     caps_avail_count);
+	return 0;
+
+out_alloc_count:
+	/* we didn't manage to reserve as much as we needed */
+	pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+		   ctx, need, have);
+	return ret;
+}
+
+int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+{
+	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+	if (ctx->count) {
+		spin_lock(&caps_list_lock);
+		BUG_ON(caps_reserve_count < ctx->count);
+		caps_reserve_count -= ctx->count;
+		caps_avail_count += ctx->count;
+		ctx->count = 0;
+		dout("unreserve caps %d = %d used + %d resv + %d avail\n",
+		     caps_total_count, caps_use_count, caps_reserve_count,
+		     caps_avail_count);
+		BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+		       caps_avail_count);
+		spin_unlock(&caps_list_lock);
+	}
+	return 0;
+}
+
+static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+{
+	struct ceph_cap *cap = NULL;
+
+	/* temporary, until we do something about cap import/export */
+	if (!ctx)
+		return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+
+	spin_lock(&caps_list_lock);
+	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+	     ctx, ctx->count, caps_total_count, caps_use_count,
+	     caps_reserve_count, caps_avail_count);
+	BUG_ON(!ctx->count);
+	BUG_ON(ctx->count > caps_reserve_count);
+	BUG_ON(list_empty(&caps_list));
+
+	ctx->count--;
+	caps_reserve_count--;
+	caps_use_count++;
+
+	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+	list_del(&cap->caps_item);
+
+	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+	       caps_avail_count);
+	spin_unlock(&caps_list_lock);
+	return cap;
+}
+
+static void put_cap(struct ceph_cap *cap,
+		    struct ceph_cap_reservation *ctx)
+{
+	spin_lock(&caps_list_lock);
+	dout("put_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+	     ctx, ctx ? ctx->count : 0, caps_total_count, caps_use_count,
+	     caps_reserve_count, caps_avail_count);
+	caps_use_count--;
+	/*
+	 * Keep some preallocated caps around, at least enough to do a
+	 * readdir (which needs to preallocate lots of them), to avoid
+	 * lots of free/alloc churn.
+	 */
+	if (caps_avail_count >= caps_reserve_count +
+	    ceph_client(cap->ci->vfs_inode.i_sb)->mount_args.max_readdir) {
+		caps_total_count--;
+		kmem_cache_free(ceph_cap_cachep, cap);
+	} else {
+		if (ctx) {
+			ctx->count++;
+			caps_reserve_count++;
+		} else {
+			caps_avail_count++;
+		}
+		list_add(&cap->caps_item, &caps_list);
+	}
+
+	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+	       caps_avail_count);
+	spin_unlock(&caps_list_lock);
+}
+
+void ceph_reservation_status(struct ceph_client *client,
+			     int *total, int *avail, int *used, int *reserved)
+{
+	if (total)
+		*total = caps_total_count;
+	if (avail)
+		*avail = caps_avail_count;
+	if (used)
+		*used = caps_use_count;
+	if (reserved)
+		*reserved = caps_reserve_count;
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+	struct ceph_cap *cap;
+	struct rb_node *n = ci->i_caps.rb_node;
+
+	while (n) {
+		cap = rb_entry(n, struct ceph_cap, ci_node);
+		if (mds < cap->mds)
+			n = n->rb_left;
+		else if (mds > cap->mds)
+			n = n->rb_right;
+		else
+			return cap;
+	}
+	return NULL;
+}
+
+/*
+ * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
+ * -1.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+{
+	struct ceph_cap *cap;
+	int mds = -1;
+	struct rb_node *p;
+
+	/* prefer mds with WR|WRBUFFER|EXCL caps */
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		mds = cap->mds;
+		if (mseq)
+			*mseq = cap->mseq;
+		if (cap->issued & (CEPH_CAP_FILE_WR |
+				   CEPH_CAP_FILE_BUFFER |
+				   CEPH_CAP_FILE_EXCL))
+			break;
+	}
+	return mds;
+}
+
+int ceph_get_cap_mds(struct inode *inode)
+{
+	int mds;
+	spin_lock(&inode->i_lock);
+	mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+	spin_unlock(&inode->i_lock);
+	return mds;
+}
+
+/*
+ * Called under i_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+			      struct ceph_cap *new)
+{
+	struct rb_node **p = &ci->i_caps.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_cap *cap = NULL;
+
+	while (*p) {
+		parent = *p;
+		cap = rb_entry(parent, struct ceph_cap, ci_node);
+		if (new->mds < cap->mds)
+			p = &(*p)->rb_left;
+		else if (new->mds > cap->mds)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->ci_node, parent, p);
+	rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS.  Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+			       struct ceph_inode_info *ci)
+{
+	struct ceph_mount_args *ma = &mdsc->client->mount_args;
+
+	ci->i_hold_caps_min = round_jiffies(jiffies +
+					    ma->caps_wanted_delay_min * HZ);
+	ci->i_hold_caps_max = round_jiffies(jiffies +
+					    ma->caps_wanted_delay_max * HZ);
+	dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+	     ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_lock
+ *    -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+				struct ceph_inode_info *ci)
+{
+	__cap_set_timeouts(mdsc, ci);
+	dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+	     ci->i_ceph_flags, ci->i_hold_caps_max);
+	if (!mdsc->stopping) {
+		spin_lock(&mdsc->cap_delay_lock);
+		if (!list_empty(&ci->i_cap_delay_list)) {
+			if (ci->i_ceph_flags & CEPH_I_FLUSH)
+				goto no_change;
+			list_del_init(&ci->i_cap_delay_list);
+		}
+		list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+		spin_unlock(&mdsc->cap_delay_lock);
+	}
+}
+
+/*
+ * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+				      struct ceph_inode_info *ci)
+{
+	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+	spin_lock(&mdsc->cap_delay_lock);
+	ci->i_ceph_flags |= CEPH_I_FLUSH;
+	if (!list_empty(&ci->i_cap_delay_list))
+		list_del_init(&ci->i_cap_delay_list);
+	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+			       struct ceph_inode_info *ci)
+{
+	dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+	if (list_empty(&ci->i_cap_delay_list))
+		return;
+	spin_lock(&mdsc->cap_delay_lock);
+	list_del_init(&ci->i_cap_delay_list);
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+			      unsigned issued)
+{
+	unsigned had = __ceph_caps_issued(ci, NULL);
+
+	/*
+	 * Each time we receive FILE_CACHE anew, we increment
+	 * i_rdcache_gen.
+	 */
+	if ((issued & CEPH_CAP_FILE_CACHE) &&
+	    (had & CEPH_CAP_FILE_CACHE) == 0)
+		ci->i_rdcache_gen++;
+
+	/*
+	 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+	 * don't know what happened to this directory while we didn't
+	 * have the cap.
+	 */
+	if ((issued & CEPH_CAP_FILE_SHARED) &&
+	    (had & CEPH_CAP_FILE_SHARED) == 0) {
+		ci->i_shared_gen++;
+		if (S_ISDIR(ci->vfs_inode.i_mode)) {
+			dout(" marking %p NOT complete\n", &ci->vfs_inode);
+			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+		}
+	}
+}
+
+/*
+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0.  (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+int ceph_add_cap(struct inode *inode,
+		 struct ceph_mds_session *session, u64 cap_id,
+		 int fmode, unsigned issued, unsigned wanted,
+		 unsigned seq, unsigned mseq, u64 realmino, int flags,
+		 struct ceph_cap_reservation *caps_reservation)
+{
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *new_cap = NULL;
+	struct ceph_cap *cap;
+	int mds = session->s_mds;
+	int actual_wanted;
+
+	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+	     session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+	/*
+	 * If we are opening the file, include file mode wanted bits
+	 * in wanted.
+	 */
+	if (fmode >= 0)
+		wanted |= ceph_caps_for_mode(fmode);
+
+retry:
+	spin_lock(&inode->i_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (!cap) {
+		if (new_cap) {
+			cap = new_cap;
+			new_cap = NULL;
+		} else {
+			spin_unlock(&inode->i_lock);
+			new_cap = get_cap(caps_reservation);
+			if (new_cap == NULL)
+				return -ENOMEM;
+			goto retry;
+		}
+
+		cap->issued = 0;
+		cap->implemented = 0;
+		cap->mds = mds;
+		cap->mds_wanted = 0;
+
+		cap->ci = ci;
+		__insert_cap_node(ci, cap);
+
+		/* clear out old exporting info?  (i.e. on cap import) */
+		if (ci->i_cap_exporting_mds == mds) {
+			ci->i_cap_exporting_issued = 0;
+			ci->i_cap_exporting_mseq = 0;
+			ci->i_cap_exporting_mds = -1;
+		}
+
+		/* add to session cap list */
+		cap->session = session;
+		spin_lock(&session->s_cap_lock);
+		list_add_tail(&cap->session_caps, &session->s_caps);
+		session->s_nr_caps++;
+		spin_unlock(&session->s_cap_lock);
+	}
+
+	if (!ci->i_snap_realm) {
+		/*
+		 * add this inode to the appropriate snap realm
+		 */
+		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+							       realmino);
+		if (realm) {
+			ceph_get_snap_realm(mdsc, realm);
+			spin_lock(&realm->inodes_with_caps_lock);
+			ci->i_snap_realm = realm;
+			list_add(&ci->i_snap_realm_item,
+				 &realm->inodes_with_caps);
+			spin_unlock(&realm->inodes_with_caps_lock);
+		} else {
+			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+			       realmino);
+		}
+	}
+
+	__check_cap_issue(ci, cap, issued);
+
+	/*
+	 * If we are issued caps we don't want, or the mds' wanted
+	 * value appears to be off, queue a check so we'll release
+	 * later and/or update the mds wanted value.
+	 */
+	actual_wanted = __ceph_caps_wanted(ci);
+	if ((wanted & ~actual_wanted) ||
+	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+		     ceph_cap_string(issued), ceph_cap_string(wanted),
+		     ceph_cap_string(actual_wanted));
+		__cap_delay_requeue(mdsc, ci);
+	}
+
+	if (flags & CEPH_CAP_FLAG_AUTH)
+		ci->i_auth_cap = cap;
+	else if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+	     ceph_cap_string(issued|cap->issued), seq, mds);
+	cap->cap_id = cap_id;
+	cap->issued = issued;
+	cap->implemented |= issued;
+	cap->mds_wanted |= wanted;
+	cap->seq = seq;
+	cap->issue_seq = seq;
+	cap->mseq = mseq;
+	cap->gen = session->s_cap_gen;
+
+	if (fmode >= 0)
+		__ceph_get_fmode(ci, fmode);
+	spin_unlock(&inode->i_lock);
+	wake_up(&ci->i_cap_wq);
+	return 0;
+}
+
+/*
+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+	unsigned long ttl;
+	u32 gen;
+
+	spin_lock(&cap->session->s_cap_lock);
+	gen = cap->session->s_cap_gen;
+	ttl = cap->session->s_cap_ttl;
+	spin_unlock(&cap->session->s_cap_lock);
+
+	if (cap->gen < gen || time_after_eq(jiffies, ttl)) {
+		dout("__cap_is_valid %p cap %p issued %s "
+		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+		     cap, ceph_cap_string(cap->issued), cap->gen, gen);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us.  Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+	int have = ci->i_snap_caps;
+	struct ceph_cap *cap;
+	struct rb_node *p;
+
+	if (implemented)
+		*implemented = 0;
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		dout("__ceph_caps_issued %p cap %p issued %s\n",
+		     &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+		have |= cap->issued;
+		if (implemented)
+			*implemented |= cap->implemented;
+	}
+	return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+	int have = ci->i_snap_caps;
+	struct ceph_cap *cap;
+	struct rb_node *p;
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (cap == ocap)
+			continue;
+		if (!__cap_is_valid(cap))
+			continue;
+		have |= cap->issued;
+	}
+	return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+	struct ceph_mds_session *s = cap->session;
+
+	dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+	     s->s_mds);
+	spin_lock(&s->s_cap_lock);
+	list_move_tail(&cap->session_caps, &s->s_caps);
+	spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask.  If so, move the cap(s) to the
+ * front of their respective LRUs.  (This is the preferred way for
+ * callers to check for caps they want.)
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+	struct ceph_cap *cap;
+	struct rb_node *p;
+	int have = ci->i_snap_caps;
+
+	if ((have & mask) == mask) {
+		dout("__ceph_caps_issued_mask %p snap issued %s"
+		     " (mask %s)\n", &ci->vfs_inode,
+		     ceph_cap_string(have),
+		     ceph_cap_string(mask));
+		return 1;
+	}
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		if ((cap->issued & mask) == mask) {
+			dout("__ceph_caps_issued_mask %p cap %p issued %s"
+			     " (mask %s)\n", &ci->vfs_inode, cap,
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(mask));
+			if (touch)
+				__touch_cap(cap);
+			return 1;
+		}
+
+		/* does a combination of caps satisfy mask? */
+		have |= cap->issued;
+		if ((have & mask) == mask) {
+			dout("__ceph_caps_issued_mask %p combo issued %s"
+			     " (mask %s)\n", &ci->vfs_inode,
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(mask));
+			if (touch) {
+				struct rb_node *q;
+
+				/* touch this + preceeding caps */
+				__touch_cap(cap);
+				for (q = rb_first(&ci->i_caps); q != p;
+				     q = rb_next(q)) {
+					cap = rb_entry(q, struct ceph_cap,
+						       ci_node);
+					if (!__cap_is_valid(cap))
+						continue;
+					__touch_cap(cap);
+				}
+			}
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Return true if mask caps are currently being revoked by an MDS.
+ */
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap *cap;
+	struct rb_node *p;
+	int ret = 0;
+
+	spin_lock(&inode->i_lock);
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (__cap_is_valid(cap) &&
+		    (cap->implemented & ~cap->issued & mask)) {
+			ret = 1;
+			break;
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	dout("ceph_caps_revoking %p %s = %d\n", inode,
+	     ceph_cap_string(mask), ret);
+	return ret;
+}
+
+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+	int used = 0;
+	if (ci->i_pin_ref)
+		used |= CEPH_CAP_PIN;
+	if (ci->i_rd_ref)
+		used |= CEPH_CAP_FILE_RD;
+	if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+		used |= CEPH_CAP_FILE_CACHE;
+	if (ci->i_wr_ref)
+		used |= CEPH_CAP_FILE_WR;
+	if (ci->i_wrbuffer_ref)
+		used |= CEPH_CAP_FILE_BUFFER;
+	return used;
+}
+
+/*
+ * wanted, by virtue of open file modes
+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+	int want = 0;
+	int mode;
+	for (mode = 0; mode < 4; mode++)
+		if (ci->i_nr_by_mode[mode])
+			want |= ceph_caps_for_mode(mode);
+	return want;
+}
+
+/*
+ * Return caps we have registered with the MDS(s) as 'wanted'.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+{
+	struct ceph_cap *cap;
+	struct rb_node *p;
+	int mds_wanted = 0;
+
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		if (!__cap_is_valid(cap))
+			continue;
+		mds_wanted |= cap->mds_wanted;
+	}
+	return mds_wanted;
+}
+
+/*
+ * called under i_lock
+ */
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+	return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+}
+
+/*
+ * caller should hold i_lock, and session s_mutex.
+ * returns true if this is the last cap.  if so, caller should iput.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap,
+		       struct ceph_cap_reservation *ctx)
+{
+	struct ceph_mds_session *session = cap->session;
+	struct ceph_inode_info *ci = cap->ci;
+	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+
+	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+	/* remove from session list */
+	spin_lock(&session->s_cap_lock);
+	list_del_init(&cap->session_caps);
+	session->s_nr_caps--;
+	spin_unlock(&session->s_cap_lock);
+
+	/* remove from inode list */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	cap->session = NULL;
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	put_cap(cap, ctx);
+
+	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
+		struct ceph_snap_realm *realm = ci->i_snap_realm;
+		spin_lock(&realm->inodes_with_caps_lock);
+		list_del_init(&ci->i_snap_realm_item);
+		ci->i_snap_realm_counter++;
+		ci->i_snap_realm = NULL;
+		spin_unlock(&realm->inodes_with_caps_lock);
+		ceph_put_snap_realm(mdsc, realm);
+	}
+	if (!__ceph_is_any_real_caps(ci))
+		__cap_delay_cancel(mdsc, ci);
+}
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct ceph_mds_session *session,
+			u64 ino, u64 cid, int op,
+			int caps, int wanted, int dirty,
+			u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
+			u64 size, u64 max_size,
+			struct timespec *mtime, struct timespec *atime,
+			u64 time_warp_seq,
+			uid_t uid, gid_t gid, mode_t mode,
+			u64 xattr_version,
+			struct ceph_buffer *xattrs_buf,
+			u64 follows)
+{
+	struct ceph_mds_caps *fc;
+	struct ceph_msg *msg;
+
+	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+	     " seq %u/%u mseq %u follows %lld size %llu/%llu"
+	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
+	     cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
+	     ceph_cap_string(dirty),
+	     seq, issue_seq, mseq, follows, size, max_size,
+	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
+
+	fc = msg->front.iov_base;
+
+	memset(fc, 0, sizeof(*fc));
+
+	fc->cap_id = cpu_to_le64(cid);
+	fc->op = cpu_to_le32(op);
+	fc->seq = cpu_to_le32(seq);
+	fc->client_tid = cpu_to_le64(flush_tid);
+	fc->issue_seq = cpu_to_le32(issue_seq);
+	fc->migrate_seq = cpu_to_le32(mseq);
+	fc->caps = cpu_to_le32(caps);
+	fc->wanted = cpu_to_le32(wanted);
+	fc->dirty = cpu_to_le32(dirty);
+	fc->ino = cpu_to_le64(ino);
+	fc->snap_follows = cpu_to_le64(follows);
+
+	fc->size = cpu_to_le64(size);
+	fc->max_size = cpu_to_le64(max_size);
+	if (mtime)
+		ceph_encode_timespec(&fc->mtime, mtime);
+	if (atime)
+		ceph_encode_timespec(&fc->atime, atime);
+	fc->time_warp_seq = cpu_to_le32(time_warp_seq);
+
+	fc->uid = cpu_to_le32(uid);
+	fc->gid = cpu_to_le32(gid);
+	fc->mode = cpu_to_le32(mode);
+
+	fc->xattr_version = cpu_to_le64(xattr_version);
+	if (xattrs_buf) {
+		msg->middle = ceph_buffer_get(xattrs_buf);
+		fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+		msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+	}
+
+	ceph_con_send(&session->s_con, msg);
+	return 0;
+}
+
+/*
+ * Queue cap releases when an inode is dropped from our
+ * cache.
+ */
+void ceph_queue_caps_release(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct rb_node *p;
+
+	spin_lock(&inode->i_lock);
+	p = rb_first(&ci->i_caps);
+	while (p) {
+		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+		struct ceph_mds_session *session = cap->session;
+		struct ceph_msg *msg;
+		struct ceph_mds_cap_release *head;
+		struct ceph_mds_cap_item *item;
+
+		spin_lock(&session->s_cap_lock);
+		BUG_ON(!session->s_num_cap_releases);
+		msg = list_first_entry(&session->s_cap_releases,
+				       struct ceph_msg, list_head);
+
+		dout(" adding %p release to mds%d msg %p (%d left)\n",
+		     inode, session->s_mds, msg, session->s_num_cap_releases);
+
+		BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+		head = msg->front.iov_base;
+		head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+		item = msg->front.iov_base + msg->front.iov_len;
+		item->ino = cpu_to_le64(ceph_ino(inode));
+		item->cap_id = cpu_to_le64(cap->cap_id);
+		item->migrate_seq = cpu_to_le32(cap->mseq);
+		item->seq = cpu_to_le32(cap->issue_seq);
+
+		session->s_num_cap_releases--;
+
+		msg->front.iov_len += sizeof(*item);
+		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+			dout(" release msg %p full\n", msg);
+			list_move_tail(&msg->list_head,
+				      &session->s_cap_releases_done);
+		} else {
+			dout(" release msg %p at %d/%d (%d)\n", msg,
+			     (int)le32_to_cpu(head->num),
+			     (int)CEPH_CAPS_PER_RELEASE,
+			     (int)msg->front.iov_len);
+		}
+		spin_unlock(&session->s_cap_lock);
+		p = rb_next(p);
+		__ceph_remove_cap(cap, NULL);
+
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Send a cap msg on the given inode.  Update our caps state, then
+ * drop i_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make half-hearted attempt ot to invalidate page cache if we are
+ * dropping RDCACHE.  Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if delayed release, or we experienced an error
+ * such that the caller should requeue + retry later.
+ *
+ * called with i_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+		      int op, int used, int want, int retain, int flushing,
+		      unsigned *pflush_tid)
+	__releases(cap->ci->vfs_inode->i_lock)
+{
+	struct ceph_inode_info *ci = cap->ci;
+	struct inode *inode = &ci->vfs_inode;
+	u64 cap_id = cap->cap_id;
+	int held = cap->issued | cap->implemented;
+	int revoking = cap->implemented & ~cap->issued;
+	int dropping = cap->issued & ~retain;
+	int keep;
+	u64 seq, issue_seq, mseq, time_warp_seq, follows;
+	u64 size, max_size;
+	struct timespec mtime, atime;
+	int wake = 0;
+	mode_t mode;
+	uid_t uid;
+	gid_t gid;
+	struct ceph_mds_session *session;
+	u64 xattr_version = 0;
+	int delayed = 0;
+	u64 flush_tid = 0;
+	int i;
+	int ret;
+
+	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+	     inode, cap, cap->session,
+	     ceph_cap_string(held), ceph_cap_string(held & retain),
+	     ceph_cap_string(revoking));
+	BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+	session = cap->session;
+
+	/* don't release wanted unless we've waited a bit. */
+	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+	    time_before(jiffies, ci->i_hold_caps_min)) {
+		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+		     ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap->issued & retain),
+		     ceph_cap_string(cap->mds_wanted),
+		     ceph_cap_string(want));
+		want |= cap->mds_wanted;
+		retain |= cap->issued;
+		delayed = 1;
+	}
+	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+
+	cap->issued &= retain;  /* drop bits we don't want */
+	if (cap->implemented & ~cap->issued) {
+		/*
+		 * Wake up any waiters on wanted -> needed transition.
+		 * This is due to the weird transition from buffered
+		 * to sync IO... we need to flush dirty pages _before_
+		 * allowing sync writes to avoid reordering.
+		 */
+		wake = 1;
+	}
+	cap->implemented &= cap->issued | used;
+	cap->mds_wanted = want;
+
+	if (flushing) {
+		/*
+		 * assign a tid for flush operations so we can avoid
+		 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+		 * clean type races.  track latest tid for every bit
+		 * so we can handle flush AxFw, flush Fw, and have the
+		 * first ack clean Ax.
+		 */
+		flush_tid = ++ci->i_cap_flush_last_tid;
+		if (pflush_tid)
+			*pflush_tid = flush_tid;
+		dout(" cap_flush_tid %d\n", (int)flush_tid);
+		for (i = 0; i < CEPH_CAP_BITS; i++)
+			if (flushing & (1 << i))
+				ci->i_cap_flush_tid[i] = flush_tid;
+	}
+
+	keep = cap->implemented;
+	seq = cap->seq;
+	issue_seq = cap->issue_seq;
+	mseq = cap->mseq;
+	size = inode->i_size;
+	ci->i_reported_size = size;
+	max_size = ci->i_wanted_max_size;
+	ci->i_requested_max_size = max_size;
+	mtime = inode->i_mtime;
+	atime = inode->i_atime;
+	time_warp_seq = ci->i_time_warp_seq;
+	follows = ci->i_snap_realm->cached_context->seq;
+	uid = inode->i_uid;
+	gid = inode->i_gid;
+	mode = inode->i_mode;
+
+	if (dropping & CEPH_CAP_XATTR_EXCL) {
+		__ceph_build_xattrs_blob(ci);
+		xattr_version = ci->i_xattrs.version + 1;
+	}
+
+	spin_unlock(&inode->i_lock);
+
+	if (dropping & CEPH_CAP_FILE_CACHE) {
+		/* invalidate what we can */
+		dout("invalidating pages on %p\n", inode);
+		invalidate_mapping_pages(&inode->i_data, 0, -1);
+	}
+
+	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
+		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+		size, max_size, &mtime, &atime, time_warp_seq,
+		uid, gid, mode,
+		xattr_version,
+		(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+		follows);
+	if (ret < 0) {
+		dout("error sending cap msg, must requeue %p\n", inode);
+		delayed = 1;
+	}
+
+	if (wake)
+		wake_up(&ci->i_cap_wq);
+
+	return delayed;
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken.  This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Called under i_lock.  Takes s_mutex as needed.
+ */
+void __ceph_flush_snaps(struct ceph_inode_info *ci,
+			struct ceph_mds_session **psession)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int mds;
+	struct ceph_cap_snap *capsnap;
+	u32 mseq;
+	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
+						    session->s_mutex */
+	u64 next_follows = 0;  /* keep track of how far we've gotten through the
+			     i_cap_snaps list, and skip these entries next time
+			     around to avoid an infinite loop */
+
+	if (psession)
+		session = *psession;
+
+	dout("__flush_snaps %p\n", inode);
+retry:
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		/* avoid an infiniute loop after retry */
+		if (capsnap->follows < next_follows)
+			continue;
+		/*
+		 * we need to wait for sync writes to complete and for dirty
+		 * pages to be written out.
+		 */
+		if (capsnap->dirty_pages || capsnap->writing)
+			continue;
+
+		/* pick mds, take s_mutex */
+		mds = __ceph_get_cap_mds(ci, &mseq);
+		if (session && session->s_mds != mds) {
+			dout("oops, wrong session %p mutex\n", session);
+			mutex_unlock(&session->s_mutex);
+			ceph_put_mds_session(session);
+			session = NULL;
+		}
+		if (!session) {
+			spin_unlock(&inode->i_lock);
+			mutex_lock(&mdsc->mutex);
+			session = __ceph_lookup_mds_session(mdsc, mds);
+			mutex_unlock(&mdsc->mutex);
+			if (session) {
+				dout("inverting session/ino locks on %p\n",
+				     session);
+				mutex_lock(&session->s_mutex);
+			}
+			/*
+			 * if session == NULL, we raced against a cap
+			 * deletion.  retry, and we'll get a better
+			 * @mds value next time.
+			 */
+			spin_lock(&inode->i_lock);
+			goto retry;
+		}
+
+		capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+		atomic_inc(&capsnap->nref);
+		if (!list_empty(&capsnap->flushing_item))
+			list_del_init(&capsnap->flushing_item);
+		list_add_tail(&capsnap->flushing_item,
+			      &session->s_cap_snaps_flushing);
+		spin_unlock(&inode->i_lock);
+
+		dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
+		     inode, capsnap, next_follows, capsnap->size);
+		send_cap_msg(session, ceph_vino(inode).ino, 0,
+			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
+			     capsnap->size, 0,
+			     &capsnap->mtime, &capsnap->atime,
+			     capsnap->time_warp_seq,
+			     capsnap->uid, capsnap->gid, capsnap->mode,
+			     0, NULL,
+			     capsnap->follows);
+
+		next_follows = capsnap->follows + 1;
+		ceph_put_cap_snap(capsnap);
+
+		spin_lock(&inode->i_lock);
+		goto retry;
+	}
+
+	/* we flushed them all; remove this inode from the queue */
+	spin_lock(&mdsc->snap_flush_lock);
+	list_del_init(&ci->i_snap_flush_item);
+	spin_unlock(&mdsc->snap_flush_lock);
+
+	if (psession)
+		*psession = session;
+	else if (session) {
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+	}
+}
+
+static void ceph_flush_snaps(struct ceph_inode_info *ci)
+{
+	struct inode *inode = &ci->vfs_inode;
+
+	spin_lock(&inode->i_lock);
+	__ceph_flush_snaps(ci, NULL);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Add dirty inode to the flushing list.  Assigned a seq number so we
+ * can wait for caps to flush without starving.
+ */
+static void __mark_caps_flushing(struct inode *inode,
+				 struct ceph_mds_session *session)
+{
+	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	BUG_ON(list_empty(&ci->i_dirty_item));
+	spin_lock(&mdsc->cap_dirty_lock);
+	if (list_empty(&ci->i_flushing_item)) {
+		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+		mdsc->num_cap_flushing++;
+		ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+		dout(" inode %p now flushing seq %lld\n", &ci->vfs_inode,
+		     ci->i_cap_flush_seq);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps.  Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ *  CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ *    cap release further.
+ *  CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ *  CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ *    further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+		     struct ceph_mds_session *session)
+{
+	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap *cap;
+	int file_wanted, used;
+	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
+	int drop_session_lock = session ? 0 : 1;
+	int want, retain, revoking, flushing = 0;
+	int mds = -1;   /* keep track of how far we've gone through i_caps list
+			   to avoid an infinite loop on retry */
+	struct rb_node *p;
+	int tried_invalidate = 0;
+	int delayed = 0, sent = 0, force_requeue = 0, num;
+	int is_delayed = flags & CHECK_CAPS_NODELAY;
+
+	/* if we are unmounting, flush any unused caps immediately. */
+	if (mdsc->stopping)
+		is_delayed = 1;
+
+	spin_lock(&inode->i_lock);
+
+	if (ci->i_ceph_flags & CEPH_I_FLUSH)
+		flags |= CHECK_CAPS_FLUSH;
+
+	/* flush snaps first time around only */
+	if (!list_empty(&ci->i_cap_snaps))
+		__ceph_flush_snaps(ci, &session);
+	goto retry_locked;
+retry:
+	spin_lock(&inode->i_lock);
+retry_locked:
+	file_wanted = __ceph_caps_file_wanted(ci);
+	used = __ceph_caps_used(ci);
+	want = file_wanted | used;
+
+	retain = want | CEPH_CAP_PIN;
+	if (!mdsc->stopping && inode->i_nlink > 0) {
+		if (want) {
+			retain |= CEPH_CAP_ANY;       /* be greedy */
+		} else {
+			retain |= CEPH_CAP_ANY_SHARED;
+			/*
+			 * keep RD only if we didn't have the file open RW,
+			 * because then the mds would revoke it anyway to
+			 * journal max_size=0.
+			 */
+			if (ci->i_max_size == 0)
+				retain |= CEPH_CAP_ANY_RD;
+		}
+	}
+
+	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+	     " issued %s retain %s %s%s%s\n", inode,
+	     ceph_cap_string(file_wanted),
+	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+	     ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(__ceph_caps_issued(ci, NULL)),
+	     ceph_cap_string(retain),
+	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+	/*
+	 * If we no longer need to hold onto old our caps, and we may
+	 * have cached pages, but don't want them, then try to invalidate.
+	 * If we fail, it's because pages are locked.... try again later.
+	 */
+	if ((!is_delayed || mdsc->stopping) &&
+	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
+	    ci->i_rdcache_gen &&                     /* may have cached pages */
+	    file_wanted == 0 &&                      /* no open files */
+	    !ci->i_truncate_pending &&
+	    !tried_invalidate) {
+		u32 invalidating_gen = ci->i_rdcache_gen;
+		int ret;
+
+		dout("check_caps trying to invalidate on %p\n", inode);
+		spin_unlock(&inode->i_lock);
+		ret = invalidate_inode_pages2(&inode->i_data);
+		spin_lock(&inode->i_lock);
+		if (ret == 0 && invalidating_gen == ci->i_rdcache_gen) {
+			/* success. */
+			ci->i_rdcache_gen = 0;
+			ci->i_rdcache_revoking = 0;
+		} else {
+			dout("check_caps failed to invalidate pages\n");
+			/* we failed to invalidate pages.  check these
+			   caps again later. */
+			force_requeue = 1;
+			__cap_set_timeouts(mdsc, ci);
+		}
+		tried_invalidate = 1;
+		goto retry_locked;
+	}
+
+	num = 0;
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		cap = rb_entry(p, struct ceph_cap, ci_node);
+		num++;
+
+		/* avoid looping forever */
+		if (mds >= cap->mds ||
+		    ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
+			continue;
+
+		/* NOTE: no side-effects allowed, until we take s_mutex */
+
+		revoking = cap->implemented & ~cap->issued;
+		if (revoking)
+			dout("mds%d revoking %s\n", cap->mds,
+			     ceph_cap_string(revoking));
+
+		if (cap == ci->i_auth_cap &&
+		    (cap->issued & CEPH_CAP_FILE_WR)) {
+			/* request larger max_size from MDS? */
+			if (ci->i_wanted_max_size > ci->i_max_size &&
+			    ci->i_wanted_max_size > ci->i_requested_max_size) {
+				dout("requesting new max_size\n");
+				goto ack;
+			}
+
+			/* approaching file_max? */
+			if ((inode->i_size << 1) >= ci->i_max_size &&
+			    (ci->i_reported_size << 1) < ci->i_max_size) {
+				dout("i_size approaching max_size\n");
+				goto ack;
+			}
+		}
+		/* flush anything dirty? */
+		if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
+		    ci->i_dirty_caps) {
+			dout("flushing dirty caps\n");
+			goto ack;
+		}
+
+		/* completed revocation? going down and there are no caps? */
+		if (revoking && (revoking & used) == 0) {
+			dout("completed revocation of %s\n",
+			     ceph_cap_string(cap->implemented & ~cap->issued));
+			goto ack;
+		}
+
+		/* want more caps from mds? */
+		if (want & ~(cap->mds_wanted | cap->issued))
+			goto ack;
+
+		/* things we might delay */
+		if ((cap->issued & ~retain) == 0 &&
+		    cap->mds_wanted == want)
+			continue;     /* nope, all good */
+
+		if (is_delayed)
+			goto ack;
+
+		/* delay? */
+		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+		    time_before(jiffies, ci->i_hold_caps_max)) {
+			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
+			     ceph_cap_string(cap->issued),
+			     ceph_cap_string(cap->issued & retain),
+			     ceph_cap_string(cap->mds_wanted),
+			     ceph_cap_string(want));
+			delayed++;
+			continue;
+		}
+
+ack:
+		if (session && session != cap->session) {
+			dout("oops, wrong session %p mutex\n", session);
+			mutex_unlock(&session->s_mutex);
+			session = NULL;
+		}
+		if (!session) {
+			session = cap->session;
+			if (mutex_trylock(&session->s_mutex) == 0) {
+				dout("inverting session/ino locks on %p\n",
+				     session);
+				spin_unlock(&inode->i_lock);
+				if (took_snap_rwsem) {
+					up_read(&mdsc->snap_rwsem);
+					took_snap_rwsem = 0;
+				}
+				mutex_lock(&session->s_mutex);
+				goto retry;
+			}
+		}
+		/* take snap_rwsem after session mutex */
+		if (!took_snap_rwsem) {
+			if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
+				dout("inverting snap/in locks on %p\n",
+				     inode);
+				spin_unlock(&inode->i_lock);
+				down_read(&mdsc->snap_rwsem);
+				took_snap_rwsem = 1;
+				goto retry;
+			}
+			took_snap_rwsem = 1;
+		}
+
+		if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
+			/* update dirty, flushing bits */
+			flushing = ci->i_dirty_caps;
+			dout(" flushing %s, flushing_caps %s -> %s\n",
+			     ceph_cap_string(flushing),
+			     ceph_cap_string(ci->i_flushing_caps),
+			     ceph_cap_string(ci->i_flushing_caps | flushing));
+			ci->i_flushing_caps |= flushing;
+			ci->i_dirty_caps = 0;
+			__mark_caps_flushing(inode, session);
+		}
+
+		mds = cap->mds;  /* remember mds, so we don't repeat */
+		sent++;
+
+		/* __send_cap drops i_lock */
+		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
+				      retain, flushing, NULL);
+		goto retry; /* retake i_lock and restart our cap scan. */
+	}
+
+	/*
+	 * Reschedule delayed caps release if we delayed anything,
+	 * otherwise cancel.
+	 */
+	if (delayed && is_delayed)
+		force_requeue = 1;   /* __send_cap delayed release; requeue */
+	if (!delayed && !is_delayed)
+		__cap_delay_cancel(mdsc, ci);
+	else if (!is_delayed || force_requeue)
+		__cap_delay_requeue(mdsc, ci);
+
+	spin_unlock(&inode->i_lock);
+
+	if (session && drop_session_lock)
+		mutex_unlock(&session->s_mutex);
+	if (took_snap_rwsem)
+		up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Mark caps dirty.  If inode is newly dirty, add to the global dirty
+ * list.
+ */
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct inode *inode = &ci->vfs_inode;
+	int was = __ceph_caps_dirty(ci);
+	int dirty = 0;
+
+	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+	     ceph_cap_string(mask), ceph_cap_string(ci->i_dirty_caps),
+	     ceph_cap_string(ci->i_dirty_caps | mask));
+	ci->i_dirty_caps |= mask;
+	if (!was) {
+		dout(" inode %p now dirty\n", &ci->vfs_inode);
+		spin_lock(&mdsc->cap_dirty_lock);
+		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+		spin_unlock(&mdsc->cap_dirty_lock);
+		igrab(inode);
+		dirty |= I_DIRTY_SYNC;
+	}
+	if ((was & CEPH_CAP_FILE_BUFFER) &&
+	    (mask & CEPH_CAP_FILE_BUFFER))
+		dirty |= I_DIRTY_DATASYNC;
+	if (dirty)
+		__mark_inode_dirty(inode, dirty);
+	__cap_delay_requeue(mdsc, ci);
+	return was;
+}
+
+/*
+ * Try to flush dirty caps back to the auth mds.
+ */
+static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
+			  unsigned *flush_tid)
+{
+	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int unlock_session = session ? 0 : 1;
+	int flushing = 0;
+
+retry:
+	spin_lock(&inode->i_lock);
+	if (ci->i_dirty_caps && ci->i_auth_cap) {
+		struct ceph_cap *cap = ci->i_auth_cap;
+		int used = __ceph_caps_used(ci);
+		int want = __ceph_caps_wanted(ci);
+		int delayed;
+
+		if (!session) {
+			spin_unlock(&inode->i_lock);
+			session = cap->session;
+			mutex_lock(&session->s_mutex);
+			goto retry;
+		}
+		BUG_ON(session != cap->session);
+		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+			goto out;
+
+		__mark_caps_flushing(inode, session);
+
+		flushing = ci->i_dirty_caps;
+		dout(" flushing %s, flushing_caps %s -> %s\n",
+		     ceph_cap_string(flushing),
+		     ceph_cap_string(ci->i_flushing_caps),
+		     ceph_cap_string(ci->i_flushing_caps | flushing));
+		ci->i_flushing_caps |= flushing;
+		ci->i_dirty_caps = 0;
+
+		/* __send_cap drops i_lock */
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
+				     cap->issued | cap->implemented, flushing,
+				     flush_tid);
+		if (!delayed)
+			goto out_unlocked;
+
+		spin_lock(&inode->i_lock);
+		__cap_delay_requeue(mdsc, ci);
+	}
+out:
+	spin_unlock(&inode->i_lock);
+out_unlocked:
+	if (session && unlock_session)
+		mutex_unlock(&session->s_mutex);
+	return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int dirty, i, ret = 1;
+
+	spin_lock(&inode->i_lock);
+	dirty = __ceph_caps_dirty(ci);
+	for (i = 0; i < CEPH_CAP_BITS; i++)
+		if ((ci->i_flushing_caps & (1 << i)) &&
+		    ci->i_cap_flush_tid[i] <= tid) {
+			/* still flushing this bit */
+			ret = 0;
+			break;
+		}
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+/*
+ * Wait on any unsafe replies for the given inode.  First wait on the
+ * newest request, and make that the upper bound.  Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+static void sync_write_wait(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct list_head *head = &ci->i_unsafe_writes;
+	struct ceph_osd_request *req;
+	u64 last_tid;
+
+	spin_lock(&ci->i_unsafe_lock);
+	if (list_empty(head))
+		goto out;
+
+	/* set upper bound as _last_ entry in chain */
+	req = list_entry(head->prev, struct ceph_osd_request,
+			 r_unsafe_item);
+	last_tid = req->r_tid;
+
+	do {
+		ceph_osdc_get_request(req);
+		spin_unlock(&ci->i_unsafe_lock);
+		dout("sync_write_wait on tid %llu (until %llu)\n",
+		     req->r_tid, last_tid);
+		wait_for_completion(&req->r_safe_completion);
+		spin_lock(&ci->i_unsafe_lock);
+		ceph_osdc_put_request(req);
+
+		/*
+		 * from here on look at first entry in chain, since we
+		 * only want to wait for anything older than last_tid
+		 */
+		if (list_empty(head))
+			break;
+		req = list_entry(head->next, struct ceph_osd_request,
+				 r_unsafe_item);
+	} while (req->r_tid < last_tid);
+out:
+	spin_unlock(&ci->i_unsafe_lock);
+}
+
+int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned flush_tid;
+	int ret;
+	int dirty;
+
+	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+	sync_write_wait(inode);
+
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret < 0)
+		return ret;
+
+	dirty = try_flush_caps(inode, NULL, &flush_tid);
+	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+	/*
+	 * only wait on non-file metadata writeback (the mds
+	 * can recover size and mtime, so we don't need to
+	 * wait for that)
+	 */
+	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+		dout("fsync waiting for flush_tid %u\n", flush_tid);
+		ret = wait_event_interruptible(ci->i_cap_wq,
+				       caps_are_flushed(inode, flush_tid));
+	}
+
+	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+	return ret;
+}
+
+/*
+ * Flush any dirty caps back to the mds.  If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, int wait)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	unsigned flush_tid;
+	int err = 0;
+	int dirty;
+
+	dout("write_inode %p wait=%d\n", inode, wait);
+	if (wait) {
+		dirty = try_flush_caps(inode, NULL, &flush_tid);
+		if (dirty)
+			err = wait_event_interruptible(ci->i_cap_wq,
+				       caps_are_flushed(inode, flush_tid));
+	} else {
+		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
+		spin_lock(&inode->i_lock);
+		if (__ceph_caps_dirty(ci))
+			__cap_delay_requeue_front(mdsc, ci);
+		spin_unlock(&inode->i_lock);
+	}
+	return err;
+}
+
+/*
+ * After a recovering MDS goes active, we need to resend any caps
+ * we were flushing.
+ *
+ * Caller holds session->s_mutex.
+ */
+static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *session)
+{
+	struct ceph_cap_snap *capsnap;
+
+	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
+	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
+			    flushing_item) {
+		struct ceph_inode_info *ci = capsnap->ci;
+		struct inode *inode = &ci->vfs_inode;
+		struct ceph_cap *cap;
+
+		spin_lock(&inode->i_lock);
+		cap = ci->i_auth_cap;
+		if (cap && cap->session == session) {
+			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
+			     cap, capsnap);
+			__ceph_flush_snaps(ci, &session);
+		} else {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+			       cap, session->s_mds);
+			spin_unlock(&inode->i_lock);
+		}
+	}
+}
+
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci;
+
+	kick_flushing_capsnaps(mdsc, session);
+
+	dout("kick_flushing_caps mds%d\n", session->s_mds);
+	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+		struct inode *inode = &ci->vfs_inode;
+		struct ceph_cap *cap;
+		int delayed = 0;
+
+		spin_lock(&inode->i_lock);
+		cap = ci->i_auth_cap;
+		if (cap && cap->session == session) {
+			dout("kick_flushing_caps %p cap %p %s\n", inode,
+			     cap, ceph_cap_string(ci->i_flushing_caps));
+			delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+					     __ceph_caps_used(ci),
+					     __ceph_caps_wanted(ci),
+					     cap->issued | cap->implemented,
+					     ci->i_flushing_caps, NULL);
+			if (delayed) {
+				spin_lock(&inode->i_lock);
+				__cap_delay_requeue(mdsc, ci);
+				spin_unlock(&inode->i_lock);
+			}
+		} else {
+			pr_err("%p auth cap %p not mds%d ???\n", inode,
+			       cap, session->s_mds);
+			spin_unlock(&inode->i_lock);
+		}
+	}
+}
+
+
+/*
+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+{
+	if (got & CEPH_CAP_PIN)
+		ci->i_pin_ref++;
+	if (got & CEPH_CAP_FILE_RD)
+		ci->i_rd_ref++;
+	if (got & CEPH_CAP_FILE_CACHE)
+		ci->i_rdcache_ref++;
+	if (got & CEPH_CAP_FILE_WR)
+		ci->i_wr_ref++;
+	if (got & CEPH_CAP_FILE_BUFFER) {
+		if (ci->i_wrbuffer_ref == 0)
+			igrab(&ci->vfs_inode);
+		ci->i_wrbuffer_ref++;
+		dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
+		     &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
+	}
+}
+
+/*
+ * Try to grab cap references.  Specify those refs we @want, and the
+ * minimal set we @need.  Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ */
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+			    int *got, loff_t endoff, int *check_max, int *err)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int ret = 0;
+	int have, implemented;
+
+	dout("get_cap_refs %p need %s want %s\n", inode,
+	     ceph_cap_string(need), ceph_cap_string(want));
+	spin_lock(&inode->i_lock);
+
+	/* make sure we _have_ some caps! */
+	if (!__ceph_is_any_caps(ci)) {
+		dout("get_cap_refs %p no real caps\n", inode);
+		*err = -EBADF;
+		ret = 1;
+		goto out;
+	}
+
+	if (need & CEPH_CAP_FILE_WR) {
+		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+			     inode, endoff, ci->i_max_size);
+			if (endoff > ci->i_wanted_max_size) {
+				*check_max = 1;
+				ret = 1;
+			}
+			goto out;
+		}
+		/*
+		 * If a sync write is in progress, we must wait, so that we
+		 * can get a final snapshot value for size+mtime.
+		 */
+		if (__ceph_have_pending_cap_snap(ci)) {
+			dout("get_cap_refs %p cap_snap_pending\n", inode);
+			goto out;
+		}
+	}
+	have = __ceph_caps_issued(ci, &implemented);
+
+	/*
+	 * disallow writes while a truncate is pending
+	 */
+	if (ci->i_truncate_pending)
+		have &= ~CEPH_CAP_FILE_WR;
+
+	if ((have & need) == need) {
+		/*
+		 * Look at (implemented & ~have & not) so that we keep waiting
+		 * on transition from wanted -> needed caps.  This is needed
+		 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+		 * going before a prior buffered writeback happens.
+		 */
+		int not = want & ~(have & need);
+		int revoking = implemented & ~have;
+		dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+		     inode, ceph_cap_string(have), ceph_cap_string(not),
+		     ceph_cap_string(revoking));
+		if ((revoking & not) == 0) {
+			*got = need | (have & want);
+			__take_cap_refs(ci, *got);
+			ret = 1;
+		}
+	} else {
+		dout("get_cap_refs %p have %s needed %s\n", inode,
+		     ceph_cap_string(have), ceph_cap_string(need));
+	}
+out:
+	spin_unlock(&inode->i_lock);
+	dout("get_cap_refs %p ret %d got %s\n", inode,
+	     ret, ceph_cap_string(*got));
+	return ret;
+}
+
+/*
+ * Check the offset we are writing up to against our current
+ * max_size.  If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int check = 0;
+
+	/* do we need to explicitly request a larger max_size? */
+	spin_lock(&inode->i_lock);
+	if ((endoff >= ci->i_max_size ||
+	     endoff > (inode->i_size << 1)) &&
+	    endoff > ci->i_wanted_max_size) {
+		dout("write %p at large endoff %llu, req max_size\n",
+		     inode, endoff);
+		ci->i_wanted_max_size = endoff;
+		check = 1;
+	}
+	spin_unlock(&inode->i_lock);
+	if (check)
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+/*
+ * Wait for caps, and take cap references.  If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
+		  loff_t endoff)
+{
+	int check_max, ret, err;
+
+retry:
+	if (endoff > 0)
+		check_max_size(&ci->vfs_inode, endoff);
+	check_max = 0;
+	err = 0;
+	ret = wait_event_interruptible(ci->i_cap_wq,
+				       try_get_cap_refs(ci, need, want,
+							got, endoff,
+							&check_max, &err));
+	if (err)
+		ret = err;
+	if (check_max)
+		goto retry;
+	return ret;
+}
+
+/*
+ * Take cap refs.  Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+	spin_lock(&ci->vfs_inode.i_lock);
+	__take_cap_refs(ci, caps);
+	spin_unlock(&ci->vfs_inode.i_lock);
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0, put = 0, flushsnaps = 0, wake = 0;
+	struct ceph_cap_snap *capsnap;
+
+	spin_lock(&inode->i_lock);
+	if (had & CEPH_CAP_PIN)
+		--ci->i_pin_ref;
+	if (had & CEPH_CAP_FILE_RD)
+		if (--ci->i_rd_ref == 0)
+			last++;
+	if (had & CEPH_CAP_FILE_CACHE)
+		if (--ci->i_rdcache_ref == 0)
+			last++;
+	if (had & CEPH_CAP_FILE_BUFFER) {
+		if (--ci->i_wrbuffer_ref == 0) {
+			last++;
+			put++;
+		}
+		dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
+		     inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
+	}
+	if (had & CEPH_CAP_FILE_WR)
+		if (--ci->i_wr_ref == 0) {
+			last++;
+			if (!list_empty(&ci->i_cap_snaps)) {
+				capsnap = list_first_entry(&ci->i_cap_snaps,
+						     struct ceph_cap_snap,
+						     ci_item);
+				if (capsnap->writing) {
+					capsnap->writing = 0;
+					flushsnaps =
+						__ceph_finish_cap_snap(ci,
+								       capsnap);
+					wake = 1;
+				}
+			}
+		}
+	spin_unlock(&inode->i_lock);
+
+	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
+	     last ? "last" : "");
+
+	if (last && !flushsnaps)
+		ceph_check_caps(ci, 0, NULL);
+	else if (flushsnaps)
+		ceph_flush_snaps(ci);
+	if (wake)
+		wake_up(&ci->i_cap_wq);
+	if (put)
+		iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context.  Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS.  If we dropped the last ref, call
+ * ceph_check_caps.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+				struct ceph_snap_context *snapc)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0;
+	int last_snap = 0;
+	int found = 0;
+	struct ceph_cap_snap *capsnap = NULL;
+
+	spin_lock(&inode->i_lock);
+	ci->i_wrbuffer_ref -= nr;
+	last = !ci->i_wrbuffer_ref;
+
+	if (ci->i_head_snapc == snapc) {
+		ci->i_wrbuffer_ref_head -= nr;
+		if (!ci->i_wrbuffer_ref_head) {
+			ceph_put_snap_context(ci->i_head_snapc);
+			ci->i_head_snapc = NULL;
+		}
+		dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+		     inode,
+		     ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
+		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+		     last ? " LAST" : "");
+	} else {
+		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+			if (capsnap->context == snapc) {
+				found = 1;
+				capsnap->dirty_pages -= nr;
+				last_snap = !capsnap->dirty_pages;
+				break;
+			}
+		}
+		BUG_ON(!found);
+		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+		     " snap %lld %d/%d -> %d/%d %s%s\n",
+		     inode, capsnap, capsnap->context->seq,
+		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
+		     last ? " (wrbuffer last)" : "",
+		     last_snap ? " (capsnap last)" : "");
+	}
+
+	spin_unlock(&inode->i_lock);
+
+	if (last) {
+		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+		iput(inode);
+	} else if (last_snap) {
+		ceph_flush_snaps(ci);
+		wake_up(&ci->i_cap_wq);
+	}
+}
+
+/*
+ * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex.
+ * return value:
+ *  0 - ok
+ *  1 - check_caps on auth cap only (writeback)
+ *  2 - check_caps (ack revoke)
+ */
+static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+			    struct ceph_mds_session *session,
+			    struct ceph_cap *cap,
+			    struct ceph_buffer *xattr_buf)
+	__releases(inode->i_lock)
+
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	int seq = le32_to_cpu(grant->seq);
+	int newcaps = le32_to_cpu(grant->caps);
+	int issued, implemented, used, wanted, dirty;
+	u64 size = le64_to_cpu(grant->size);
+	u64 max_size = le64_to_cpu(grant->max_size);
+	struct timespec mtime, atime, ctime;
+	int reply = 0;
+	int wake = 0;
+	int writeback = 0;
+	int revoked_rdcache = 0;
+	int invalidate_async = 0;
+	int tried_invalidate = 0;
+	int ret;
+
+	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+	     inode, cap, mds, seq, ceph_cap_string(newcaps));
+	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+		inode->i_size);
+
+	/*
+	 * If CACHE is being revoked, and we have no dirty buffers,
+	 * try to invalidate (once).  (If there are dirty buffers, we
+	 * will invalidate _after_ writeback.)
+	 */
+restart:
+	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+	    !ci->i_wrbuffer_ref && !tried_invalidate) {
+		dout("CACHE invalidation\n");
+		spin_unlock(&inode->i_lock);
+		tried_invalidate = 1;
+
+		ret = invalidate_inode_pages2(&inode->i_data);
+		spin_lock(&inode->i_lock);
+		if (ret < 0) {
+			/* there were locked pages.. invalidate later
+			   in a separate thread. */
+			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+				invalidate_async = 1;
+				ci->i_rdcache_revoking = ci->i_rdcache_gen;
+			}
+		} else {
+			/* we successfully invalidated those pages */
+			revoked_rdcache = 1;
+			ci->i_rdcache_gen = 0;
+			ci->i_rdcache_revoking = 0;
+		}
+		goto restart;
+	}
+
+	/* side effects now are allowed */
+
+	issued = __ceph_caps_issued(ci, &implemented);
+	issued |= implemented | __ceph_caps_dirty(ci);
+
+	cap->gen = session->s_cap_gen;
+
+	__check_cap_issue(ci, cap, newcaps);
+
+	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+		inode->i_mode = le32_to_cpu(grant->mode);
+		inode->i_uid = le32_to_cpu(grant->uid);
+		inode->i_gid = le32_to_cpu(grant->gid);
+		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+		     inode->i_uid, inode->i_gid);
+	}
+
+	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+		inode->i_nlink = le32_to_cpu(grant->nlink);
+
+	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+		int len = le32_to_cpu(grant->xattr_len);
+		u64 version = le64_to_cpu(grant->xattr_version);
+
+		if (version > ci->i_xattrs.version) {
+			dout(" got new xattrs v%llu on %p len %d\n",
+			     version, inode, len);
+			if (ci->i_xattrs.blob)
+				ceph_buffer_put(ci->i_xattrs.blob);
+			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+			ci->i_xattrs.version = version;
+		}
+	}
+
+	/* size/ctime/mtime/atime? */
+	ceph_fill_file_size(inode, issued,
+			    le32_to_cpu(grant->truncate_seq),
+			    le64_to_cpu(grant->truncate_size), size);
+	ceph_decode_timespec(&mtime, &grant->mtime);
+	ceph_decode_timespec(&atime, &grant->atime);
+	ceph_decode_timespec(&ctime, &grant->ctime);
+	ceph_fill_file_time(inode, issued,
+			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
+			    &atime);
+
+	/* max size increase? */
+	if (max_size != ci->i_max_size) {
+		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
+		ci->i_max_size = max_size;
+		if (max_size >= ci->i_wanted_max_size) {
+			ci->i_wanted_max_size = 0;  /* reset */
+			ci->i_requested_max_size = 0;
+		}
+		wake = 1;
+	}
+
+	/* check cap bits */
+	wanted = __ceph_caps_wanted(ci);
+	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
+	dout(" my wanted = %s, used = %s, dirty %s\n",
+	     ceph_cap_string(wanted),
+	     ceph_cap_string(used),
+	     ceph_cap_string(dirty));
+	if (wanted != le32_to_cpu(grant->wanted)) {
+		dout("mds wanted %s -> %s\n",
+		     ceph_cap_string(le32_to_cpu(grant->wanted)),
+		     ceph_cap_string(wanted));
+		grant->wanted = cpu_to_le32(wanted);
+	}
+
+	cap->seq = seq;
+
+	/* file layout may have changed */
+	ci->i_layout = grant->layout;
+
+	/* revocation, grant, or no-op? */
+	if (cap->issued & ~newcaps) {
+		dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps));
+		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
+			writeback = 1; /* will delay ack */
+		else if (dirty & ~newcaps)
+			reply = 1;     /* initiate writeback in check_caps */
+		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
+			   revoked_rdcache)
+			reply = 2;     /* send revoke ack in check_caps */
+		cap->issued = newcaps;
+	} else if (cap->issued == newcaps) {
+		dout("caps unchanged: %s -> %s\n",
+		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+	} else {
+		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+		     ceph_cap_string(newcaps));
+		cap->issued = newcaps;
+		cap->implemented |= newcaps; /* add bits only, to
+					      * avoid stepping on a
+					      * pending revocation */
+		wake = 1;
+	}
+
+	spin_unlock(&inode->i_lock);
+	if (writeback) {
+		/*
+		 * queue inode for writeback: we can't actually call
+		 * filemap_write_and_wait, etc. from message handler
+		 * context.
+		 */
+		dout("queueing %p for writeback\n", inode);
+		if (ceph_queue_writeback(inode))
+			igrab(inode);
+	}
+	if (invalidate_async) {
+		dout("queueing %p for page invalidation\n", inode);
+		if (ceph_queue_page_invalidation(inode))
+			igrab(inode);
+	}
+	if (wake)
+		wake_up(&ci->i_cap_wq);
+	return reply;
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode,
+				 struct ceph_mds_caps *m,
+				 struct ceph_mds_session *session,
+				 struct ceph_cap *cap)
+	__releases(inode->i_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+	unsigned seq = le32_to_cpu(m->seq);
+	int dirty = le32_to_cpu(m->dirty);
+	int cleaned = 0;
+	u64 flush_tid = le64_to_cpu(m->client_tid);
+	int old_dirty = 0, new_dirty = 0;
+	int i;
+
+	for (i = 0; i < CEPH_CAP_BITS; i++)
+		if ((dirty & (1 << i)) &&
+		    flush_tid == ci->i_cap_flush_tid[i])
+			cleaned |= 1 << i;
+
+	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+	     " flushing %s -> %s\n",
+	     inode, session->s_mds, seq, ceph_cap_string(dirty),
+	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+		goto out;
+
+	old_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
+	ci->i_flushing_caps &= ~cleaned;
+	new_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
+
+	spin_lock(&mdsc->cap_dirty_lock);
+	if (ci->i_flushing_caps == 0) {
+		list_del_init(&ci->i_flushing_item);
+		if (!list_empty(&session->s_cap_flushing))
+			dout(" mds%d still flushing cap on %p\n",
+			     session->s_mds,
+			     &list_entry(session->s_cap_flushing.next,
+					 struct ceph_inode_info,
+					 i_flushing_item)->vfs_inode);
+		mdsc->num_cap_flushing--;
+		wake_up(&mdsc->cap_flushing_wq);
+		dout(" inode %p now !flushing\n", inode);
+	}
+	if (old_dirty && !new_dirty) {
+		dout(" inode %p now clean\n", inode);
+		list_del_init(&ci->i_dirty_item);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+	wake_up(&ci->i_cap_wq);
+
+out:
+	spin_unlock(&inode->i_lock);
+	if (old_dirty && !new_dirty)
+		iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller hold s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode,
+				     struct ceph_mds_caps *m,
+				     struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u64 follows = le64_to_cpu(m->snap_follows);
+	u64 flush_tid = le64_to_cpu(m->client_tid);
+	struct ceph_cap_snap *capsnap;
+	int drop = 0;
+
+	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+	     inode, ci, session->s_mds, follows);
+
+	spin_lock(&inode->i_lock);
+	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+		if (capsnap->follows == follows) {
+			if (capsnap->flush_tid != flush_tid) {
+				dout(" cap_snap %p follows %lld tid %lld !="
+				     " %lld\n", capsnap, follows,
+				     flush_tid, capsnap->flush_tid);
+				break;
+			}
+			WARN_ON(capsnap->dirty_pages || capsnap->writing);
+			dout(" removing cap_snap %p follows %lld\n",
+			     capsnap, follows);
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+			drop = 1;
+			break;
+		} else {
+			dout(" skipping cap_snap %p follows %lld\n",
+			     capsnap, capsnap->follows);
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	if (drop)
+		iput(inode);
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller hold s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+			     struct ceph_mds_caps *trunc,
+			     struct ceph_mds_session *session)
+	__releases(inode->i_lock)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	int seq = le32_to_cpu(trunc->seq);
+	u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+	u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+	u64 size = le64_to_cpu(trunc->size);
+	int implemented = 0;
+	int dirty = __ceph_caps_dirty(ci);
+	int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
+	int queue_trunc = 0;
+
+	issued |= implemented | dirty;
+
+	dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+	     inode, mds, seq, truncate_size, truncate_seq);
+	queue_trunc = ceph_fill_file_size(inode, issued,
+					  truncate_seq, truncate_size, size);
+	spin_unlock(&inode->i_lock);
+
+	if (queue_trunc)
+		if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+			       &ci->i_vmtruncate_work))
+			igrab(inode);
+}
+
+/*
+ * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
+ * different one.  If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+			      struct ceph_mds_session *session)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	unsigned mseq = le32_to_cpu(ex->migrate_seq);
+	struct ceph_cap *cap = NULL, *t;
+	struct rb_node *p;
+	int remember = 1;
+
+	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
+	     inode, ci, mds, mseq);
+
+	spin_lock(&inode->i_lock);
+
+	/* make sure we haven't seen a higher mseq */
+	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+		t = rb_entry(p, struct ceph_cap, ci_node);
+		if (ceph_seq_cmp(t->mseq, mseq) > 0) {
+			dout(" higher mseq on cap from mds%d\n",
+			     t->session->s_mds);
+			remember = 0;
+		}
+		if (t->session->s_mds == mds)
+			cap = t;
+	}
+
+	if (cap) {
+		if (remember) {
+			/* make note */
+			ci->i_cap_exporting_mds = mds;
+			ci->i_cap_exporting_mseq = mseq;
+			ci->i_cap_exporting_issued = cap->issued;
+		}
+		__ceph_remove_cap(cap, NULL);
+	} else {
+		WARN_ON(!cap);
+	}
+
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Handle cap IMPORT.  If there are temp bits from an older EXPORT,
+ * clean them up.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+			      struct inode *inode, struct ceph_mds_caps *im,
+			      struct ceph_mds_session *session,
+			      void *snaptrace, int snaptrace_len)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int mds = session->s_mds;
+	unsigned issued = le32_to_cpu(im->caps);
+	unsigned wanted = le32_to_cpu(im->wanted);
+	unsigned seq = le32_to_cpu(im->seq);
+	unsigned mseq = le32_to_cpu(im->migrate_seq);
+	u64 realmino = le64_to_cpu(im->realm);
+	u64 cap_id = le64_to_cpu(im->cap_id);
+
+	if (ci->i_cap_exporting_mds >= 0 &&
+	    ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
+		dout("handle_cap_import inode %p ci %p mds%d mseq %d"
+		     " - cleared exporting from mds%d\n",
+		     inode, ci, mds, mseq,
+		     ci->i_cap_exporting_mds);
+		ci->i_cap_exporting_issued = 0;
+		ci->i_cap_exporting_mseq = 0;
+		ci->i_cap_exporting_mds = -1;
+	} else {
+		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
+		     inode, ci, mds, mseq);
+	}
+
+	down_write(&mdsc->snap_rwsem);
+	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
+			       false);
+	downgrade_write(&mdsc->snap_rwsem);
+	ceph_add_cap(inode, session, cap_id, -1,
+		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
+		     NULL /* no caps context */);
+	try_flush_caps(inode, session, NULL);
+	up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+		      struct ceph_msg *msg)
+{
+	struct ceph_mds_client *mdsc = session->s_mdsc;
+	struct super_block *sb = mdsc->client->sb;
+	struct inode *inode;
+	struct ceph_cap *cap;
+	struct ceph_mds_caps *h;
+	int mds = le64_to_cpu(msg->hdr.src.name.num);
+	int op;
+	u32 seq;
+	struct ceph_vino vino;
+	u64 cap_id;
+	u64 size, max_size;
+	int check_caps = 0;
+	int r;
+
+	dout("handle_caps from mds%d\n", mds);
+
+	/* decode */
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	h = msg->front.iov_base;
+	op = le32_to_cpu(h->op);
+	vino.ino = le64_to_cpu(h->ino);
+	vino.snap = CEPH_NOSNAP;
+	cap_id = le64_to_cpu(h->cap_id);
+	seq = le32_to_cpu(h->seq);
+	size = le64_to_cpu(h->size);
+	max_size = le64_to_cpu(h->max_size);
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+	     (unsigned)seq);
+
+	/* lookup ino */
+	inode = ceph_find_inode(sb, vino);
+	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+	     vino.snap, inode);
+	if (!inode) {
+		dout(" i don't have ino %llx\n", vino.ino);
+		goto done;
+	}
+
+	/* these will work even if we don't have a cap yet */
+	switch (op) {
+	case CEPH_CAP_OP_FLUSHSNAP_ACK:
+		handle_cap_flushsnap_ack(inode, h, session);
+		goto done;
+
+	case CEPH_CAP_OP_EXPORT:
+		handle_cap_export(inode, h, session);
+		goto done;
+
+	case CEPH_CAP_OP_IMPORT:
+		handle_cap_import(mdsc, inode, h, session,
+				  msg->middle,
+				  le32_to_cpu(h->snap_trace_len));
+		check_caps = 1; /* we may have sent a RELEASE to the old auth */
+		goto done;
+	}
+
+	/* the rest require a cap */
+	spin_lock(&inode->i_lock);
+	cap = __get_cap_for_mds(ceph_inode(inode), mds);
+	if (!cap) {
+		dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+		     inode, ceph_ino(inode), ceph_snap(inode), mds);
+		spin_unlock(&inode->i_lock);
+		goto done;
+	}
+
+	/* note that each of these drops i_lock for us */
+	switch (op) {
+	case CEPH_CAP_OP_REVOKE:
+	case CEPH_CAP_OP_GRANT:
+		r = handle_cap_grant(inode, h, session, cap, msg->middle);
+		if (r == 1)
+			ceph_check_caps(ceph_inode(inode),
+					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+					session);
+		else if (r == 2)
+			ceph_check_caps(ceph_inode(inode),
+					CHECK_CAPS_NODELAY,
+					session);
+		break;
+
+	case CEPH_CAP_OP_FLUSH_ACK:
+		handle_cap_flush_ack(inode, h, session, cap);
+		break;
+
+	case CEPH_CAP_OP_TRUNC:
+		handle_cap_trunc(inode, h, session);
+		break;
+
+	default:
+		spin_unlock(&inode->i_lock);
+		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+		       ceph_cap_op_name(op));
+	}
+
+done:
+	mutex_unlock(&session->s_mutex);
+
+	if (check_caps)
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+	if (inode)
+		iput(inode);
+	return;
+
+bad:
+	pr_err("ceph_handle_caps: corrupt message\n");
+	return;
+}
+
+/*
+ * Delayed work handler to process end of delayed cap release LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc, int flushdirty)
+{
+	struct ceph_inode_info *ci;
+	int flags = CHECK_CAPS_NODELAY;
+
+	if (flushdirty)
+		flags |= CHECK_CAPS_FLUSH;
+
+	dout("check_delayed_caps\n");
+	while (1) {
+		spin_lock(&mdsc->cap_delay_lock);
+		if (list_empty(&mdsc->cap_delay_list))
+			break;
+		ci = list_first_entry(&mdsc->cap_delay_list,
+				      struct ceph_inode_info,
+				      i_cap_delay_list);
+		if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+		    time_before(jiffies, ci->i_hold_caps_max))
+			break;
+		list_del_init(&ci->i_cap_delay_list);
+		spin_unlock(&mdsc->cap_delay_lock);
+		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
+		ceph_check_caps(ci, flags, NULL);
+	}
+	spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Drop open file reference.  If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+	struct inode *inode = &ci->vfs_inode;
+	int last = 0;
+
+	spin_lock(&inode->i_lock);
+	dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
+	     ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
+	BUG_ON(ci->i_nr_by_mode[fmode] == 0);
+	if (--ci->i_nr_by_mode[fmode] == 0)
+		last++;
+	spin_unlock(&inode->i_lock);
+
+	if (last && ci->i_vino.snap == CEPH_NOSNAP)
+		ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
+int ceph_encode_inode_release(void **p, struct inode *inode,
+			      int mds, int drop, int unless, int force)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap;
+	struct ceph_mds_request_release *rel = *p;
+	int ret = 0;
+
+	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
+	     mds, ceph_cap_string(drop), ceph_cap_string(unless));
+
+	spin_lock(&inode->i_lock);
+	cap = __get_cap_for_mds(ci, mds);
+	if (cap && __cap_is_valid(cap)) {
+		if (force ||
+		    ((cap->issued & drop) &&
+		     (cap->issued & unless) == 0)) {
+			if ((cap->issued & drop) &&
+			    (cap->issued & unless) == 0) {
+				dout("encode_inode_release %p cap %p %s -> "
+				     "%s\n", inode, cap,
+				     ceph_cap_string(cap->issued),
+				     ceph_cap_string(cap->issued & ~drop));
+				cap->issued &= ~drop;
+				cap->implemented &= ~drop;
+				if (ci->i_ceph_flags & CEPH_I_NODELAY) {
+					int wanted = __ceph_caps_wanted(ci);
+					dout("  wanted %s -> %s (act %s)\n",
+					     ceph_cap_string(cap->mds_wanted),
+					     ceph_cap_string(cap->mds_wanted &
+							     ~wanted),
+					     ceph_cap_string(wanted));
+					cap->mds_wanted &= wanted;
+				}
+			} else {
+				dout("encode_inode_release %p cap %p %s"
+				     " (force)\n", inode, cap,
+				     ceph_cap_string(cap->issued));
+			}
+
+			rel->ino = cpu_to_le64(ceph_ino(inode));
+			rel->cap_id = cpu_to_le64(cap->cap_id);
+			rel->seq = cpu_to_le32(cap->seq);
+			rel->issue_seq = cpu_to_le32(cap->issue_seq),
+			rel->mseq = cpu_to_le32(cap->mseq);
+			rel->caps = cpu_to_le32(cap->issued);
+			rel->wanted = cpu_to_le32(cap->mds_wanted);
+			rel->dname_len = 0;
+			rel->dname_seq = 0;
+			*p += sizeof(*rel);
+			ret = 1;
+		} else {
+			dout("encode_inode_release %p cap %p %s\n",
+			     inode, cap, ceph_cap_string(cap->issued));
+		}
+	}
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+			       int mds, int drop, int unless)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct ceph_mds_request_release *rel = *p;
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
+	int force = 0;
+	int ret;
+
+	/*
+	 * force an record for the directory caps if we have a dentry lease.
+	 * this is racy (can't take i_lock and d_lock together), but it
+	 * doesn't have to be perfect; the mds will revoke anything we don't
+	 * release.
+	 */
+	spin_lock(&dentry->d_lock);
+	if (di->lease_session && di->lease_session->s_mds == mds)
+		force = 1;
+	spin_unlock(&dentry->d_lock);
+
+	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+
+	spin_lock(&dentry->d_lock);
+	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+		dout("encode_dentry_release %p mds%d seq %d\n",
+		     dentry, mds, (int)di->lease_seq);
+		rel->dname_len = cpu_to_le32(dentry->d_name.len);
+		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+		*p += dentry->d_name.len;
+		rel->dname_seq = cpu_to_le32(di->lease_seq);
+	}
+	spin_unlock(&dentry->d_lock);
+	return ret;
+}
-- 
cgit v1.2.2


From 963b61eb041e8850807d95f8d7a4c6a454c45000 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:12 -0700
Subject: ceph: snapshot management

Ceph snapshots rely on client cooperation in determining which
operations apply to which snapshots, and appropriately flushing
snapshotted data and metadata back to the OSD and MDS clusters.
Because snapshots apply to subtrees of the file hierarchy and can be
created at any time, there is a fair bit of bookkeeping required to
make this work.

Portions of the hierarchy that belong to the same set of snapshots
are described by a single 'snap realm.'  A 'snap context' describes
the set of snapshots that exist for a given file or directory.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 897 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 897 insertions(+)
 create mode 100644 fs/ceph/snap.c

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..2e3cb40b7e48
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,897 @@
+#include "ceph_debug.h"
+
+#include <linux/radix-tree.h>
+#include <linux/sort.h>
+
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Snapshots in ceph are driven in large part by cooperation from the
+ * client.  In contrast to local file systems or file servers that
+ * implement snapshots at a single point in the system, ceph's
+ * distributed access to storage requires clients to help decide
+ * whether a write logically occurs before or after a recently created
+ * snapshot.
+ *
+ * This provides a perfect instantanous client-wide snapshot.  Between
+ * clients, however, snapshots may appear to be applied at slightly
+ * different points in time, depending on delays in delivering the
+ * snapshot notification.
+ *
+ * Snapshots are _not_ file system-wide.  Instead, each snapshot
+ * applies to the subdirectory nested beneath some directory.  This
+ * effectively divides the hierarchy into multiple "realms," where all
+ * of the files contained by each realm share the same set of
+ * snapshots.  An individual realm's snap set contains snapshots
+ * explicitly created on that realm, as well as any snaps in its
+ * parent's snap set _after_ the point at which the parent became it's
+ * parent (due to, say, a rename).  Similarly, snaps from prior parents
+ * during the time intervals during which they were the parent are included.
+ *
+ * The client is spared most of this detail, fortunately... it must only
+ * maintains a hierarchy of realms reflecting the current parent/child
+ * realm relationship, and for each realm has an explicit list of snaps
+ * inherited from prior parents.
+ *
+ * A snap_realm struct is maintained for realms containing every inode
+ * with an open cap in the system.  (The needed snap realm information is
+ * provided by the MDS whenever a cap is issued, i.e., on open.)  A 'seq'
+ * version number is used to ensure that as realm parameters change (new
+ * snapshot, new parent, etc.) the client's realm hierarchy is updated.
+ *
+ * The realm hierarchy drives the generation of a 'snap context' for each
+ * realm, which simply lists the resulting set of snaps for the realm.  This
+ * is attached to any writes sent to OSDs.
+ */
+/*
+ * Unfortunately error handling is a bit mixed here.  If we get a snap
+ * update, but don't have enough memory to update our realm hierarchy,
+ * it's not clear what we can do about it (besides complaining to the
+ * console).
+ */
+
+
+/*
+ * increase ref count for the realm
+ *
+ * caller must hold snap_rwsem for write.
+ */
+void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+			 struct ceph_snap_realm *realm)
+{
+	dout("get_realm %p %d -> %d\n", realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+	/*
+	 * since we _only_ increment realm refs or empty the empty
+	 * list with snap_rwsem held, adjusting the empty list here is
+	 * safe.  we do need to protect against concurrent empty list
+	 * additions, however.
+	 */
+	if (atomic_read(&realm->nref) == 0) {
+		spin_lock(&mdsc->snap_empty_lock);
+		list_del_init(&realm->empty_item);
+		spin_unlock(&mdsc->snap_empty_lock);
+	}
+
+	atomic_inc(&realm->nref);
+}
+
+/*
+ * create and get the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *ceph_create_snap_realm(
+	struct ceph_mds_client *mdsc,
+	u64 ino)
+{
+	struct ceph_snap_realm *realm;
+
+	realm = kzalloc(sizeof(*realm), GFP_NOFS);
+	if (!realm)
+		return ERR_PTR(-ENOMEM);
+
+	radix_tree_insert(&mdsc->snap_realms, ino, realm);
+
+	atomic_set(&realm->nref, 0);    /* tree does not take a ref */
+	realm->ino = ino;
+	INIT_LIST_HEAD(&realm->children);
+	INIT_LIST_HEAD(&realm->child_item);
+	INIT_LIST_HEAD(&realm->empty_item);
+	INIT_LIST_HEAD(&realm->inodes_with_caps);
+	spin_lock_init(&realm->inodes_with_caps_lock);
+	dout("create_snap_realm %llx %p\n", realm->ino, realm);
+	return realm;
+}
+
+/*
+ * find and get (if found) the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+					       u64 ino)
+{
+	struct ceph_snap_realm *realm;
+
+	realm = radix_tree_lookup(&mdsc->snap_realms, ino);
+	if (realm)
+		dout("lookup_snap_realm %llx %p\n", realm->ino, realm);
+	return realm;
+}
+
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+			     struct ceph_snap_realm *realm);
+
+/*
+ * called with snap_rwsem (write)
+ */
+static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
+				 struct ceph_snap_realm *realm)
+{
+	dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+
+	radix_tree_delete(&mdsc->snap_realms, realm->ino);
+
+	if (realm->parent) {
+		list_del_init(&realm->child_item);
+		__put_snap_realm(mdsc, realm->parent);
+	}
+
+	kfree(realm->prior_parent_snaps);
+	kfree(realm->snaps);
+	ceph_put_snap_context(realm->cached_context);
+	kfree(realm);
+}
+
+/*
+ * caller holds snap_rwsem (write)
+ */
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+			     struct ceph_snap_realm *realm)
+{
+	dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+	if (atomic_dec_and_test(&realm->nref))
+		__destroy_snap_realm(mdsc, realm);
+}
+
+/*
+ * caller needn't hold any locks
+ */
+void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+			 struct ceph_snap_realm *realm)
+{
+	dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+	if (!atomic_dec_and_test(&realm->nref))
+		return;
+
+	if (down_write_trylock(&mdsc->snap_rwsem)) {
+		__destroy_snap_realm(mdsc, realm);
+		up_write(&mdsc->snap_rwsem);
+	} else {
+		spin_lock(&mdsc->snap_empty_lock);
+		list_add(&mdsc->snap_empty, &realm->empty_item);
+		spin_unlock(&mdsc->snap_empty_lock);
+	}
+}
+
+/*
+ * Clean up any realms whose ref counts have dropped to zero.  Note
+ * that this does not include realms who were created but not yet
+ * used.
+ *
+ * Called under snap_rwsem (write)
+ */
+static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snap_realm *realm;
+
+	spin_lock(&mdsc->snap_empty_lock);
+	while (!list_empty(&mdsc->snap_empty)) {
+		realm = list_first_entry(&mdsc->snap_empty,
+				   struct ceph_snap_realm, empty_item);
+		list_del(&realm->empty_item);
+		spin_unlock(&mdsc->snap_empty_lock);
+		__destroy_snap_realm(mdsc, realm);
+		spin_lock(&mdsc->snap_empty_lock);
+	}
+	spin_unlock(&mdsc->snap_empty_lock);
+}
+
+void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+	down_write(&mdsc->snap_rwsem);
+	__cleanup_empty_realms(mdsc);
+	up_write(&mdsc->snap_rwsem);
+}
+
+/*
+ * adjust the parent realm of a given @realm.  adjust child list, and parent
+ * pointers, and ref counts appropriately.
+ *
+ * return true if parent was changed, 0 if unchanged, <0 on error.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
+				    struct ceph_snap_realm *realm,
+				    u64 parentino)
+{
+	struct ceph_snap_realm *parent;
+
+	if (realm->parent_ino == parentino)
+		return 0;
+
+	parent = ceph_lookup_snap_realm(mdsc, parentino);
+	if (IS_ERR(parent))
+		return PTR_ERR(parent);
+	if (!parent) {
+		parent = ceph_create_snap_realm(mdsc, parentino);
+		if (IS_ERR(parent))
+			return PTR_ERR(parent);
+	}
+	dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
+	     realm->ino, realm, realm->parent_ino, realm->parent,
+	     parentino, parent);
+	if (realm->parent) {
+		list_del_init(&realm->child_item);
+		ceph_put_snap_realm(mdsc, realm->parent);
+	}
+	realm->parent_ino = parentino;
+	realm->parent = parent;
+	ceph_get_snap_realm(mdsc, parent);
+	list_add(&realm->child_item, &parent->children);
+	return 1;
+}
+
+
+static int cmpu64_rev(const void *a, const void *b)
+{
+	if (*(u64 *)a < *(u64 *)b)
+		return 1;
+	if (*(u64 *)a > *(u64 *)b)
+		return -1;
+	return 0;
+}
+
+/*
+ * build the snap context for a given realm.
+ */
+static int build_snap_context(struct ceph_snap_realm *realm)
+{
+	struct ceph_snap_realm *parent = realm->parent;
+	struct ceph_snap_context *snapc;
+	int err = 0;
+	int i;
+	int num = realm->num_prior_parent_snaps + realm->num_snaps;
+
+	/*
+	 * build parent context, if it hasn't been built.
+	 * conservatively estimate that all parent snaps might be
+	 * included by us.
+	 */
+	if (parent) {
+		if (!parent->cached_context) {
+			err = build_snap_context(parent);
+			if (err)
+				goto fail;
+		}
+		num += parent->cached_context->num_snaps;
+	}
+
+	/* do i actually need to update?  not if my context seq
+	   matches realm seq, and my parents' does to.  (this works
+	   because we rebuild_snap_realms() works _downward_ in
+	   hierarchy after each update.) */
+	if (realm->cached_context &&
+	    realm->cached_context->seq <= realm->seq &&
+	    (!parent ||
+	     realm->cached_context->seq <= parent->cached_context->seq)) {
+		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
+		     " (unchanged)\n",
+		     realm->ino, realm, realm->cached_context,
+		     realm->cached_context->seq,
+		     realm->cached_context->num_snaps);
+		return 0;
+	}
+
+	/* alloc new snap context */
+	err = -ENOMEM;
+	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+		goto fail;
+	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
+	if (!snapc)
+		goto fail;
+	atomic_set(&snapc->nref, 1);
+
+	/* build (reverse sorted) snap vector */
+	num = 0;
+	snapc->seq = realm->seq;
+	if (parent) {
+		/* include any of parent's snaps occuring _after_ my
+		   parent became my parent */
+		for (i = 0; i < parent->cached_context->num_snaps; i++)
+			if (parent->cached_context->snaps[i] >=
+			    realm->parent_since)
+				snapc->snaps[num++] =
+					parent->cached_context->snaps[i];
+		if (parent->cached_context->seq > snapc->seq)
+			snapc->seq = parent->cached_context->seq;
+	}
+	memcpy(snapc->snaps + num, realm->snaps,
+	       sizeof(u64)*realm->num_snaps);
+	num += realm->num_snaps;
+	memcpy(snapc->snaps + num, realm->prior_parent_snaps,
+	       sizeof(u64)*realm->num_prior_parent_snaps);
+	num += realm->num_prior_parent_snaps;
+
+	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
+	snapc->num_snaps = num;
+	dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
+	     realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
+
+	if (realm->cached_context)
+		ceph_put_snap_context(realm->cached_context);
+	realm->cached_context = snapc;
+	return 0;
+
+fail:
+	/*
+	 * if we fail, clear old (incorrect) cached_context... hopefully
+	 * we'll have better luck building it later
+	 */
+	if (realm->cached_context) {
+		ceph_put_snap_context(realm->cached_context);
+		realm->cached_context = NULL;
+	}
+	pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
+	       realm, err);
+	return err;
+}
+
+/*
+ * rebuild snap context for the given realm and all of its children.
+ */
+static void rebuild_snap_realms(struct ceph_snap_realm *realm)
+{
+	struct ceph_snap_realm *child;
+
+	dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
+	build_snap_context(realm);
+
+	list_for_each_entry(child, &realm->children, child_item)
+		rebuild_snap_realms(child);
+}
+
+
+/*
+ * helper to allocate and decode an array of snapids.  free prior
+ * instance, if any.
+ */
+static int dup_array(u64 **dst, __le64 *src, int num)
+{
+	int i;
+
+	kfree(*dst);
+	if (num) {
+		*dst = kcalloc(num, sizeof(u64), GFP_NOFS);
+		if (!*dst)
+			return -ENOMEM;
+		for (i = 0; i < num; i++)
+			(*dst)[i] = get_unaligned_le64(src + i);
+	} else {
+		*dst = NULL;
+	}
+	return 0;
+}
+
+
+/*
+ * When a snapshot is applied, the size/mtime inode metadata is queued
+ * in a ceph_cap_snap (one for each snapshot) until writeback
+ * completes and the metadata can be flushed back to the MDS.
+ *
+ * However, if a (sync) write is currently in-progress when we apply
+ * the snapshot, we have to wait until the write succeeds or fails
+ * (and a final size/mtime is known).  In this case the
+ * cap_snap->writing = 1, and is said to be "pending."  When the write
+ * finishes, we __ceph_finish_cap_snap().
+ *
+ * Caller must hold snap_rwsem for read (i.e., the realm topology won't
+ * change).
+ */
+void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+			 struct ceph_snap_context *snapc)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_cap_snap *capsnap;
+	int used;
+
+	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
+	if (!capsnap) {
+		pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
+		return;
+	}
+
+	spin_lock(&inode->i_lock);
+	used = __ceph_caps_used(ci);
+	if (__ceph_have_pending_cap_snap(ci)) {
+		/* there is no point in queuing multiple "pending" cap_snaps,
+		   as no new writes are allowed to start when pending, so any
+		   writes in progress now were started before the previous
+		   cap_snap.  lucky us. */
+		dout("queue_cap_snap %p snapc %p seq %llu used %d"
+		     " already pending\n", inode, snapc, snapc->seq, used);
+		kfree(capsnap);
+	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+		igrab(inode);
+
+		atomic_set(&capsnap->nref, 1);
+		capsnap->ci = ci;
+		INIT_LIST_HEAD(&capsnap->ci_item);
+		INIT_LIST_HEAD(&capsnap->flushing_item);
+
+		capsnap->follows = snapc->seq - 1;
+		capsnap->context = ceph_get_snap_context(snapc);
+		capsnap->issued = __ceph_caps_issued(ci, NULL);
+		capsnap->dirty = __ceph_caps_dirty(ci);
+
+		capsnap->mode = inode->i_mode;
+		capsnap->uid = inode->i_uid;
+		capsnap->gid = inode->i_gid;
+
+		/* fixme? */
+		capsnap->xattr_blob = NULL;
+		capsnap->xattr_len = 0;
+
+		/* dirty page count moved from _head to this cap_snap;
+		   all subsequent writes page dirties occur _after_ this
+		   snapshot. */
+		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+		ci->i_wrbuffer_ref_head = 0;
+		ceph_put_snap_context(ci->i_head_snapc);
+		ci->i_head_snapc = NULL;
+		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+		if (used & CEPH_CAP_FILE_WR) {
+			dout("queue_cap_snap %p cap_snap %p snapc %p"
+			     " seq %llu used WR, now pending\n", inode,
+			     capsnap, snapc, snapc->seq);
+			capsnap->writing = 1;
+		} else {
+			/* note mtime, size NOW. */
+			__ceph_finish_cap_snap(ci, capsnap);
+		}
+	} else {
+		dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+		kfree(capsnap);
+	}
+
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Finalize the size, mtime for a cap_snap.. that is, settle on final values
+ * to be used for the snapshot, to be flushed back to the mds.
+ *
+ * If capsnap can now be flushed, add to snap_flush list, and return 1.
+ *
+ * Caller must hold i_lock.
+ */
+int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+			    struct ceph_cap_snap *capsnap)
+{
+	struct inode *inode = &ci->vfs_inode;
+	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
+	BUG_ON(capsnap->writing);
+	capsnap->size = inode->i_size;
+	capsnap->mtime = inode->i_mtime;
+	capsnap->atime = inode->i_atime;
+	capsnap->ctime = inode->i_ctime;
+	capsnap->time_warp_seq = ci->i_time_warp_seq;
+	if (capsnap->dirty_pages) {
+		dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
+		     "still has %d dirty pages\n", inode, capsnap,
+		     capsnap->context, capsnap->context->seq,
+		     capsnap->size, capsnap->dirty_pages);
+		return 0;
+	}
+	dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
+	     inode, capsnap, capsnap->context,
+	     capsnap->context->seq, capsnap->size);
+
+	spin_lock(&mdsc->snap_flush_lock);
+	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+	spin_unlock(&mdsc->snap_flush_lock);
+	return 1;  /* caller may want to ceph_flush_snaps */
+}
+
+
+/*
+ * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
+ * the snap realm parameters from a given realm and all of its ancestors,
+ * up to the root.
+ *
+ * Caller must hold snap_rwsem for write.
+ */
+int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
+			   void *p, void *e, bool deletion)
+{
+	struct ceph_mds_snap_realm *ri;    /* encoded */
+	__le64 *snaps;                     /* encoded */
+	__le64 *prior_parent_snaps;        /* encoded */
+	struct ceph_snap_realm *realm;
+	int invalidate = 0;
+	int err = -ENOMEM;
+
+	dout("update_snap_trace deletion=%d\n", deletion);
+more:
+	ceph_decode_need(&p, e, sizeof(*ri), bad);
+	ri = p;
+	p += sizeof(*ri);
+	ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
+			    le32_to_cpu(ri->num_prior_parent_snaps)), bad);
+	snaps = p;
+	p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
+	prior_parent_snaps = p;
+	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
+
+	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
+	if (IS_ERR(realm)) {
+		err = PTR_ERR(realm);
+		goto fail;
+	}
+	if (!realm) {
+		realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
+		if (IS_ERR(realm)) {
+			err = PTR_ERR(realm);
+			goto fail;
+		}
+	}
+
+	if (le64_to_cpu(ri->seq) > realm->seq) {
+		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+		/*
+		 * if the realm seq has changed, queue a cap_snap for every
+		 * inode with open caps.  we do this _before_ we update
+		 * the realm info so that we prepare for writeback under the
+		 * _previous_ snap context.
+		 *
+		 * ...unless it's a snap deletion!
+		 */
+		if (!deletion) {
+			struct ceph_inode_info *ci;
+			struct inode *lastinode = NULL;
+
+			spin_lock(&realm->inodes_with_caps_lock);
+			list_for_each_entry(ci, &realm->inodes_with_caps,
+					    i_snap_realm_item) {
+				struct inode *inode = igrab(&ci->vfs_inode);
+				if (!inode)
+					continue;
+				spin_unlock(&realm->inodes_with_caps_lock);
+				if (lastinode)
+					iput(lastinode);
+				lastinode = inode;
+				ceph_queue_cap_snap(ci, realm->cached_context);
+				spin_lock(&realm->inodes_with_caps_lock);
+			}
+			spin_unlock(&realm->inodes_with_caps_lock);
+			if (lastinode)
+				iput(lastinode);
+			dout("update_snap_trace cap_snaps queued\n");
+		}
+
+	} else {
+		dout("update_snap_trace %llx %p seq %lld unchanged\n",
+		     realm->ino, realm, realm->seq);
+	}
+
+	/* ensure the parent is correct */
+	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
+	if (err < 0)
+		goto fail;
+	invalidate += err;
+
+	if (le64_to_cpu(ri->seq) > realm->seq) {
+		/* update realm parameters, snap lists */
+		realm->seq = le64_to_cpu(ri->seq);
+		realm->created = le64_to_cpu(ri->created);
+		realm->parent_since = le64_to_cpu(ri->parent_since);
+
+		realm->num_snaps = le32_to_cpu(ri->num_snaps);
+		err = dup_array(&realm->snaps, snaps, realm->num_snaps);
+		if (err < 0)
+			goto fail;
+
+		realm->num_prior_parent_snaps =
+			le32_to_cpu(ri->num_prior_parent_snaps);
+		err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
+				realm->num_prior_parent_snaps);
+		if (err < 0)
+			goto fail;
+
+		invalidate = 1;
+	} else if (!realm->cached_context) {
+		invalidate = 1;
+	}
+
+	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
+	     realm, invalidate, p, e);
+
+	if (p < e)
+		goto more;
+
+	/* invalidate when we reach the _end_ (root) of the trace */
+	if (invalidate)
+		rebuild_snap_realms(realm);
+
+	__cleanup_empty_realms(mdsc);
+	return 0;
+
+bad:
+	err = -EINVAL;
+fail:
+	pr_err("update_snap_trace error %d\n", err);
+	return err;
+}
+
+
+/*
+ * Send any cap_snaps that are queued for flush.  Try to carry
+ * s_mutex across multiple snap flushes to avoid locking overhead.
+ *
+ * Caller holds no locks.
+ */
+static void flush_snaps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	struct inode *inode;
+	struct ceph_mds_session *session = NULL;
+
+	dout("flush_snaps\n");
+	spin_lock(&mdsc->snap_flush_lock);
+	while (!list_empty(&mdsc->snap_flush_list)) {
+		ci = list_first_entry(&mdsc->snap_flush_list,
+				struct ceph_inode_info, i_snap_flush_item);
+		inode = &ci->vfs_inode;
+		igrab(inode);
+		spin_unlock(&mdsc->snap_flush_lock);
+		spin_lock(&inode->i_lock);
+		__ceph_flush_snaps(ci, &session);
+		spin_unlock(&inode->i_lock);
+		iput(inode);
+		spin_lock(&mdsc->snap_flush_lock);
+	}
+	spin_unlock(&mdsc->snap_flush_lock);
+
+	if (session) {
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
+	}
+	dout("flush_snaps done\n");
+}
+
+
+/*
+ * Handle a snap notification from the MDS.
+ *
+ * This can take two basic forms: the simplest is just a snap creation
+ * or deletion notification on an existing realm.  This should update the
+ * realm and its children.
+ *
+ * The more difficult case is realm creation, due to snap creation at a
+ * new point in the file hierarchy, or due to a rename that moves a file or
+ * directory into another realm.
+ */
+void ceph_handle_snap(struct ceph_mds_client *mdsc,
+		      struct ceph_msg *msg)
+{
+	struct super_block *sb = mdsc->client->sb;
+	struct ceph_mds_session *session;
+	int mds;
+	u64 split;
+	int op;
+	int trace_len;
+	struct ceph_snap_realm *realm = NULL;
+	void *p = msg->front.iov_base;
+	void *e = p + msg->front.iov_len;
+	struct ceph_mds_snap_head *h;
+	int num_split_inos, num_split_realms;
+	__le64 *split_inos = NULL, *split_realms = NULL;
+	int i;
+	int locked_rwsem = 0;
+
+	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
+		return;
+	mds = le64_to_cpu(msg->hdr.src.name.num);
+
+	/* decode */
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	h = p;
+	op = le32_to_cpu(h->op);
+	split = le64_to_cpu(h->split);   /* non-zero if we are splitting an
+					  * existing realm */
+	num_split_inos = le32_to_cpu(h->num_split_inos);
+	num_split_realms = le32_to_cpu(h->num_split_realms);
+	trace_len = le32_to_cpu(h->trace_len);
+	p += sizeof(*h);
+
+	dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
+	     ceph_snap_op_name(op), split, trace_len);
+
+	/* find session */
+	mutex_lock(&mdsc->mutex);
+	session = __ceph_lookup_mds_session(mdsc, mds);
+	mutex_unlock(&mdsc->mutex);
+	if (!session) {
+		dout("WTF, got snap but no session for mds%d\n", mds);
+		return;
+	}
+
+	mutex_lock(&session->s_mutex);
+	session->s_seq++;
+	mutex_unlock(&session->s_mutex);
+
+	down_write(&mdsc->snap_rwsem);
+	locked_rwsem = 1;
+
+	if (op == CEPH_SNAP_OP_SPLIT) {
+		struct ceph_mds_snap_realm *ri;
+
+		/*
+		 * A "split" breaks part of an existing realm off into
+		 * a new realm.  The MDS provides a list of inodes
+		 * (with caps) and child realms that belong to the new
+		 * child.
+		 */
+		split_inos = p;
+		p += sizeof(u64) * num_split_inos;
+		split_realms = p;
+		p += sizeof(u64) * num_split_realms;
+		ceph_decode_need(&p, e, sizeof(*ri), bad);
+		/* we will peek at realm info here, but will _not_
+		 * advance p, as the realm update will occur below in
+		 * ceph_update_snap_trace. */
+		ri = p;
+
+		realm = ceph_lookup_snap_realm(mdsc, split);
+		if (IS_ERR(realm))
+			goto out;
+		if (!realm) {
+			realm = ceph_create_snap_realm(mdsc, split);
+			if (IS_ERR(realm))
+				goto out;
+		}
+		ceph_get_snap_realm(mdsc, realm);
+
+		dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+		for (i = 0; i < num_split_inos; i++) {
+			struct ceph_vino vino = {
+				.ino = le64_to_cpu(split_inos[i]),
+				.snap = CEPH_NOSNAP,
+			};
+			struct inode *inode = ceph_find_inode(sb, vino);
+			struct ceph_inode_info *ci;
+
+			if (!inode)
+				continue;
+			ci = ceph_inode(inode);
+
+			spin_lock(&inode->i_lock);
+			if (!ci->i_snap_realm)
+				goto skip_inode;
+			/*
+			 * If this inode belongs to a realm that was
+			 * created after our new realm, we experienced
+			 * a race (due to another split notifications
+			 * arriving from a different MDS).  So skip
+			 * this inode.
+			 */
+			if (ci->i_snap_realm->created >
+			    le64_to_cpu(ri->created)) {
+				dout(" leaving %p in newer realm %llx %p\n",
+				     inode, ci->i_snap_realm->ino,
+				     ci->i_snap_realm);
+				goto skip_inode;
+			}
+			dout(" will move %p to split realm %llx %p\n",
+			     inode, realm->ino, realm);
+			/*
+			 * Remove the inode from the realm's inode
+			 * list, but don't add it to the new realm
+			 * yet.  We don't want the cap_snap to be
+			 * queued (again) by ceph_update_snap_trace()
+			 * below.  Queue it _now_, under the old context.
+			 */
+			list_del_init(&ci->i_snap_realm_item);
+			spin_unlock(&inode->i_lock);
+
+			ceph_queue_cap_snap(ci,
+					    ci->i_snap_realm->cached_context);
+
+			iput(inode);
+			continue;
+
+skip_inode:
+			spin_unlock(&inode->i_lock);
+			iput(inode);
+		}
+
+		/* we may have taken some of the old realm's children. */
+		for (i = 0; i < num_split_realms; i++) {
+			struct ceph_snap_realm *child =
+				ceph_lookup_snap_realm(mdsc,
+					   le64_to_cpu(split_realms[i]));
+			if (IS_ERR(child))
+				continue;
+			if (!child)
+				continue;
+			adjust_snap_realm_parent(mdsc, child, realm->ino);
+		}
+	}
+
+	/*
+	 * update using the provided snap trace. if we are deleting a
+	 * snap, we can avoid queueing cap_snaps.
+	 */
+	ceph_update_snap_trace(mdsc, p, e,
+			       op == CEPH_SNAP_OP_DESTROY);
+
+	if (op == CEPH_SNAP_OP_SPLIT) {
+		/*
+		 * ok, _now_ add the inodes into the new realm.
+		 */
+		for (i = 0; i < num_split_inos; i++) {
+			struct ceph_vino vino = {
+				.ino = le64_to_cpu(split_inos[i]),
+				.snap = CEPH_NOSNAP,
+			};
+			struct inode *inode = ceph_find_inode(sb, vino);
+			struct ceph_inode_info *ci;
+
+			if (!inode)
+				continue;
+			ci = ceph_inode(inode);
+			spin_lock(&inode->i_lock);
+			if (!ci->i_snap_realm)
+				goto split_skip_inode;
+			ceph_put_snap_realm(mdsc, ci->i_snap_realm);
+			spin_lock(&realm->inodes_with_caps_lock);
+			list_add(&ci->i_snap_realm_item,
+				 &realm->inodes_with_caps);
+			ci->i_snap_realm = realm;
+			spin_unlock(&realm->inodes_with_caps_lock);
+			ceph_get_snap_realm(mdsc, realm);
+split_skip_inode:
+			spin_unlock(&inode->i_lock);
+			iput(inode);
+		}
+
+		/* we took a reference when we created the realm, above */
+		ceph_put_snap_realm(mdsc, realm);
+	}
+
+	__cleanup_empty_realms(mdsc);
+
+	up_write(&mdsc->snap_rwsem);
+
+	flush_snaps(mdsc);
+	return;
+
+bad:
+	pr_err("corrupt snap message from mds%d\n", mds);
+out:
+	if (locked_rwsem)
+		up_write(&mdsc->snap_rwsem);
+	return;
+}
+
+
+
-- 
cgit v1.2.2


From 31b8006e1d79e127a776c9414e3e0b5f9508047e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:13 -0700
Subject: ceph: messenger library

A generic message passing library is used to communicate with all
other components in the Ceph file system.  The messenger library
provides ordered, reliable delivery of messages between two nodes in
the system.

This implementation is based on TCP.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/decode.h    |  136 ++++
 fs/ceph/messenger.c | 2019 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/messenger.h |  243 +++++++
 3 files changed, 2398 insertions(+)
 create mode 100644 fs/ceph/decode.h
 create mode 100644 fs/ceph/messenger.c
 create mode 100644 fs/ceph/messenger.h

(limited to 'fs')

diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..fc2769df062d
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,136 @@
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <asm/unaligned.h>
+
+/*
+ * in all cases,
+ *   void **p     pointer to position pointer
+ *   void *end    pointer to end of buffer (last byte + 1)
+ */
+
+/*
+ * bounds check input.
+ */
+#define ceph_decode_need(p, end, n, bad)		\
+	do {						\
+		if (unlikely(*(p) + (n) > (end))) 	\
+			goto bad;			\
+	} while (0)
+
+#define ceph_decode_64(p, v)					\
+	do {							\
+		v = get_unaligned_le64(*(p));			\
+		*(p) += sizeof(u64);				\
+	} while (0)
+#define ceph_decode_32(p, v)					\
+	do {							\
+		v = get_unaligned_le32(*(p));			\
+		*(p) += sizeof(u32);				\
+	} while (0)
+#define ceph_decode_16(p, v)					\
+	do {							\
+		v = get_unaligned_le16(*(p));			\
+		*(p) += sizeof(u16);				\
+	} while (0)
+#define ceph_decode_8(p, v)				\
+	do {						\
+		v = *(u8 *)*(p);			\
+		(*p)++;					\
+	} while (0)
+
+#define ceph_decode_copy(p, pv, n)			\
+	do {						\
+		memcpy(pv, *(p), n);			\
+		*(p) += n;				\
+	} while (0)
+
+/* bounds check too */
+#define ceph_decode_64_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u64), bad);	\
+		ceph_decode_64(p, v);				\
+	} while (0)
+#define ceph_decode_32_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u32), bad);	\
+		ceph_decode_32(p, v);				\
+	} while (0)
+#define ceph_decode_16_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u16), bad);	\
+		ceph_decode_16(p, v);				\
+	} while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad)		\
+	do {							\
+		ceph_decode_need(p, end, n, bad);		\
+		ceph_decode_copy(p, pv, n);			\
+	} while (0)
+
+/*
+ * struct ceph_timespec <-> struct timespec
+ */
+#define ceph_decode_timespec(ts, tv)					\
+	do {								\
+		(ts)->tv_sec = le32_to_cpu((tv)->tv_sec);		\
+		(ts)->tv_nsec = le32_to_cpu((tv)->tv_nsec);		\
+	} while (0)
+#define ceph_encode_timespec(tv, ts)				\
+	do {							\
+		(tv)->tv_sec = cpu_to_le32((ts)->tv_sec);	\
+		(tv)->tv_nsec = cpu_to_le32((ts)->tv_nsec);	\
+	} while (0)
+
+
+/*
+ * encoders
+ */
+#define ceph_encode_64(p, v)						\
+	do {								\
+		put_unaligned_le64(v, (__le64 *)*(p));			\
+		*(p) += sizeof(u64);					\
+	} while (0)
+#define ceph_encode_32(p, v)					\
+	do {							\
+		put_unaligned_le32(v, (__le32 *)*(p));		\
+		*(p) += sizeof(u32);				\
+	} while (0)
+#define ceph_encode_16(p, v)					\
+	do {							\
+		put_unaligned_le16(v), (__le16 *)*(p));		\
+	*(p) += sizeof(u16);					\
+	} while (0)
+#define ceph_encode_8(p, v)			  \
+	do {					  \
+		*(u8 *)*(p) = v;		  \
+		(*(p))++;			  \
+	} while (0)
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+					u64 ino, const char *path)
+{
+	u32 len = path ? strlen(path) : 0;
+	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
+	ceph_encode_64(p, ino);
+	ceph_encode_32(p, len);
+	if (len)
+		memcpy(*p, path, len);
+	*p += len;
+}
+
+static inline void ceph_encode_string(void **p, void *end,
+				      const char *s, u32 len)
+{
+	BUG_ON(*p + sizeof(len) + len > end);
+	ceph_encode_32(p, len);
+	if (len)
+		memcpy(*p, s, len);
+	*p += len;
+}
+
+
+#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..63f7f1359385
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2019 @@
+#include "ceph_debug.h"
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <net/tcp.h>
+
+#include "super.h"
+#include "messenger.h"
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system.  The messenger provides ordered and reliable
+ * delivery.  We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error).  Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+const char *ceph_name_type_str(int t)
+{
+	switch (t) {
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_ADMIN: return "admin";
+	default: return "???";
+	}
+}
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+static char addr_str[MAX_ADDR_STR][40];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *pr_addr(const struct sockaddr_storage *ss)
+{
+	int i;
+	char *s;
+	struct sockaddr_in *in4 = (void *)ss;
+	unsigned char *quad = (void *)&in4->sin_addr.s_addr;
+	struct sockaddr_in6 *in6 = (void *)ss;
+
+	spin_lock(&addr_str_lock);
+	i = last_addr_str++;
+	if (last_addr_str == MAX_ADDR_STR)
+		last_addr_str = 0;
+	spin_unlock(&addr_str_lock);
+	s = addr_str[i];
+
+	switch (ss->ss_family) {
+	case AF_INET:
+		sprintf(s, "%u.%u.%u.%u:%u",
+			(unsigned int)quad[0],
+			(unsigned int)quad[1],
+			(unsigned int)quad[2],
+			(unsigned int)quad[3],
+			(unsigned int)ntohs(in4->sin_port));
+		break;
+
+	case AF_INET6:
+		sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
+			in6->sin6_addr.s6_addr16[0],
+			in6->sin6_addr.s6_addr16[1],
+			in6->sin6_addr.s6_addr16[2],
+			in6->sin6_addr.s6_addr16[3],
+			in6->sin6_addr.s6_addr16[4],
+			in6->sin6_addr.s6_addr16[5],
+			in6->sin6_addr.s6_addr16[6],
+			in6->sin6_addr.s6_addr16[7],
+			(unsigned int)ntohs(in6->sin6_port));
+		break;
+
+	default:
+		sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+	}
+
+	return s;
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+int __init ceph_msgr_init(void)
+{
+	ceph_msgr_wq = create_workqueue("ceph-msgr");
+	if (IS_ERR(ceph_msgr_wq)) {
+		int ret = PTR_ERR(ceph_msgr_wq);
+		pr_err("msgr_init failed to create workqueue: %d\n", ret);
+		ceph_msgr_wq = NULL;
+		return ret;
+	}
+	return 0;
+}
+
+void ceph_msgr_exit(void)
+{
+	destroy_workqueue(ceph_msgr_wq);
+}
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+	if (sk->sk_state != TCP_CLOSE_WAIT) {
+		dout("ceph_data_ready on %p state = %lu, queueing work\n",
+		     con, con->state);
+		queue_con(con);
+	}
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	/* only queue to workqueue if there is data we want to write. */
+	if (test_bit(WRITE_PENDING, &con->state)) {
+		dout("ceph_write_space %p queueing write work\n", con);
+		queue_con(con);
+	} else {
+		dout("ceph_write_space %p nothing to write\n", con);
+	}
+
+	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
+	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+	struct ceph_connection *con =
+		(struct ceph_connection *)sk->sk_user_data;
+
+	dout("ceph_state_change %p state = %lu sk_state = %u\n",
+	     con, con->state, sk->sk_state);
+
+	if (test_bit(CLOSED, &con->state))
+		return;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		dout("ceph_state_change TCP_CLOSE\n");
+	case TCP_CLOSE_WAIT:
+		dout("ceph_state_change TCP_CLOSE_WAIT\n");
+		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+			if (test_bit(CONNECTING, &con->state))
+				con->error_msg = "connection failed";
+			else
+				con->error_msg = "socket closed";
+			queue_con(con);
+		}
+		break;
+	case TCP_ESTABLISHED:
+		dout("ceph_state_change TCP_ESTABLISHED\n");
+		queue_con(con);
+		break;
+	}
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+			       struct ceph_connection *con)
+{
+	struct sock *sk = sock->sk;
+	sk->sk_user_data = (void *)con;
+	sk->sk_data_ready = ceph_data_ready;
+	sk->sk_write_space = ceph_write_space;
+	sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+	struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+	struct socket *sock;
+	int ret;
+
+	BUG_ON(con->sock);
+	ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ret)
+		return ERR_PTR(ret);
+	con->sock = sock;
+	sock->sk->sk_allocation = GFP_NOFS;
+
+	set_sock_callbacks(sock, con);
+
+	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
+
+	ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+	if (ret == -EINPROGRESS) {
+		dout("connect %s EINPROGRESS sk_state = %u\n",
+		     pr_addr(&con->peer_addr.in_addr),
+		     sock->sk->sk_state);
+		ret = 0;
+	}
+	if (ret < 0) {
+		pr_err("connect %s error %d\n",
+		       pr_addr(&con->peer_addr.in_addr), ret);
+		sock_release(sock);
+		con->sock = NULL;
+		con->error_msg = "connect error";
+	}
+
+	if (ret < 0)
+		return ERR_PTR(ret);
+	return sock;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+	struct kvec iov = {buf, len};
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+	return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * write something.  @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+		     size_t kvlen, size_t len, int more)
+{
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+	if (more)
+		msg.msg_flags |= MSG_MORE;
+	else
+		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
+
+	return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+	int rc;
+
+	dout("con_close_socket on %p sock %p\n", con, con->sock);
+	if (!con->sock)
+		return 0;
+	set_bit(SOCK_CLOSED, &con->state);
+	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+	sock_release(con->sock);
+	con->sock = NULL;
+	clear_bit(SOCK_CLOSED, &con->state);
+	return rc;
+}
+
+/*
+ * Reset a connection.  Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+	list_del_init(&msg->list_head);
+	ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+	while (!list_empty(head)) {
+		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+							list_head);
+		ceph_msg_remove(msg);
+	}
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+	/* reset connection, out_queue, msg_ and connect_seq */
+	/* discard existing out_queue and msg_seq */
+	mutex_lock(&con->out_mutex);
+	ceph_msg_remove_list(&con->out_queue);
+	ceph_msg_remove_list(&con->out_sent);
+
+	con->connect_seq = 0;
+	con->out_seq = 0;
+	con->out_msg = NULL;
+	con->in_seq = 0;
+	mutex_unlock(&con->out_mutex);
+}
+
+/*
+ * mark a peer down.  drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+	reset_connection(con);
+	queue_con(con);
+}
+
+/*
+ * clean up connection state
+ */
+void ceph_con_shutdown(struct ceph_connection *con)
+{
+	dout("con_shutdown %p\n", con);
+	reset_connection(con);
+	set_bit(DEAD, &con->state);
+	con_close_socket(con); /* silently ignore errors */
+}
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+	dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
+	set_bit(OPENING, &con->state);
+	clear_bit(CLOSED, &con->state);
+	memcpy(&con->peer_addr, addr, sizeof(*addr));
+	queue_con(con);
+}
+
+/*
+ * generic get/put
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+	dout("con_get %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+	if (atomic_inc_not_zero(&con->nref))
+		return con;
+	return NULL;
+}
+
+void ceph_con_put(struct ceph_connection *con)
+{
+	dout("con_put %p nref = %d -> %d\n", con,
+	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+	BUG_ON(atomic_read(&con->nref) == 0);
+	if (atomic_dec_and_test(&con->nref)) {
+		ceph_con_shutdown(con);
+		kfree(con);
+	}
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+	dout("con_init %p\n", con);
+	memset(con, 0, sizeof(*con));
+	atomic_set(&con->nref, 1);
+	con->msgr = msgr;
+	mutex_init(&con->out_mutex);
+	INIT_LIST_HEAD(&con->out_queue);
+	INIT_LIST_HEAD(&con->out_sent);
+	INIT_DELAYED_WORK(&con->work, con_work);
+}
+
+
+/*
+ * We maintain a global counter to order connection attempts.  Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+	u32 ret;
+
+	spin_lock(&msgr->global_seq_lock);
+	if (msgr->global_seq < gt)
+		msgr->global_seq = gt;
+	ret = ++msgr->global_seq;
+	spin_unlock(&msgr->global_seq_lock);
+	return ret;
+}
+
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off.  Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+	struct ceph_msg *m = con->out_msg;
+
+	dout("prepare_write_message_footer %p\n", con);
+	con->out_kvec_is_msg = true;
+	con->out_kvec[v].iov_base = &m->footer;
+	con->out_kvec[v].iov_len = sizeof(m->footer);
+	con->out_kvec_bytes += sizeof(m->footer);
+	con->out_kvec_left++;
+	con->out_more = m->more_to_follow;
+	con->out_msg = NULL;   /* we're done with this one */
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	int v = 0;
+
+	con->out_kvec_bytes = 0;
+	con->out_kvec_is_msg = true;
+
+	/* Sneak an ack in there first?  If we can get it into the same
+	 * TCP packet that's a good thing. */
+	if (con->in_seq > con->in_seq_acked) {
+		con->in_seq_acked = con->in_seq;
+		con->out_kvec[v].iov_base = &tag_ack;
+		con->out_kvec[v++].iov_len = 1;
+		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+		con->out_kvec[v].iov_base = &con->out_temp_ack;
+		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	}
+
+	/* move message to sending/sent list */
+	m = list_first_entry(&con->out_queue,
+		       struct ceph_msg, list_head);
+	list_move_tail(&m->list_head, &con->out_sent);
+	con->out_msg = m;   /* we don't bother taking a reference here. */
+
+	m->hdr.seq = cpu_to_le64(++con->out_seq);
+
+	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+	     m, con->out_seq, le16_to_cpu(m->hdr.type),
+	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+	     le32_to_cpu(m->hdr.data_len),
+	     m->nr_pages);
+	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+	/* tag + hdr + front + middle */
+	con->out_kvec[v].iov_base = &tag_msg;
+	con->out_kvec[v++].iov_len = 1;
+	con->out_kvec[v].iov_base = &m->hdr;
+	con->out_kvec[v++].iov_len = sizeof(m->hdr);
+	con->out_kvec[v++] = m->front;
+	if (m->middle)
+		con->out_kvec[v++] = m->middle->vec;
+	con->out_kvec_left = v;
+	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+		(m->middle ? m->middle->vec.iov_len : 0);
+	con->out_kvec_cur = con->out_kvec;
+
+	/* fill in crc (except data pages), footer */
+	con->out_msg->hdr.crc =
+		cpu_to_le32(crc32c(0, (void *)&m->hdr,
+				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
+	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+	con->out_msg->footer.front_crc =
+		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+	if (m->middle)
+		con->out_msg->footer.middle_crc =
+			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+					   m->middle->vec.iov_len));
+	else
+		con->out_msg->footer.middle_crc = 0;
+	con->out_msg->footer.data_crc = 0;
+	dout("prepare_write_message front_crc %u data_crc %u\n",
+	     le32_to_cpu(con->out_msg->footer.front_crc),
+	     le32_to_cpu(con->out_msg->footer.middle_crc));
+
+	/* is there a data payload? */
+	if (le32_to_cpu(m->hdr.data_len) > 0) {
+		/* initialize page iterator */
+		con->out_msg_pos.page = 0;
+		con->out_msg_pos.page_pos =
+			le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+		con->out_msg_pos.data_pos = 0;
+		con->out_msg_pos.did_page_crc = 0;
+		con->out_more = 1;  /* data + footer will follow */
+	} else {
+		/* no, queue up footer too and be done */
+		prepare_write_message_footer(con, v);
+	}
+
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+	dout("prepare_write_ack %p %llu -> %llu\n", con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	con->out_kvec[0].iov_base = &tag_ack;
+	con->out_kvec[0].iov_len = 1;
+	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con->out_kvec[1].iov_base = &con->out_temp_ack;
+	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 1;  /* more will follow.. eventually.. */
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+	dout("prepare_write_keepalive %p\n", con);
+	con->out_kvec[0].iov_base = &tag_keepalive;
+	con->out_kvec[0].iov_len = 1;
+	con->out_kvec_left = 1;
+	con->out_kvec_bytes = 1;
+	con->out_kvec_cur = con->out_kvec;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_connect(struct ceph_messenger *msgr,
+				  struct ceph_connection *con)
+{
+	int len = strlen(CEPH_BANNER);
+	unsigned global_seq = get_global_seq(con->msgr, 0);
+	int proto;
+
+	switch (con->peer_name.type) {
+	case CEPH_ENTITY_TYPE_MON:
+		proto = CEPH_MONC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_OSD:
+		proto = CEPH_OSDC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_MDS:
+		proto = CEPH_MDSC_PROTOCOL;
+		break;
+	default:
+		BUG();
+	}
+
+	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+	     con->connect_seq, global_seq, proto);
+	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+	con->out_connect.global_seq = cpu_to_le32(global_seq);
+	con->out_connect.protocol_version = cpu_to_le32(proto);
+	con->out_connect.flags = 0;
+	if (test_bit(LOSSYTX, &con->state))
+		con->out_connect.flags = CEPH_MSG_CONNECT_LOSSY;
+
+	con->out_kvec[0].iov_base = CEPH_BANNER;
+	con->out_kvec[0].iov_len = len;
+	con->out_kvec[1].iov_base = &msgr->inst.addr;
+	con->out_kvec[1].iov_len = sizeof(msgr->inst.addr);
+	con->out_kvec[2].iov_base = &con->out_connect;
+	con->out_kvec[2].iov_len = sizeof(con->out_connect);
+	con->out_kvec_left = 3;
+	con->out_kvec_bytes = len + sizeof(msgr->inst.addr) +
+		sizeof(con->out_connect);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect_retry(struct ceph_messenger *msgr,
+					struct ceph_connection *con)
+{
+	dout("prepare_write_connect_retry %p\n", con);
+	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+	con->out_connect.global_seq =
+		cpu_to_le32(get_global_seq(con->msgr, 0));
+
+	con->out_kvec[0].iov_base = &con->out_connect;
+	con->out_kvec[0].iov_len = sizeof(con->out_connect);
+	con->out_kvec_left = 1;
+	con->out_kvec_bytes = sizeof(con->out_connect);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ *  1 -> done
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+	while (con->out_kvec_bytes > 0) {
+		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+				       con->out_kvec_left, con->out_kvec_bytes,
+				       con->out_more);
+		if (ret <= 0)
+			goto out;
+		con->out_kvec_bytes -= ret;
+		if (con->out_kvec_bytes == 0)
+			break;            /* done */
+		while (ret > 0) {
+			if (ret >= con->out_kvec_cur->iov_len) {
+				ret -= con->out_kvec_cur->iov_len;
+				con->out_kvec_cur++;
+				con->out_kvec_left--;
+			} else {
+				con->out_kvec_cur->iov_len -= ret;
+				con->out_kvec_cur->iov_base += ret;
+				ret = 0;
+				break;
+			}
+		}
+	}
+	con->out_kvec_left = 0;
+	con->out_kvec_is_msg = false;
+	ret = 1;
+out:
+	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+	     con->out_kvec_bytes, con->out_kvec_left, ret);
+	return ret;  /* done! */
+}
+
+/*
+ * Write as much message data payload as we can.  If we finish, queue
+ * up the footer.
+ *  1 -> done, footer is now queued in out_kvec[].
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+	size_t len;
+	int crc = con->msgr->nocrc;
+	int ret;
+
+	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+	     con->out_msg_pos.page_pos);
+
+	while (con->out_msg_pos.page < con->out_msg->nr_pages) {
+		struct page *page = NULL;
+		void *kaddr = NULL;
+
+		/*
+		 * if we are calculating the data crc (the default), we need
+		 * to map the page.  if our pages[] has been revoked, use the
+		 * zero page.
+		 */
+		if (msg->pages) {
+			page = msg->pages[con->out_msg_pos.page];
+			if (crc)
+				kaddr = kmap(page);
+		} else {
+			page = con->msgr->zero_page;
+			if (crc)
+				kaddr = page_address(con->msgr->zero_page);
+		}
+		len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
+			  (int)(data_len - con->out_msg_pos.data_pos));
+		if (crc && !con->out_msg_pos.did_page_crc) {
+			void *base = kaddr + con->out_msg_pos.page_pos;
+			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+			BUG_ON(kaddr == NULL);
+			con->out_msg->footer.data_crc =
+				cpu_to_le32(crc32c(tmpcrc, base, len));
+			con->out_msg_pos.did_page_crc = 1;
+		}
+
+		ret = kernel_sendpage(con->sock, page,
+				      con->out_msg_pos.page_pos, len,
+				      MSG_DONTWAIT | MSG_NOSIGNAL |
+				      MSG_MORE);
+
+		if (crc && msg->pages)
+			kunmap(page);
+
+		if (ret <= 0)
+			goto out;
+
+		con->out_msg_pos.data_pos += ret;
+		con->out_msg_pos.page_pos += ret;
+		if (ret == len) {
+			con->out_msg_pos.page_pos = 0;
+			con->out_msg_pos.page++;
+			con->out_msg_pos.did_page_crc = 0;
+		}
+	}
+
+	dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+	/* prepare and queue up footer, too */
+	if (!crc)
+		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+	con->out_kvec_bytes = 0;
+	con->out_kvec_left = 0;
+	con->out_kvec_cur = con->out_kvec;
+	prepare_write_message_footer(con, 0);
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+	int ret;
+
+	while (con->out_skip > 0) {
+		struct kvec iov = {
+			.iov_base = page_address(con->msgr->zero_page),
+			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+		};
+
+		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+		if (ret <= 0)
+			goto out;
+		con->out_skip -= ret;
+	}
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_connect(struct ceph_connection *con)
+{
+	dout("prepare_read_connect %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+	dout("prepare_read_tag %p\n", con);
+	con->in_base_pos = 0;
+	con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+	dout("prepare_read_message %p\n", con);
+	BUG_ON(con->in_msg != NULL);
+	con->in_base_pos = 0;
+	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+	return 0;
+}
+
+
+static int read_partial(struct ceph_connection *con,
+			int *to, int size, void *object)
+{
+	*to += size;
+	while (con->in_base_pos < *to) {
+		int left = *to - con->in_base_pos;
+		int have = size - left;
+		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	return 1;
+}
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_connect(struct ceph_connection *con)
+{
+	int ret, to = 0;
+
+	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+	/* peer's banner */
+	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+			   &con->actual_peer_addr);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+			   &con->peer_addr_for_me);
+	if (ret <= 0)
+		goto out;
+	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+	if (ret <= 0)
+		goto out;
+
+	dout("read_partial_connect %p connect_seq = %u, global_seq = %u\n",
+	     con, le32_to_cpu(con->in_reply.connect_seq),
+	     le32_to_cpu(con->in_reply.global_seq));
+out:
+	return ret;
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+		pr_err("connect to/from %s has bad banner\n",
+		       pr_addr(&con->peer_addr.in_addr));
+		con->error_msg = "protocol error, bad banner";
+		return -1;
+	}
+	return 0;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+	case AF_INET6:
+		return
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+		     ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+	}
+	return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *)ss)->sin_port;
+	case AF_INET6:
+		return ((struct sockaddr_in6 *)ss)->sin6_port;
+	}
+	return 0;
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		((struct sockaddr_in *)ss)->sin_port = htons(p);
+	case AF_INET6:
+		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+	}
+}
+
+/*
+ * Parse an ip[:port] list into an addr array.  Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+		   struct ceph_entity_addr *addr,
+		   int max_count, int *count)
+{
+	int i;
+	const char *p = c;
+
+	dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+	for (i = 0; i < max_count; i++) {
+		const char *ipend;
+		struct sockaddr_storage *ss = &addr[i].in_addr;
+		struct sockaddr_in *in4 = (void *)ss;
+		struct sockaddr_in6 *in6 = (void *)ss;
+		int port;
+
+		memset(ss, 0, sizeof(*ss));
+		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+			     ',', &ipend)) {
+			ss->ss_family = AF_INET;
+		} else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+				    ',', &ipend)) {
+			ss->ss_family = AF_INET6;
+		} else {
+			goto bad;
+		}
+		p = ipend;
+
+		/* port? */
+		if (p < end && *p == ':') {
+			port = 0;
+			p++;
+			while (p < end && *p >= '0' && *p <= '9') {
+				port = (port * 10) + (*p - '0');
+				p++;
+			}
+			if (port > 65535 || port == 0)
+				goto bad;
+		} else {
+			port = CEPH_MON_PORT;
+		}
+
+		addr_set_port(ss, port);
+
+		dout("parse_ips got %s\n", pr_addr(ss));
+
+		if (p == end)
+			break;
+		if (*p != ',')
+			goto bad;
+		p++;
+	}
+
+	if (p != end)
+		goto bad;
+
+	if (count)
+		*count = i + 1;
+	return 0;
+
+bad:
+	pr_err("parse_ips bad ip '%s'\n", c);
+	return -EINVAL;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+	if (verify_hello(con) < 0)
+		return -1;
+
+	/*
+	 * Make sure the other end is who we wanted.  note that the other
+	 * end may not yet know their ip address, so if it's 0.0.0.0, give
+	 * them the benefit of the doubt.
+	 */
+	if (!ceph_entity_addr_is_local(&con->peer_addr,
+				       &con->actual_peer_addr) &&
+	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+		pr_err("wrong peer, want %s/%d, "
+		       "got %s/%d, wtf\n",
+		       pr_addr(&con->peer_addr.in_addr),
+		       con->peer_addr.nonce,
+		       pr_addr(&con->actual_peer_addr.in_addr),
+		       con->actual_peer_addr.nonce);
+		con->error_msg = "protocol error, wrong peer";
+		return -1;
+	}
+
+	/*
+	 * did we learn our address?
+	 */
+	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+		int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+		memcpy(&con->msgr->inst.addr.in_addr,
+		       &con->peer_addr_for_me.in_addr,
+		       sizeof(con->peer_addr_for_me.in_addr));
+		addr_set_port(&con->msgr->inst.addr.in_addr, port);
+		dout("process_connect learned my addr is %s\n",
+		     pr_addr(&con->msgr->inst.addr.in_addr));
+	}
+
+	switch (con->in_reply.tag) {
+	case CEPH_MSGR_TAG_BADPROTOVER:
+		dout("process_connect got BADPROTOVER my %d != their %d\n",
+		     le32_to_cpu(con->out_connect.protocol_version),
+		     le32_to_cpu(con->in_reply.protocol_version));
+		pr_err("%s%lld %s protocol version mismatch,"
+		       " my %d != server's %d\n",
+		       ENTITY_NAME(con->peer_name),
+		       pr_addr(&con->peer_addr.in_addr),
+		       le32_to_cpu(con->out_connect.protocol_version),
+		       le32_to_cpu(con->in_reply.protocol_version));
+		con->error_msg = "protocol version mismatch";
+		if (con->ops->bad_proto)
+			con->ops->bad_proto(con);
+		reset_connection(con);
+		set_bit(CLOSED, &con->state);  /* in case there's queued work */
+		return -1;
+
+
+	case CEPH_MSGR_TAG_RESETSESSION:
+		/*
+		 * If we connected with a large connect_seq but the peer
+		 * has no record of a session with us (no connection, or
+		 * connect_seq == 0), they will send RESETSESION to indicate
+		 * that they must have reset their session, and may have
+		 * dropped messages.
+		 */
+		dout("process_connect got RESET peer seq %u\n",
+		     le32_to_cpu(con->in_connect.connect_seq));
+		pr_err("%s%lld %s connection reset\n",
+		       ENTITY_NAME(con->peer_name),
+		       pr_addr(&con->peer_addr.in_addr));
+		reset_connection(con);
+		prepare_write_connect_retry(con->msgr, con);
+		prepare_read_connect(con);
+
+		/* Tell ceph about it. */
+		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+		if (con->ops->peer_reset)
+			con->ops->peer_reset(con);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_SESSION:
+		/*
+		 * If we sent a smaller connect_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+		     le32_to_cpu(con->out_connect.connect_seq),
+		     le32_to_cpu(con->in_connect.connect_seq));
+		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+		prepare_write_connect_retry(con->msgr, con);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_GLOBAL:
+		/*
+		 * If we sent a smaller global_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY_GLOBAL my %u, peer_gseq = %u\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_connect.global_seq));
+		get_global_seq(con->msgr,
+			       le32_to_cpu(con->in_connect.global_seq));
+		prepare_write_connect_retry(con->msgr, con);
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_READY:
+		clear_bit(CONNECTING, &con->state);
+		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+			set_bit(LOSSYRX, &con->state);
+		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+		con->connect_seq++;
+		dout("process_connect got READY gseq %d cseq %d (%d)\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_reply.connect_seq),
+		     con->connect_seq);
+		WARN_ON(con->connect_seq !=
+			le32_to_cpu(con->in_reply.connect_seq));
+
+		con->delay = 0;  /* reset backoff memory */
+		prepare_read_tag(con);
+		break;
+
+	case CEPH_MSGR_TAG_WAIT:
+		/*
+		 * If there is a connection race (we are opening
+		 * connections to each other), one of us may just have
+		 * to WAIT.  This shouldn't happen if we are the
+		 * client.
+		 */
+		pr_err("process_connect peer connecting WAIT\n");
+
+	default:
+		pr_err("connect protocol error, will retry\n");
+		con->error_msg = "protocol error, garbage tag during connect";
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+	int to = 0;
+
+	return read_partial(con, &to, sizeof(con->in_temp_ack),
+			    &con->in_temp_ack);
+}
+
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	u64 ack = le64_to_cpu(con->in_temp_ack);
+	u64 seq;
+
+	mutex_lock(&con->out_mutex);
+	while (!list_empty(&con->out_sent)) {
+		m = list_first_entry(&con->out_sent, struct ceph_msg,
+				     list_head);
+		seq = le64_to_cpu(m->hdr.seq);
+		if (seq > ack)
+			break;
+		dout("got ack for seq %llu type %d at %p\n", seq,
+		     le16_to_cpu(m->hdr.type), m);
+		ceph_msg_remove(m);
+	}
+	mutex_unlock(&con->out_mutex);
+	prepare_read_tag(con);
+}
+
+
+
+
+
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m = con->in_msg;
+	void *p;
+	int ret;
+	int to, want, left;
+	unsigned front_len, middle_len, data_len, data_off;
+	int datacrc = con->msgr->nocrc;
+
+	dout("read_partial_message con %p msg %p\n", con, m);
+
+	/* header */
+	while (con->in_base_pos < sizeof(con->in_hdr)) {
+		left = sizeof(con->in_hdr) - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock,
+				       (char *)&con->in_hdr + con->in_base_pos,
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+		if (con->in_base_pos == sizeof(con->in_hdr)) {
+			u32 crc = crc32c(0, (void *)&con->in_hdr,
+				 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+			if (crc != le32_to_cpu(con->in_hdr.crc)) {
+				pr_err("read_partial_message bad hdr "
+				       " crc %u != expected %u\n",
+				       crc, con->in_hdr.crc);
+				return -EBADMSG;
+			}
+		}
+	}
+
+	front_len = le32_to_cpu(con->in_hdr.front_len);
+	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+		return -EIO;
+	middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+	data_len = le32_to_cpu(con->in_hdr.data_len);
+	if (data_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+
+	/* allocate message? */
+	if (!con->in_msg) {
+		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+		     con->in_hdr.front_len, con->in_hdr.data_len);
+		con->in_msg = con->ops->alloc_msg(con, &con->in_hdr);
+		if (!con->in_msg) {
+			/* skip this message */
+			dout("alloc_msg returned NULL, skipping message\n");
+			con->in_base_pos = -front_len - middle_len - data_len -
+				sizeof(m->footer);
+			con->in_tag = CEPH_MSGR_TAG_READY;
+			return 0;
+		}
+		if (IS_ERR(con->in_msg)) {
+			ret = PTR_ERR(con->in_msg);
+			con->in_msg = NULL;
+			con->error_msg = "out of memory for incoming message";
+			return ret;
+		}
+		m = con->in_msg;
+		m->front.iov_len = 0;    /* haven't read it yet */
+		memcpy(&m->hdr, &con->in_hdr, sizeof(con->in_hdr));
+	}
+
+	/* front */
+	while (m->front.iov_len < front_len) {
+		BUG_ON(m->front.iov_base == NULL);
+		left = front_len - m->front.iov_len;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)m->front.iov_base +
+				       m->front.iov_len, left);
+		if (ret <= 0)
+			return ret;
+		m->front.iov_len += ret;
+		if (m->front.iov_len == front_len)
+			con->in_front_crc = crc32c(0, m->front.iov_base,
+						      m->front.iov_len);
+	}
+
+	/* middle */
+	while (middle_len > 0 && (!m->middle ||
+				  m->middle->vec.iov_len < middle_len)) {
+		if (m->middle == NULL) {
+			ret = -EOPNOTSUPP;
+			if (con->ops->alloc_middle)
+				ret = con->ops->alloc_middle(con, m);
+			if (ret < 0) {
+				dout("alloc_middle failed, skipping payload\n");
+				con->in_base_pos = -middle_len - data_len
+					- sizeof(m->footer);
+				ceph_msg_put(con->in_msg);
+				con->in_msg = NULL;
+				con->in_tag = CEPH_MSGR_TAG_READY;
+				return 0;
+			}
+			m->middle->vec.iov_len = 0;
+		}
+		left = middle_len - m->middle->vec.iov_len;
+		ret = ceph_tcp_recvmsg(con->sock,
+				       (char *)m->middle->vec.iov_base +
+				       m->middle->vec.iov_len, left);
+		if (ret <= 0)
+			return ret;
+		m->middle->vec.iov_len += ret;
+		if (m->middle->vec.iov_len == middle_len)
+			con->in_middle_crc = crc32c(0, m->middle->vec.iov_base,
+						      m->middle->vec.iov_len);
+	}
+
+	/* (page) data */
+	data_off = le16_to_cpu(m->hdr.data_off);
+	if (data_len == 0)
+		goto no_data;
+
+	if (m->nr_pages == 0) {
+		con->in_msg_pos.page = 0;
+		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		con->in_msg_pos.data_pos = 0;
+		/* find pages for data payload */
+		want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+		ret = -1;
+		if (con->ops->prepare_pages)
+			ret = con->ops->prepare_pages(con, m, want);
+		if (ret < 0) {
+			dout("%p prepare_pages failed, skipping payload\n", m);
+			con->in_base_pos = -data_len - sizeof(m->footer);
+			ceph_msg_put(con->in_msg);
+			con->in_msg = NULL;
+			con->in_tag = CEPH_MSGR_TAG_READY;
+			return 0;
+		}
+		BUG_ON(m->nr_pages < want);
+	}
+	while (con->in_msg_pos.data_pos < data_len) {
+		left = min((int)(data_len - con->in_msg_pos.data_pos),
+			   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+		BUG_ON(m->pages == NULL);
+		p = kmap(m->pages[con->in_msg_pos.page]);
+		ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+				       left);
+		if (ret > 0 && datacrc)
+			con->in_data_crc =
+				crc32c(con->in_data_crc,
+					  p + con->in_msg_pos.page_pos, ret);
+		kunmap(m->pages[con->in_msg_pos.page]);
+		if (ret <= 0)
+			return ret;
+		con->in_msg_pos.data_pos += ret;
+		con->in_msg_pos.page_pos += ret;
+		if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+			con->in_msg_pos.page_pos = 0;
+			con->in_msg_pos.page++;
+		}
+	}
+
+no_data:
+	/* footer */
+	to = sizeof(m->hdr) + sizeof(m->footer);
+	while (con->in_base_pos < to) {
+		left = to - con->in_base_pos;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+				       (con->in_base_pos - sizeof(m->hdr)),
+				       left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+	     m, front_len, m->footer.front_crc, middle_len,
+	     m->footer.middle_crc, data_len, m->footer.data_crc);
+
+	/* crc ok? */
+	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+		pr_err("read_partial_message %p front crc %u != exp. %u\n",
+		       m, con->in_front_crc, m->footer.front_crc);
+		return -EBADMSG;
+	}
+	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+		pr_err("read_partial_message %p middle crc %u != exp %u\n",
+		       m, con->in_middle_crc, m->footer.middle_crc);
+		return -EBADMSG;
+	}
+	if (datacrc &&
+	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+		return -EBADMSG;
+	}
+
+	return 1; /* done! */
+}
+
+/*
+ * Process message.  This happens in the worker thread.  The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->in_msg;
+
+	con->in_msg = NULL;
+
+	/* if first message, set peer_name */
+	if (con->peer_name.type == 0)
+		con->peer_name = msg->hdr.src.name;
+
+	mutex_lock(&con->out_mutex);
+	con->in_seq++;
+	mutex_unlock(&con->out_mutex);
+
+	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+	     msg, le64_to_cpu(msg->hdr.seq),
+	     ENTITY_NAME(msg->hdr.src.name),
+	     le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.data_len),
+	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+	con->ops->dispatch(con, msg);
+	prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket.  Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+	struct ceph_messenger *msgr = con->msgr;
+	int ret = 1;
+
+	dout("try_write start %p state %lu nref %d\n", con, con->state,
+	     atomic_read(&con->nref));
+
+	mutex_lock(&con->out_mutex);
+more:
+	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+	/* open the socket first? */
+	if (con->sock == NULL) {
+		/*
+		 * if we were STANDBY and are reconnecting _this_
+		 * connection, bump connect_seq now.  Always bump
+		 * global_seq.
+		 */
+		if (test_and_clear_bit(STANDBY, &con->state))
+			con->connect_seq++;
+
+		prepare_write_connect(msgr, con);
+		prepare_read_connect(con);
+		set_bit(CONNECTING, &con->state);
+
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		dout("try_write initiating connect on %p new state %lu\n",
+		     con, con->state);
+		con->sock = ceph_tcp_connect(con);
+		if (IS_ERR(con->sock)) {
+			con->sock = NULL;
+			con->error_msg = "connect error";
+			ret = -1;
+			goto out;
+		}
+	}
+
+more_kvec:
+	/* kvec data queued? */
+	if (con->out_skip) {
+		ret = write_partial_skip(con);
+		if (ret <= 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_skip err %d\n", ret);
+			goto done;
+		}
+	}
+	if (con->out_kvec_left) {
+		ret = write_partial_kvec(con);
+		if (ret <= 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_kvec err %d\n", ret);
+			goto done;
+		}
+	}
+
+	/* msg pages? */
+	if (con->out_msg) {
+		ret = write_partial_msg_pages(con);
+		if (ret == 1)
+			goto more_kvec;  /* we need to send the footer, too! */
+		if (ret == 0)
+			goto done;
+		if (ret < 0) {
+			dout("try_write write_partial_msg_pages err %d\n",
+			     ret);
+			goto done;
+		}
+	}
+
+	if (!test_bit(CONNECTING, &con->state)) {
+		/* is anything else pending? */
+		if (!list_empty(&con->out_queue)) {
+			prepare_write_message(con);
+			goto more;
+		}
+		if (con->in_seq > con->in_seq_acked) {
+			prepare_write_ack(con);
+			goto more;
+		}
+		if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
+	}
+
+	/* Nothing to do! */
+	clear_bit(WRITE_PENDING, &con->state);
+	dout("try_write nothing else to write.\n");
+done:
+	ret = 0;
+out:
+	mutex_unlock(&con->out_mutex);
+	dout("try_write done on %p\n", con);
+	return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+	struct ceph_messenger *msgr;
+	int ret = -1;
+
+	if (!con->sock)
+		return 0;
+
+	if (test_bit(STANDBY, &con->state))
+		return 0;
+
+	dout("try_read start on %p\n", con);
+	msgr = con->msgr;
+
+more:
+	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+	     con->in_base_pos);
+	if (test_bit(CONNECTING, &con->state)) {
+		dout("try_read connecting\n");
+		ret = read_partial_connect(con);
+		if (ret <= 0)
+			goto done;
+		if (process_connect(con) < 0) {
+			ret = -1;
+			goto out;
+		}
+		goto more;
+	}
+
+	if (con->in_base_pos < 0) {
+		/*
+		 * skipping + discarding content.
+		 *
+		 * FIXME: there must be a better way to do this!
+		 */
+		static char buf[1024];
+		int skip = min(1024, -con->in_base_pos);
+		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+		if (ret <= 0)
+			goto done;
+		con->in_base_pos += ret;
+		if (con->in_base_pos)
+			goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_READY) {
+		/*
+		 * what's next?
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+		if (ret <= 0)
+			goto done;
+		dout("try_read got tag %d\n", (int)con->in_tag);
+		switch (con->in_tag) {
+		case CEPH_MSGR_TAG_MSG:
+			prepare_read_message(con);
+			break;
+		case CEPH_MSGR_TAG_ACK:
+			prepare_read_ack(con);
+			break;
+		case CEPH_MSGR_TAG_CLOSE:
+			set_bit(CLOSED, &con->state);   /* fixme */
+			goto done;
+		default:
+			goto bad_tag;
+		}
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+		ret = read_partial_message(con);
+		if (ret <= 0) {
+			switch (ret) {
+			case -EBADMSG:
+				con->error_msg = "bad crc";
+				ret = -EIO;
+				goto out;
+			case -EIO:
+				con->error_msg = "io error";
+				goto out;
+			default:
+				goto done;
+			}
+		}
+		if (con->in_tag == CEPH_MSGR_TAG_READY)
+			goto more;
+		process_message(con);
+		goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+		ret = read_partial_ack(con);
+		if (ret <= 0)
+			goto done;
+		process_ack(con);
+		goto more;
+	}
+
+done:
+	ret = 0;
+out:
+	dout("try_read done on %p\n", con);
+	return ret;
+
+bad_tag:
+	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+	con->error_msg = "protocol error, garbage tag";
+	ret = -1;
+	goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection.  Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY.  It
+ * clears QUEUED and does it's thing.  When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work.  If that fails (work is already queued, or BUSY)
+ * we give up (work also already being done or is queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+	if (test_bit(DEAD, &con->state)) {
+		dout("queue_con %p ignoring: DEAD\n",
+		     con);
+		return;
+	}
+
+	if (!con->ops->get(con)) {
+		dout("queue_con %p ref count 0\n", con);
+		return;
+	}
+
+	set_bit(QUEUED, &con->state);
+	if (test_bit(BUSY, &con->state)) {
+		dout("queue_con %p - already BUSY\n", con);
+		con->ops->put(con);
+	} else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+		dout("queue_con %p - already queued\n", con);
+		con->ops->put(con);
+	} else {
+		dout("queue_con %p\n", con);
+	}
+}
+
+/*
+ * Do some work on a connection.  Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+	struct ceph_connection *con = container_of(work, struct ceph_connection,
+						   work.work);
+	int backoff = 0;
+
+more:
+	if (test_and_set_bit(BUSY, &con->state) != 0) {
+		dout("con_work %p BUSY already set\n", con);
+		goto out;
+	}
+	dout("con_work %p start, clearing QUEUED\n", con);
+	clear_bit(QUEUED, &con->state);
+
+	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+		dout("con_work CLOSED\n");
+		con_close_socket(con);
+		goto done;
+	}
+	if (test_and_clear_bit(OPENING, &con->state)) {
+		/* reopen w/ new peer */
+		dout("con_work OPENING\n");
+		con_close_socket(con);
+	}
+
+	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+	    try_read(con) < 0 ||
+	    try_write(con) < 0) {
+		backoff = 1;
+		ceph_fault(con);     /* error/fault path */
+	}
+
+done:
+	clear_bit(BUSY, &con->state);
+	dout("con->state=%lu\n", con->state);
+	if (test_bit(QUEUED, &con->state)) {
+		if (!backoff) {
+			dout("con_work %p QUEUED reset, looping\n", con);
+			goto more;
+		}
+		dout("con_work %p QUEUED reset, but just faulted\n", con);
+		clear_bit(QUEUED, &con->state);
+	}
+	dout("con_work %p done\n", con);
+
+out:
+	con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler.  A retry mechanism is used with
+ * exponential backoff
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+	       pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	dout("fault %p state %lu to peer %s\n",
+	     con, con->state, pr_addr(&con->peer_addr.in_addr));
+
+	if (test_bit(LOSSYTX, &con->state)) {
+		dout("fault on LOSSYTX channel\n");
+		goto out;
+	}
+
+	clear_bit(BUSY, &con->state);  /* to avoid an improbable race */
+
+	con_close_socket(con);
+	con->in_msg = NULL;
+
+	/* If there are no messages in the queue, place the connection
+	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
+	mutex_lock(&con->out_mutex);
+	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
+		dout("fault setting STANDBY\n");
+		set_bit(STANDBY, &con->state);
+		mutex_unlock(&con->out_mutex);
+		goto out;
+	}
+
+	/* Requeue anything that hasn't been acked, and retry after a
+	 * delay. */
+	list_splice_init(&con->out_sent, &con->out_queue);
+	mutex_unlock(&con->out_mutex);
+
+	if (con->delay == 0)
+		con->delay = BASE_DELAY_INTERVAL;
+	else if (con->delay < MAX_DELAY_INTERVAL)
+		con->delay *= 2;
+
+	/* explicitly schedule work to try to reconnect again later. */
+	dout("fault queueing %p delay %lu\n", con, con->delay);
+	con->ops->get(con);
+	if (queue_delayed_work(ceph_msgr_wq, &con->work,
+			       round_jiffies_relative(con->delay)) == 0)
+		con->ops->put(con);
+
+out:
+	if (con->ops->fault)
+		con->ops->fault(con);
+}
+
+
+
+/*
+ * create a new messenger instance
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
+{
+	struct ceph_messenger *msgr;
+
+	msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+	if (msgr == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&msgr->global_seq_lock);
+
+	/* the zero page is needed if a request is "canceled" while the message
+	 * is being written over the socket */
+	msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!msgr->zero_page) {
+		kfree(msgr);
+		return ERR_PTR(-ENOMEM);
+	}
+	kmap(msgr->zero_page);
+
+	if (myaddr)
+		msgr->inst.addr = *myaddr;
+
+	/* select a random nonce */
+	get_random_bytes(&msgr->inst.addr.nonce,
+			 sizeof(msgr->inst.addr.nonce));
+
+	dout("messenger_create %p\n", msgr);
+	return msgr;
+}
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+	dout("destroy %p\n", msgr);
+	kunmap(msgr->zero_page);
+	__free_page(msgr->zero_page);
+	kfree(msgr);
+	dout("destroyed messenger %p\n", msgr);
+}
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	if (test_bit(CLOSED, &con->state)) {
+		dout("con_send %p closed, dropping %p\n", con, msg);
+		ceph_msg_put(msg);
+		return;
+	}
+
+	/* set src+dst */
+	msg->hdr.src = con->msgr->inst;
+	msg->hdr.orig_src = con->msgr->inst;
+	msg->hdr.dst_erank = con->peer_addr.erank;
+
+	/* queue */
+	mutex_lock(&con->out_mutex);
+	BUG_ON(!list_empty(&msg->list_head));
+	list_add_tail(&msg->list_head, &con->out_queue);
+	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.middle_len),
+	     le32_to_cpu(msg->hdr.data_len));
+	mutex_unlock(&con->out_mutex);
+
+	/* if there wasn't anything waiting to send before, queue
+	 * new work */
+	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	mutex_lock(&con->out_mutex);
+	if (!list_empty(&msg->list_head)) {
+		dout("con_revoke %p msg %p\n", con, msg);
+		list_del_init(&msg->list_head);
+		ceph_msg_put(msg);
+		msg->hdr.seq = 0;
+		if (con->out_msg == msg)
+			con->out_msg = NULL;
+		if (con->out_kvec_is_msg) {
+			con->out_skip = con->out_kvec_bytes;
+			con->out_kvec_is_msg = false;
+		}
+	} else {
+		dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+	}
+	mutex_unlock(&con->out_mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+	if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
+	    test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+		queue_con(con);
+}
+
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len,
+			      int page_len, int page_off, struct page **pages)
+{
+	struct ceph_msg *m;
+
+	m = kmalloc(sizeof(*m), GFP_NOFS);
+	if (m == NULL)
+		goto out;
+	atomic_set(&m->nref, 1);
+	INIT_LIST_HEAD(&m->list_head);
+
+	m->hdr.type = cpu_to_le16(type);
+	m->hdr.front_len = cpu_to_le32(front_len);
+	m->hdr.middle_len = 0;
+	m->hdr.data_len = cpu_to_le32(page_len);
+	m->hdr.data_off = cpu_to_le16(page_off);
+	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+	m->footer.front_crc = 0;
+	m->footer.middle_crc = 0;
+	m->footer.data_crc = 0;
+	m->front_max = front_len;
+	m->front_is_vmalloc = false;
+	m->more_to_follow = false;
+	m->pool = NULL;
+
+	/* front */
+	if (front_len) {
+		if (front_len > PAGE_CACHE_SIZE) {
+			m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+						      PAGE_KERNEL);
+			m->front_is_vmalloc = true;
+		} else {
+			m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+		}
+		if (m->front.iov_base == NULL) {
+			pr_err("msg_new can't allocate %d bytes\n",
+			     front_len);
+			goto out2;
+		}
+	} else {
+		m->front.iov_base = NULL;
+	}
+	m->front.iov_len = front_len;
+
+	/* middle */
+	m->middle = NULL;
+
+	/* data */
+	m->nr_pages = calc_pages_for(page_off, page_len);
+	m->pages = pages;
+
+	dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
+	     m->nr_pages);
+	return m;
+
+out2:
+	ceph_msg_put(m);
+out:
+	pr_err("msg_new can't create type %d len %d\n", type, front_len);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ */
+struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr)
+{
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	struct ceph_msg *msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+
+	if (!msg) {
+		pr_err("unable to allocate msg type %d len %d\n",
+		       type, front_len);
+		return ERR_PTR(-ENOMEM);
+	}
+	return msg;
+}
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg.  This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	int type = le16_to_cpu(msg->hdr.type);
+	int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+	     ceph_msg_type_name(type), middle_len);
+	BUG_ON(!middle_len);
+	BUG_ON(msg->middle);
+
+	msg->middle = ceph_buffer_new_alloc(middle_len, GFP_NOFS);
+	if (!msg->middle)
+		return -ENOMEM;
+	return 0;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+	dout("msg_kfree %p\n", m);
+	if (m->front_is_vmalloc)
+		vfree(m->front.iov_base);
+	else
+		kfree(m->front.iov_base);
+	kfree(m);
+}
+
+/*
+ * Drop a msg ref.  Destroy as needed.
+ */
+void ceph_msg_put(struct ceph_msg *m)
+{
+	dout("ceph_msg_put %p %d -> %d\n", m, atomic_read(&m->nref),
+	     atomic_read(&m->nref)-1);
+	if (atomic_read(&m->nref) <= 0) {
+		pr_err("bad ceph_msg_put on %p %llu %d=%s %d+%d\n",
+		       m, le64_to_cpu(m->hdr.seq),
+		       le16_to_cpu(m->hdr.type),
+		       ceph_msg_type_name(le16_to_cpu(m->hdr.type)),
+		       le32_to_cpu(m->hdr.front_len),
+		       le32_to_cpu(m->hdr.data_len));
+		WARN_ON(1);
+	}
+	if (atomic_dec_and_test(&m->nref)) {
+		dout("ceph_msg_put last one on %p\n", m);
+		WARN_ON(!list_empty(&m->list_head));
+
+		/* drop middle, data, if any */
+		if (m->middle) {
+			ceph_buffer_put(m->middle);
+			m->middle = NULL;
+		}
+		m->nr_pages = 0;
+		m->pages = NULL;
+
+		if (m->pool)
+			ceph_msgpool_put(m->pool, m);
+		else
+			ceph_msg_kfree(m);
+	}
+}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..dcd98b64dca9
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,243 @@
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+#include "types.h"
+#include "buffer.h"
+
+struct ceph_msg;
+struct ceph_connection;
+
+extern struct workqueue_struct *ceph_msgr_wq;       /* receive work queue */
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+	struct ceph_connection *(*get)(struct ceph_connection *);
+	void (*put)(struct ceph_connection *);
+
+	/* handle an incoming message. */
+	void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+	/* protocol version mismatch */
+	void (*bad_proto) (struct ceph_connection *con);
+
+	/* there was some error on the socket (disconnect, whatever) */
+	void (*fault) (struct ceph_connection *con);
+
+	/* a remote host as terminated a message exchange session, and messages
+	 * we sent (or they tried to send us) may be lost. */
+	void (*peer_reset) (struct ceph_connection *con);
+
+	struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
+					struct ceph_msg_header *hdr);
+	int (*alloc_middle) (struct ceph_connection *con,
+			     struct ceph_msg *msg);
+	/* an incoming message has a data payload; tell me what pages I
+	 * should read the data into. */
+	int (*prepare_pages) (struct ceph_connection *con, struct ceph_msg *m,
+			      int want);
+};
+
+extern const char *ceph_name_type_str(int t);
+
+/* use format string %s%d */
+#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+	struct ceph_entity_inst inst;    /* my name+address */
+	struct page *zero_page;          /* used in certain error cases */
+
+	bool nocrc;
+
+	/*
+	 * the global_seq counts connections i (attempt to) initiate
+	 * in order to disambiguate certain connect race conditions.
+	 */
+	u32 global_seq;
+	spinlock_t global_seq_lock;
+};
+
+/*
+ * a single message.  it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+	struct ceph_msg_header hdr;	/* header */
+	struct ceph_msg_footer footer;	/* footer */
+	struct kvec front;              /* unaligned blobs of message */
+	struct ceph_buffer *middle;
+	struct page **pages;            /* data payload.  NOT OWNER. */
+	unsigned nr_pages;              /* size of page array */
+	struct list_head list_head;
+	atomic_t nref;
+	bool front_is_vmalloc;
+	bool more_to_follow;
+	int front_max;
+
+	struct ceph_msgpool *pool;
+};
+
+struct ceph_msg_pos {
+	int page, page_pos;  /* which page; offset in page */
+	int data_pos;        /* offset in data payload */
+	int did_page_crc;    /* true if we've calculated crc for current page */
+};
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL	(HZ/2)
+#define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
+
+/*
+ * ceph_connection state bit flags
+ *
+ * QUEUED and BUSY are used together to ensure that only a single
+ * thread is currently opening, reading or writing data to the socket.
+ */
+#define LOSSYTX         0  /* we can close channel or drop messages on errors */
+#define LOSSYRX         1  /* peer may reset/drop messages */
+#define CONNECTING	2
+#define KEEPALIVE_PENDING      3
+#define WRITE_PENDING	4  /* we have data ready to send */
+#define QUEUED          5  /* there is work queued on this connection */
+#define BUSY            6  /* work is being done */
+#define STANDBY		8  /* no outgoing messages, socket closed.  we keep
+			    * the ceph_connection around to maintain shared
+			    * state with the peer. */
+#define CLOSED		10 /* we've closed the connection */
+#define SOCK_CLOSED	11 /* socket state changed to closed */
+#define REGISTERED      12 /* connection appears in con_tree */
+#define OPENING         13 /* open connection w/ (possibly new) peer */
+#define DEAD            14 /* dead, about to kfree */
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+	void *private;
+	atomic_t nref;
+
+	const struct ceph_connection_operations *ops;
+
+	struct ceph_messenger *msgr;
+	struct socket *sock;
+	unsigned long state;	/* connection state (see flags above) */
+	const char *error_msg;  /* error message, if any */
+
+	struct ceph_entity_addr peer_addr; /* peer address */
+	struct ceph_entity_name peer_name; /* peer name */
+	struct ceph_entity_addr peer_addr_for_me;
+	u32 connect_seq;      /* identify the most recent connection
+				 attempt for this connection, client */
+	u32 peer_global_seq;  /* peer's global seq for this connection */
+
+	/* out queue */
+	struct mutex out_mutex;
+	struct list_head out_queue;
+	struct list_head out_sent;   /* sending or sent but unacked */
+	u64 out_seq;		     /* last message queued for send */
+	u64 out_seq_sent;            /* last message sent */
+	bool out_keepalive_pending;
+
+	u64 in_seq, in_seq_acked;  /* last message received, acked */
+
+	/* connection negotiation temps */
+	char in_banner[CEPH_BANNER_MAX_LEN];
+	union {
+		struct {  /* outgoing connection */
+			struct ceph_msg_connect out_connect;
+			struct ceph_msg_connect_reply in_reply;
+		};
+		struct {  /* incoming */
+			struct ceph_msg_connect in_connect;
+			struct ceph_msg_connect_reply out_reply;
+		};
+	};
+	struct ceph_entity_addr actual_peer_addr;
+
+	/* message out temps */
+	struct ceph_msg *out_msg;        /* sending message (== tail of
+					    out_sent) */
+	struct ceph_msg_pos out_msg_pos;
+
+	struct kvec out_kvec[8],         /* sending header/footer data */
+		*out_kvec_cur;
+	int out_kvec_left;   /* kvec's left in out_kvec */
+	int out_skip;        /* skip this many bytes */
+	int out_kvec_bytes;  /* total bytes left */
+	bool out_kvec_is_msg; /* kvec refers to out_msg */
+	int out_more;        /* there is more data after the kvecs */
+	__le64 out_temp_ack; /* for writing an ack */
+
+	/* message in temps */
+	struct ceph_msg_header in_hdr;
+	struct ceph_msg *in_msg;
+	struct ceph_msg_pos in_msg_pos;
+	u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
+
+	char in_tag;         /* protocol control byte */
+	int in_base_pos;     /* bytes read */
+	__le64 in_temp_ack;  /* for reading an ack */
+
+	struct delayed_work work;	    /* send|recv work */
+	unsigned long       delay;          /* current delay interval */
+};
+
+
+extern const char *pr_addr(const struct sockaddr_storage *ss);
+extern int ceph_parse_ips(const char *c, const char *end,
+			  struct ceph_entity_addr *addr,
+			  int max_count, int *count);
+
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+
+extern struct ceph_messenger *ceph_messenger_create(
+	struct ceph_entity_addr *myaddr);
+extern void ceph_messenger_destroy(struct ceph_messenger *);
+
+extern void ceph_con_init(struct ceph_messenger *msgr,
+			  struct ceph_connection *con);
+extern void ceph_con_shutdown(struct ceph_connection *con);
+extern void ceph_con_open(struct ceph_connection *con,
+			  struct ceph_entity_addr *addr);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_keepalive(struct ceph_connection *con);
+extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
+extern void ceph_con_put(struct ceph_connection *con);
+
+extern struct ceph_msg *ceph_msg_new(int type, int front_len,
+				     int page_len, int page_off,
+				     struct page **pages);
+extern void ceph_msg_kfree(struct ceph_msg *m);
+
+extern struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				       struct ceph_msg_header *hdr);
+extern int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg);
+
+
+static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+	dout("ceph_msg_get %p %d -> %d\n", msg, atomic_read(&msg->nref),
+	     atomic_read(&msg->nref)+1);
+	atomic_inc(&msg->nref);
+	return msg;
+}
+extern void ceph_msg_put(struct ceph_msg *msg);
+
+#endif
-- 
cgit v1.2.2


From 8fc91fd85950d106883852c6d215614ec28cc92d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:13 -0700
Subject: ceph: message pools

The msgpool is a basic mempool_t-like structure to preallocate
messages we expect to receive over the wire.  This ensures we have the
necessary memory preallocated to process replies to requests, or to
process unsolicited messages from various servers.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/msgpool.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/msgpool.h |  26 +++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 fs/ceph/msgpool.c
 create mode 100644 fs/ceph/msgpool.h

(limited to 'fs')

diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..39d4d7ed82ce
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,167 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "msgpool.h"
+
+/*
+ * We use msg pools to preallocate memory for messages we expect to
+ * receive over the wire, to avoid getting ourselves into OOM
+ * conditions at unexpected times.  We take use a few different
+ * strategies:
+ *
+ *  - for request/response type interactions, we preallocate the
+ * memory needed for the response when we generate the request.
+ *
+ *  - for messages we can receive at any time from the MDS, we preallocate
+ * a pool of messages we can re-use.
+ *
+ *  - for writeback, we preallocate some number of messages to use for
+ * requests and their replies, so that we always make forward
+ * progress.
+ *
+ * The msgpool behaves like a mempool_t, but keeps preallocated
+ * ceph_msgs strung together on a list_head instead of using a pointer
+ * vector.  This avoids vector reallocation when we adjust the number
+ * of preallocated items (which happens frequently).
+ */
+
+
+/*
+ * Allocate or release as necessary to meet our target pool size.
+ */
+static int __fill_msgpool(struct ceph_msgpool *pool)
+{
+	struct ceph_msg *msg;
+
+	while (pool->num < pool->min) {
+		dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
+		     pool->min);
+		spin_unlock(&pool->lock);
+		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+		spin_lock(&pool->lock);
+		if (IS_ERR(msg))
+			return PTR_ERR(msg);
+		msg->pool = pool;
+		list_add(&msg->list_head, &pool->msgs);
+		pool->num++;
+	}
+	while (pool->num > pool->min) {
+		msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
+		dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
+		     pool->min, msg);
+		list_del_init(&msg->list_head);
+		pool->num--;
+		ceph_msg_kfree(msg);
+	}
+	return 0;
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+		      int front_len, int min, bool blocking)
+{
+	int ret;
+
+	dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
+	spin_lock_init(&pool->lock);
+	pool->front_len = front_len;
+	INIT_LIST_HEAD(&pool->msgs);
+	pool->num = 0;
+	pool->min = min;
+	pool->blocking = blocking;
+	init_waitqueue_head(&pool->wait);
+
+	spin_lock(&pool->lock);
+	ret = __fill_msgpool(pool);
+	spin_unlock(&pool->lock);
+	return ret;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+	dout("msgpool_destroy %p\n", pool);
+	spin_lock(&pool->lock);
+	pool->min = 0;
+	__fill_msgpool(pool);
+	spin_unlock(&pool->lock);
+}
+
+int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+{
+	int ret;
+
+	spin_lock(&pool->lock);
+	dout("msgpool_resv %p delta %d\n", pool, delta);
+	pool->min += delta;
+	ret = __fill_msgpool(pool);
+	spin_unlock(&pool->lock);
+	return ret;
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool)
+{
+	wait_queue_t wait;
+	struct ceph_msg *msg;
+
+	if (pool->blocking) {
+		/* mempool_t behavior; first try to alloc */
+		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+		if (!IS_ERR(msg))
+			return msg;
+	}
+
+	while (1) {
+		spin_lock(&pool->lock);
+		if (likely(pool->num)) {
+			msg = list_entry(pool->msgs.next, struct ceph_msg,
+					 list_head);
+			list_del_init(&msg->list_head);
+			pool->num--;
+			dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
+			     pool->num, pool->min);
+			spin_unlock(&pool->lock);
+			return msg;
+		}
+		pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
+		       pool->min, pool->blocking ? "waiting" : "failing");
+		spin_unlock(&pool->lock);
+
+		if (!pool->blocking) {
+			WARN_ON(1);
+
+			/* maybe we can allocate it now? */
+			msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+			if (!IS_ERR(msg))
+				return msg;
+
+			return ERR_PTR(-ENOMEM);
+		}
+
+		init_wait(&wait);
+		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+		schedule();
+		finish_wait(&pool->wait, &wait);
+	}
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+	spin_lock(&pool->lock);
+	if (pool->num < pool->min) {
+		ceph_msg_get(msg);   /* retake a single ref */
+		list_add(&msg->list_head, &pool->msgs);
+		pool->num++;
+		dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
+		     pool->num, pool->min);
+		spin_unlock(&pool->lock);
+		wake_up(&pool->wait);
+	} else {
+		dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
+		     pool->num, pool->min);
+		spin_unlock(&pool->lock);
+		ceph_msg_kfree(msg);
+	}
+}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..07a2decaa6d8
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,26 @@
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include "messenger.h"
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+	spinlock_t lock;
+	int front_len;          /* preallocated payload size */
+	struct list_head msgs;  /* msgs in the pool; each has 1 ref */
+	int num, min;           /* cur, min # msgs in the pool */
+	bool blocking;
+	wait_queue_head_t wait;
+};
+
+extern int ceph_msgpool_init(struct ceph_msgpool *pool,
+			     int front_len, int size, bool blocking);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
+
+#endif
-- 
cgit v1.2.2


From a8e63b7d51cce4557ee7bcd8f51be5cae8547d20 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:13 -0700
Subject: ceph: nfs re-export support

Basic NFS re-export support is included.  This mostly works.  However,
Ceph's MDS design precludes the ability to generate a (small)
filehandle that will be valid forever, so this is of limited utility.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/export.c | 223 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 223 insertions(+)
 create mode 100644 fs/ceph/export.c

(limited to 'fs')

diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..fc68e39cbad6
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,223 @@
+#include "ceph_debug.h"
+
+#include <linux/exportfs.h>
+#include <asm/unaligned.h>
+
+#include "super.h"
+
+/*
+ * NFS export support
+ *
+ * NFS re-export of a ceph mount is, at present, only semireliable.
+ * The basic issue is that the Ceph architectures doesn't lend itself
+ * well to generating filehandles that will remain valid forever.
+ *
+ * So, we do our best.  If you're lucky, your inode will be in the
+ * client's cache.  If it's not, and you have a connectable fh, then
+ * the MDS server may be able to find it for you.  Otherwise, you get
+ * ESTALE.
+ *
+ * There are ways to this more reliable, but in the non-connectable fh
+ * case, we won't every work perfectly, and in the connectable case,
+ * some changes are needed on the MDS side to work better.
+ */
+
+/*
+ * Basic fh
+ */
+struct ceph_nfs_fh {
+	u64 ino;
+} __attribute__ ((packed));
+
+/*
+ * Larger 'connectable' fh that includes parent ino and name hash.
+ * Use this whenever possible, as it works more reliably.
+ */
+struct ceph_nfs_confh {
+	u64 ino, parent_ino;
+	u32 parent_name_hash;
+} __attribute__ ((packed));
+
+static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
+			  int connectable)
+{
+	struct ceph_nfs_fh *fh = (void *)rawfh;
+	struct ceph_nfs_confh *cfh = (void *)rawfh;
+	struct dentry *parent = dentry->d_parent;
+	struct inode *inode = dentry->d_inode;
+	int type;
+
+	/* don't re-export snaps */
+	if (ceph_snap(inode) != CEPH_NOSNAP)
+		return -EINVAL;
+
+	if (*max_len >= sizeof(*cfh)) {
+		dout("encode_fh %p connectable\n", dentry);
+		cfh->ino = ceph_ino(dentry->d_inode);
+		cfh->parent_ino = ceph_ino(parent->d_inode);
+		cfh->parent_name_hash = parent->d_name.hash;
+		*max_len = sizeof(*cfh);
+		type = 2;
+	} else if (*max_len > sizeof(*fh)) {
+		if (connectable)
+			return -ENOSPC;
+		dout("encode_fh %p\n", dentry);
+		fh->ino = ceph_ino(dentry->d_inode);
+		*max_len = sizeof(*fh);
+		type = 1;
+	} else {
+		return -ENOSPC;
+	}
+	return type;
+}
+
+/*
+ * convert regular fh to dentry
+ *
+ * FIXME: we should try harder by querying the mds for the ino.
+ */
+static struct dentry *__fh_to_dentry(struct super_block *sb,
+				     struct ceph_nfs_fh *fh)
+{
+	struct inode *inode;
+	struct dentry *dentry;
+	struct ceph_vino vino;
+	int err;
+
+	dout("__fh_to_dentry %llx\n", fh->ino);
+	vino.ino = fh->ino;
+	vino.snap = CEPH_NOSNAP;
+	inode = ceph_find_inode(sb, vino);
+	if (!inode)
+		return ERR_PTR(-ESTALE);
+
+	dentry = d_obtain_alias(inode);
+	if (!dentry) {
+		pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
+		       fh->ino, inode);
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	err = ceph_init_dentry(dentry);
+
+	if (err < 0) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+	return dentry;
+}
+
+/*
+ * convert connectable fh to dentry
+ */
+static struct dentry *__cfh_to_dentry(struct super_block *sb,
+				      struct ceph_nfs_confh *cfh)
+{
+	struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
+	struct inode *inode;
+	struct dentry *dentry;
+	struct ceph_vino vino;
+	int err;
+
+	dout("__cfh_to_dentry %llx (%llx/%x)\n",
+	     cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
+
+	vino.ino = cfh->ino;
+	vino.snap = CEPH_NOSNAP;
+	inode = ceph_find_inode(sb, vino);
+	if (!inode) {
+		struct ceph_mds_request *req;
+
+		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
+					       USE_ANY_MDS);
+		if (IS_ERR(req))
+			return ERR_PTR(PTR_ERR(req));
+
+		req->r_ino1 = vino;
+		req->r_ino2.ino = cfh->parent_ino;
+		req->r_ino2.snap = CEPH_NOSNAP;
+		req->r_path2 = kmalloc(16, GFP_NOFS);
+		snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
+		req->r_num_caps = 1;
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		ceph_mdsc_put_request(req);
+		inode = ceph_find_inode(sb, vino);
+		if (!inode)
+			return ERR_PTR(err ? err : -ESTALE);
+	}
+
+	dentry = d_obtain_alias(inode);
+	if (!dentry) {
+		pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
+		       cfh->ino, inode);
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	err = ceph_init_dentry(dentry);
+	if (err < 0) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+	return dentry;
+}
+
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	if (fh_type == 1)
+		return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
+	else
+		return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
+}
+
+/*
+ * get parent, if possible.
+ *
+ * FIXME: we could do better by querying the mds to discover the
+ * parent.
+ */
+static struct dentry *ceph_fh_to_parent(struct super_block *sb,
+					 struct fid *fid,
+					int fh_len, int fh_type)
+{
+	struct ceph_nfs_confh *cfh = (void *)fid->raw;
+	struct ceph_vino vino;
+	struct inode *inode;
+	struct dentry *dentry;
+	int err;
+
+	if (fh_type == 1)
+		return ERR_PTR(-ESTALE);
+
+	pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
+		 cfh->parent_name_hash);
+
+	vino.ino = cfh->ino;
+	vino.snap = CEPH_NOSNAP;
+	inode = ceph_find_inode(sb, vino);
+	if (!inode)
+		return ERR_PTR(-ESTALE);
+
+	dentry = d_obtain_alias(inode);
+	if (!dentry) {
+		pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
+		       cfh->ino, inode);
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	err = ceph_init_dentry(dentry);
+	if (err < 0) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
+	return dentry;
+}
+
+const struct export_operations ceph_export_ops = {
+	.encode_fh = ceph_encode_fh,
+	.fh_to_dentry = ceph_fh_to_dentry,
+	.fh_to_parent = ceph_fh_to_parent,
+};
-- 
cgit v1.2.2


From 8f4e91dee2a245e4be6942f4a8d83a769e13a47d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:14 -0700
Subject: ceph: ioctls

A few Ceph ioctls for getting and setting file layout (striping)
parameters, and learning the identity and network address of the OSD a
given region of a file is stored on.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ioctl.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/ioctl.h |  39 ++++++++++++++
 2 files changed, 196 insertions(+)
 create mode 100644 fs/ceph/ioctl.c
 create mode 100644 fs/ceph/ioctl.h

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..e4f99eff5d93
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,157 @@
+#include <linux/in.h>
+
+#include "ioctl.h"
+#include "super.h"
+#include "ceph_debug.h"
+
+
+/*
+ * ioctls
+ */
+
+/*
+ * get and set the file layout
+ */
+static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
+{
+	struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+	struct ceph_ioctl_layout l;
+	int err;
+
+	err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+	if (!err) {
+		l.stripe_unit = ceph_file_layout_su(ci->i_layout);
+		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+		l.object_size = ceph_file_layout_object_size(ci->i_layout);
+		l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+		if (copy_to_user(arg, &l, sizeof(l)))
+			return -EFAULT;
+	}
+
+	return err;
+}
+
+static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_request *req;
+	struct ceph_ioctl_layout l;
+	int err, i;
+
+	/* copy and validate */
+	if (copy_from_user(&l, arg, sizeof(l)))
+		return -EFAULT;
+
+	if ((l.object_size & ~PAGE_MASK) ||
+	    (l.stripe_unit & ~PAGE_MASK) ||
+	    !l.stripe_unit ||
+	    (l.object_size &&
+	     (unsigned)l.object_size % (unsigned)l.stripe_unit))
+		return -EINVAL;
+
+	/* make sure it's a valid data pool */
+	if (l.data_pool > 0) {
+		mutex_lock(&mdsc->mutex);
+		err = -EINVAL;
+		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+				err = 0;
+				break;
+			}
+		mutex_unlock(&mdsc->mutex);
+		if (err)
+			return err;
+	}
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
+				       USE_AUTH_MDS);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+	req->r_inode = igrab(inode);
+	req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
+
+	req->r_args.setlayout.layout.fl_stripe_unit =
+		cpu_to_le32(l.stripe_unit);
+	req->r_args.setlayout.layout.fl_stripe_count =
+		cpu_to_le32(l.stripe_count);
+	req->r_args.setlayout.layout.fl_object_size =
+		cpu_to_le32(l.object_size);
+	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
+	req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32((s32)-1);
+
+	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+	ceph_mdsc_put_request(req);
+	return err;
+}
+
+/*
+ * Return object name, size/offset information, and location (OSD
+ * number, network address) for a given file offset.
+ */
+static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
+{
+	struct ceph_ioctl_dataloc dl;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+	u64 len = 1, olen;
+	u64 tmp;
+	struct ceph_object_layout ol;
+	union ceph_pg pgid;
+
+	/* copy and validate */
+	if (copy_from_user(&dl, arg, sizeof(dl)))
+		return -EFAULT;
+
+	down_read(&osdc->map_sem);
+	ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+				      &dl.object_no, &dl.object_offset, &olen);
+	dl.file_offset -= dl.object_offset;
+	dl.object_size = ceph_file_layout_object_size(ci->i_layout);
+	dl.block_size = ceph_file_layout_su(ci->i_layout);
+
+	/* block_offset = object_offset % block_size */
+	tmp = dl.object_offset;
+	dl.block_offset = do_div(tmp, dl.block_size);
+
+	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
+		 ceph_ino(inode), dl.object_no);
+	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+				osdc->osdmap);
+
+	pgid.pg64 = le64_to_cpu(ol.ol_pgid);
+	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+	if (dl.osd >= 0) {
+		struct ceph_entity_addr *a =
+			ceph_osd_addr(osdc->osdmap, dl.osd);
+		if (a)
+			memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
+	} else {
+		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
+	}
+	up_read(&osdc->map_sem);
+
+	/* send result back to user */
+	if (copy_to_user(arg, &dl, sizeof(dl)))
+		return -EFAULT;
+
+	return 0;
+}
+
+long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+	switch (cmd) {
+	case CEPH_IOC_GET_LAYOUT:
+		return ceph_ioctl_get_layout(file, (void __user *)arg);
+
+	case CEPH_IOC_SET_LAYOUT:
+		return ceph_ioctl_set_layout(file, (void __user *)arg);
+
+	case CEPH_IOC_GET_DATALOC:
+		return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+	}
+	return -ENOTTY;
+}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..3c511dab3730
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,39 @@
+#ifndef FS_CEPH_IOCTL_H
+#define FS_CEPH_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define CEPH_IOCTL_MAGIC 0x97
+
+/* just use u64 to align sanely on all archs */
+struct ceph_ioctl_layout {
+	__u64 stripe_unit, stripe_count, object_size;
+	__u64 data_pool;
+};
+
+#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1,		\
+				   struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2,		\
+				   struct ceph_ioctl_layout)
+
+/*
+ * Extract identity, address of the OSD and object storing a given
+ * file offset.
+ */
+struct ceph_ioctl_dataloc {
+	__u64 file_offset;           /* in+out: file offset */
+	__u64 object_offset;         /* out: offset in object */
+	__u64 object_no;             /* out: object # */
+	__u64 object_size;           /* out: object size */
+	char object_name[64];        /* out: object name */
+	__u64 block_offset;          /* out: offset in block */
+	__u64 block_size;            /* out: block length */
+	__s64 osd;                   /* out: osd # */
+	struct sockaddr_storage osd_addr; /* out: osd address */
+};
+
+#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3,	\
+				   struct ceph_ioctl_dataloc)
+
+#endif
-- 
cgit v1.2.2


From 76aa844d5b2fb8c839180d3f5874e333b297e5fd Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:14 -0700
Subject: ceph: debugfs

Basic state information is available via /sys/kernel/debug/ceph,
including instances of the client, fsids, current monitor, mds and osd
maps, outstanding server requests, and hooks to adjust debug levels.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c | 425 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 425 insertions(+)
 create mode 100644 fs/ceph/debugfs.c

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..9edbad32f116
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,425 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client*  - an instance of the ceph client
+ *      .../osdmap      - current osdmap
+ *      .../mdsmap      - current mdsmap
+ *      .../monmap      - current monmap
+ *      .../osdc        - active osd requests
+ *      .../mdsc        - active mds requests
+ *      .../monc        - mon client state
+ *      .../dentry_lru  - dump contents of dentry lru
+ *      .../caps        - expose cap (reservation) stats
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->monc.monmap == NULL)
+		return 0;
+
+	seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+	for (i = 0; i < client->monc.monmap->num_mon; i++) {
+		struct ceph_entity_inst *inst =
+			&client->monc.monmap->mon_inst[i];
+
+		seq_printf(s, "\t%s%lld\t%s\n",
+			   ENTITY_NAME(inst->name),
+			   pr_addr(&inst->addr.in_addr));
+	}
+	return 0;
+}
+
+static int mdsmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->mdsc.mdsmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
+	seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+	seq_printf(s, "session_timeout %d\n",
+		       client->mdsc.mdsmap->m_session_timeout);
+	seq_printf(s, "session_autoclose %d\n",
+		       client->mdsc.mdsmap->m_session_autoclose);
+	for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+		struct ceph_entity_addr *addr =
+			&client->mdsc.mdsmap->m_info[i].addr;
+		int state = client->mdsc.mdsmap->m_info[i].state;
+
+		seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+			       ceph_mds_state_name(state));
+	}
+	return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+	int i;
+	struct ceph_client *client = s->private;
+
+	if (client->osdc.osdmap == NULL)
+		return 0;
+	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+	seq_printf(s, "flags%s%s\n",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+		   " NEARFULL" : "",
+		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+		   " FULL" : "");
+	for (i = 0; i < client->osdc.osdmap->num_pools; i++) {
+		struct ceph_pg_pool_info *pool =
+			&client->osdc.osdmap->pg_pool[i];
+		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+			   i, pool->v.pg_num, pool->pg_num_mask,
+			   pool->v.lpg_num, pool->lpg_num_mask);
+	}
+	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+		struct ceph_entity_addr *addr =
+			&client->osdc.osdmap->osd_addr[i];
+		int state = client->osdc.osdmap->osd_state[i];
+		char sb[64];
+
+		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+			   i, pr_addr(&addr->in_addr),
+			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+			   ceph_osdmap_state_str(sb, sizeof(sb), state));
+	}
+	return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mon_statfs_request *req;
+	u64 nexttid = 0;
+	int got;
+	struct ceph_mon_client *monc = &client->monc;
+
+	mutex_lock(&monc->mutex);
+
+	if (monc->have_mdsmap)
+		seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+	if (monc->have_osdmap)
+		seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+	if (monc->want_next_osdmap)
+		seq_printf(s, "want next osdmap\n");
+
+	while (nexttid < monc->last_tid) {
+		got = radix_tree_gang_lookup(&monc->statfs_request_tree,
+					     (void **)&req, nexttid, 1);
+		if (got == 0)
+			break;
+		nexttid = req->tid + 1;
+
+		seq_printf(s, "%lld statfs\n", req->tid);
+	}
+	mutex_unlock(&monc->mutex);
+
+	return 0;
+}
+
+static int mdsc_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mds_request *req;
+	u64 nexttid = 0;
+	int got;
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	int pathlen;
+	u64 pathbase;
+	char *path;
+
+	mutex_lock(&mdsc->mutex);
+	while (nexttid < mdsc->last_tid) {
+		got = radix_tree_gang_lookup(&mdsc->request_tree,
+					     (void **)&req, nexttid, 1);
+		if (got == 0)
+			break;
+		nexttid = req->r_tid + 1;
+
+		if (req->r_request)
+			seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
+		else
+			seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+
+		seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
+
+		if (req->r_got_unsafe)
+			seq_printf(s, "\t(unsafe)");
+		else
+			seq_printf(s, "\t");
+
+		if (req->r_inode) {
+			seq_printf(s, " #%llx", ceph_ino(req->r_inode));
+		} else if (req->r_dentry) {
+			path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+						    &pathbase, 0);
+			spin_lock(&req->r_dentry->d_lock);
+			seq_printf(s, " #%llx/%.*s (%s)",
+				   ceph_ino(req->r_dentry->d_parent->d_inode),
+				   req->r_dentry->d_name.len,
+				   req->r_dentry->d_name.name,
+				   path ? path : "");
+			spin_unlock(&req->r_dentry->d_lock);
+			kfree(path);
+		} else if (req->r_path1) {
+			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
+				   req->r_path1);
+		}
+
+		if (req->r_old_dentry) {
+			path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+						    &pathbase, 0);
+			spin_lock(&req->r_old_dentry->d_lock);
+			seq_printf(s, " #%llx/%.*s (%s)",
+			   ceph_ino(req->r_old_dentry->d_parent->d_inode),
+				   req->r_old_dentry->d_name.len,
+				   req->r_old_dentry->d_name.name,
+				   path ? path : "");
+			spin_unlock(&req->r_old_dentry->d_lock);
+			kfree(path);
+		} else if (req->r_path2) {
+			if (req->r_ino2.ino)
+				seq_printf(s, " #%llx/%s", req->r_ino2.ino,
+					   req->r_path2);
+			else
+				seq_printf(s, " %s", req->r_path2);
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_osd_client *osdc = &client->osdc;
+	struct rb_node *p;
+
+	mutex_lock(&osdc->request_mutex);
+	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+		struct ceph_osd_request *req;
+		struct ceph_osd_request_head *head;
+		struct ceph_osd_op *op;
+		int num_ops;
+		int opcode, olen;
+		int i;
+
+		req = rb_entry(p, struct ceph_osd_request, r_node);
+
+		seq_printf(s, "%lld\tosd%d\t", req->r_tid,
+			   req->r_osd ? req->r_osd->o_osd : -1);
+
+		head = req->r_request->front.iov_base;
+		op = (void *)(head + 1);
+
+		num_ops = le16_to_cpu(head->num_ops);
+		olen = le32_to_cpu(head->object_len);
+		seq_printf(s, "%.*s", olen,
+			   (const char *)(head->ops + num_ops));
+
+		if (req->r_reassert_version.epoch)
+			seq_printf(s, "\t%u'%llu",
+			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+			   le64_to_cpu(req->r_reassert_version.version));
+		else
+			seq_printf(s, "\t");
+
+		for (i = 0; i < num_ops; i++) {
+			opcode = le16_to_cpu(op->op);
+			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+			op++;
+		}
+
+		seq_printf(s, "\n");
+	}
+	mutex_unlock(&osdc->request_mutex);
+	return 0;
+}
+
+static int caps_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = p;
+	int total, avail, used, reserved;
+
+	ceph_reservation_status(client, &total, &avail, &used, &reserved);
+	seq_printf(s, "total\t\t%d\n"
+		      "avail\t\t%d\n"
+		      "used\t\t%d\n"
+		      "reserved\t%d\n",
+		   total, avail, used, reserved);
+	return 0;
+}
+
+static int dentry_lru_show(struct seq_file *s, void *ptr)
+{
+	struct ceph_client *client = s->private;
+	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_dentry_info *di;
+
+	spin_lock(&mdsc->dentry_lru_lock);
+	list_for_each_entry(di, &mdsc->dentry_lru, lru) {
+		struct dentry *dentry = di->dentry;
+		seq_printf(s, "%p %p\t%.*s\n",
+			   di, dentry, dentry->d_name.len, dentry->d_name.name);
+	}
+	spin_unlock(&mdsc->dentry_lru_lock);
+
+	return 0;
+}
+
+#define DEFINE_SHOW_FUNC(name) 						\
+static int name##_open(struct inode *inode, struct file *file)		\
+{									\
+	struct seq_file *sf;						\
+	int ret;							\
+									\
+	ret = single_open(file, name, NULL);				\
+	sf = file->private_data;					\
+	sf->private = inode->i_private;					\
+	return ret;							\
+}									\
+									\
+static const struct file_operations name##_fops = {			\
+	.open		= name##_open,					\
+	.read		= seq_read,					\
+	.llseek		= seq_lseek,					\
+	.release	= single_release,				\
+};
+
+DEFINE_SHOW_FUNC(monmap_show)
+DEFINE_SHOW_FUNC(mdsmap_show)
+DEFINE_SHOW_FUNC(osdmap_show)
+DEFINE_SHOW_FUNC(monc_show)
+DEFINE_SHOW_FUNC(mdsc_show)
+DEFINE_SHOW_FUNC(osdc_show)
+DEFINE_SHOW_FUNC(dentry_lru_show)
+DEFINE_SHOW_FUNC(caps_show)
+
+int __init ceph_debugfs_init(void)
+{
+	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+	if (!ceph_debugfs_dir)
+		return -ENOMEM;
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+	debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	int ret = 0;
+	char name[80];
+
+	snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
+		 PR_FSID(&client->monc.monmap->fsid), client->whoami);
+
+	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+	if (!client->debugfs_dir)
+		goto out;
+
+	client->monc.debugfs_file = debugfs_create_file("monc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &monc_show_fops);
+	if (!client->monc.debugfs_file)
+		goto out;
+
+	client->mdsc.debugfs_file = debugfs_create_file("mdsc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &mdsc_show_fops);
+	if (!client->mdsc.debugfs_file)
+		goto out;
+
+	client->osdc.debugfs_file = debugfs_create_file("osdc",
+						      0600,
+						      client->debugfs_dir,
+						      client,
+						      &osdc_show_fops);
+	if (!client->osdc.debugfs_file)
+		goto out;
+
+	client->debugfs_monmap = debugfs_create_file("monmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&monmap_show_fops);
+	if (!client->debugfs_monmap)
+		goto out;
+
+	client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&mdsmap_show_fops);
+	if (!client->debugfs_mdsmap)
+		goto out;
+
+	client->debugfs_osdmap = debugfs_create_file("osdmap",
+					0600,
+					client->debugfs_dir,
+					client,
+					&osdmap_show_fops);
+	if (!client->debugfs_osdmap)
+		goto out;
+
+	client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+					0600,
+					client->debugfs_dir,
+					client,
+					&dentry_lru_show_fops);
+	if (!client->debugfs_dentry_lru)
+		goto out;
+
+	client->debugfs_caps = debugfs_create_file("caps",
+						   0400,
+						   client->debugfs_dir,
+						   client,
+						   &caps_show_fops);
+	if (!client->debugfs_caps)
+		goto out;
+
+	return 0;
+
+out:
+	ceph_debugfs_client_cleanup(client);
+	return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+	debugfs_remove(client->debugfs_caps);
+	debugfs_remove(client->debugfs_dentry_lru);
+	debugfs_remove(client->debugfs_osdmap);
+	debugfs_remove(client->debugfs_mdsmap);
+	debugfs_remove(client->debugfs_monmap);
+	debugfs_remove(client->osdc.debugfs_file);
+	debugfs_remove(client->mdsc.debugfs_file);
+	debugfs_remove(client->monc.debugfs_file);
+	debugfs_remove(client->debugfs_dir);
+}
+
-- 
cgit v1.2.2


From 9030aaf9bf0a1eee47a154c316c789e959638b0f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 11:31:15 -0700
Subject: ceph: Kconfig, Makefile

Kconfig options and Makefile.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/Kconfig       |  1 +
 fs/Makefile      |  1 +
 fs/ceph/Kconfig  | 26 ++++++++++++++++++++++++++
 fs/ceph/Makefile | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+)
 create mode 100644 fs/ceph/Kconfig
 create mode 100644 fs/ceph/Makefile

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d4bf8caad8d0..ca687092a334 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -234,6 +234,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..5ef73a0a9f25 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -124,3 +124,4 @@ obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_EXOFS_FS)          += exofs/
+obj-$(CONFIG_CEPH_FS)		+= ceph/
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..bc1fbd956187
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,26 @@
+config CEPH_FS
+        tristate "Ceph distributed file system (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	select LIBCRC32C
+	help
+	  Choose Y or M here to include support for mounting the
+	  experimental Ceph distributed file system.  Ceph is an extremely
+	  scalable file system designed to provide high performance,
+	  reliable access to petabytes of storage.
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+config CEPH_FS_PRETTYDEBUG
+	bool "Include file:line in ceph debug output"
+	depends on CEPH_FS
+	default n
+	help
+	  If you say Y here, debug output will include a filename and
+	  line to aid debugging.  This icnreases kernel size and slows
+	  execution slightly when debug call sites are enabled (e.g.,
+	  via CONFIG_DYNAMIC_DEBUG).
+
+	  If unsure, say N.
+
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..7da6d69dba29
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,36 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+	export.o caps.o snap.o xattr.o \
+	messenger.o msgpool.o buffer.o \
+	mds_client.o mdsmap.o \
+	mon_client.o \
+	osd_client.o osdmap.o crush/crush.o crush/mapper.o \
+	debugfs.o \
+	ceph_fs.o ceph_strings.o ceph_frag.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
+
+modules_install:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
+
+clean:
+	$(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
-- 
cgit v1.2.2


From e324b8f991679a43e09dd13500bf1988c0bfc0ea Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 6 Oct 2009 12:20:56 -0700
Subject: ceph: document shared files in README

Document files shared between kernel and user code trees.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/README | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 fs/ceph/README

(limited to 'fs')

diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..231a1df5eafe
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,17 @@
+#
+# The following files are shared by (and manually synchronized
+# between) the Ceph userland and kernel client.
+#
+# userland                  kernel
+src/include/ceph_fs.h	    fs/ceph/ceph_fs.h
+src/include/ceph_fs.cc	    fs/ceph/ceph_fs.c
+src/include/msgr.h	    fs/ceph/msgr.h
+src/include/rados.h	    fs/ceph/rados.h
+src/include/ceph_strings.cc fs/ceph/ceph_strings.c
+src/include/ceph_frag.h	    fs/ceph/ceph_frag.h
+src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
+src/crush/crush.c	    fs/ceph/crush/crush.c
+src/crush/crush.h	    fs/ceph/crush/crush.h
+src/crush/mapper.c	    fs/ceph/crush/mapper.c
+src/crush/mapper.h	    fs/ceph/crush/mapper.h
+src/crush/hash.h	    fs/ceph/crush/hash.h
-- 
cgit v1.2.2


From fa0b72e9e2900ee87886aaf8bc4c4701be1e081d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 7 Oct 2009 10:59:10 -0700
Subject: ceph: show meaningful version on module load

Kill the old git revision; print the ceph version and protocol
versions instead.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_ver.h | 6 ------
 fs/ceph/super.c    | 5 +++--
 2 files changed, 3 insertions(+), 8 deletions(-)
 delete mode 100644 fs/ceph/ceph_ver.h

(limited to 'fs')

diff --git a/fs/ceph/ceph_ver.h b/fs/ceph/ceph_ver.h
deleted file mode 100644
index 66c3727b671b..000000000000
--- a/fs/ceph/ceph_ver.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CEPH_VERSION_H
-#define __CEPH_VERSION_H
-
-#define CEPH_GIT_VER 335cd8f952b457095ea2a66aee3db50efb63c91d
-
-#endif
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0723fb6e96a1..b3404a319c22 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -16,7 +16,6 @@
 #include <linux/version.h>
 #include <linux/vmalloc.h>
 
-#include "ceph_ver.h"
 #include "decode.h"
 #include "super.h"
 #include "mon_client.h"
@@ -903,7 +902,9 @@ static int __init init_ceph(void)
 	if (ret)
 		goto out_icache;
 
-	pr_info("loaded (%s)\n", STRINGIFY(CEPH_GIT_VER));
+	pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
+		CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
+		CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
 	return 0;
 
 out_icache:
-- 
cgit v1.2.2


From b195befd9acb514dd2afb722e63fdd880ed63217 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 7 Oct 2009 10:59:30 -0700
Subject: ceph: include preferred_osd in file layout virtual xattr

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 8eaac04d1b87..65b3a84bbb2e 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -91,11 +91,18 @@ static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 				   size_t size)
 {
-	return snprintf(val, size,
+	int ret;
+
+	ret = snprintf(val, size,
 		"chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
 		(unsigned long long)ceph_file_layout_su(ci->i_layout),
 		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
 		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+	if (ceph_file_layout_pg_preferred(ci->i_layout))
+		ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
+			    (unsigned long long)ceph_file_layout_pg_preferred(
+				    ci->i_layout));
+	return ret;
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-- 
cgit v1.2.2


From b28813a61d6ffe05ad353a86965607bb7a7fd60f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 7 Oct 2009 10:59:34 -0700
Subject: ceph: gracefully avoid empty crush buckets

This avoids a divide by zero when the input and/or map are
malformed.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/crush/mapper.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 0f0730c62695..c268393adfcb 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -299,7 +299,7 @@ static int crush_choose(struct crush_map *map,
 	struct crush_bucket *in = bucket;
 	int r;
 	int i;
-	int item;
+	int item = 0;
 	int itemtype;
 	int collide, reject;
 	const int orig_tries = 5; /* attempts before we fall back to search */
@@ -316,6 +316,7 @@ static int crush_choose(struct crush_map *map,
 			/* choose through intervening buckets */
 			flocal = 0;
 			do {
+				collide = 0;
 				retry_bucket = 0;
 				r = rep;
 				if (in->alg == CRUSH_BUCKET_UNIFORM) {
@@ -340,6 +341,10 @@ static int crush_choose(struct crush_map *map,
 				}
 
 				/* bucket choose */
+				if (in->size == 0) {
+					reject = 1;
+					goto reject;
+				}
 				if (flocal >= (in->size>>1) &&
 				    flocal > orig_tries)
 					item = bucket_perm_choose(in, x, r);
@@ -363,7 +368,6 @@ static int crush_choose(struct crush_map *map,
 				}
 
 				/* collision? */
-				collide = 0;
 				for (i = 0; i < outpos; i++) {
 					if (out[i] == item) {
 						collide = 1;
@@ -388,6 +392,7 @@ static int crush_choose(struct crush_map *map,
 						reject = 0;
 				}
 
+reject:
 				if (reject || collide) {
 					ftotal++;
 					flocal++;
-- 
cgit v1.2.2


From e251e288082d5e89604eee1fef0c31bed1fe8f02 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 7 Oct 2009 16:38:19 -0700
Subject: ceph: fix mdsmap decoding when multiple mds's are present

A misplaced sizeof() around namelen was throwing things off.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mdsmap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 15913cbeb289..09180d8fafe4 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -85,28 +85,28 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		void *pexport_targets = NULL;
 
 		ceph_decode_need(p, end, sizeof(addr) + 1 + sizeof(u32), bad);
-		*p += sizeof(addr);          /* skip addr key */
+		ceph_decode_copy(p, &addr, sizeof(addr));
 		ceph_decode_8(p, infoversion);
 		ceph_decode_32(p, namelen);  /* skip mds name */
 		*p += namelen;
 
 		ceph_decode_need(p, end,
-				 5*sizeof(u32) + sizeof(u64) +
+				 4*sizeof(u32) + sizeof(u64) +
 				 sizeof(addr) + sizeof(struct ceph_timespec),
 				 bad);
 		ceph_decode_32(p, mds);
 		ceph_decode_32(p, inc);
 		ceph_decode_32(p, state);
 		ceph_decode_64(p, state_seq);
-		ceph_decode_copy(p, &addr, sizeof(addr));
+		*p += sizeof(addr);
 		*p += sizeof(struct ceph_timespec);
 		*p += sizeof(u32);
 		ceph_decode_32_safe(p, end, namelen, bad);
-		*p += sizeof(namelen);
+		*p += namelen;
 		if (infoversion >= 2) {
 			ceph_decode_32_safe(p, end, num_export_targets, bad);
 			pexport_targets = *p;
-			*p += sizeof(num_export_targets * sizeof(u32));
+			*p += num_export_targets * sizeof(u32);
 		} else {
 			num_export_targets = 0;
 		}
-- 
cgit v1.2.2


From 0656d11ba6ffa3dee0e8916a1903f96185651217 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 8 Oct 2009 10:25:46 -0700
Subject: ceph: renew mon subscription before it expires

Be conservative: renew subscription once half the interval has expired.

Do not reuse sub expiration to control hunting.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index b0c95cec5df8..9c34df17fa4b 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -213,7 +213,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 		monc->hunting = false;
 	}
 	dout("handle_subscribe_ack after %d seconds\n", seconds);
-	monc->sub_renew_after = monc->sub_sent + seconds*HZ - 1;
+	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
 	monc->sub_sent = 0;
 	mutex_unlock(&monc->mutex);
 	return;
@@ -512,7 +512,7 @@ static void delayed_work(struct work_struct *work)
 	if (monc->want_mount) {
 		__request_mount(monc);
 	} else {
-		if (__sub_expired(monc)) {
+		if (monc->hunting) {
 			__close_session(monc);
 			__open_session(monc);  /* continue hunting */
 		} else {
-- 
cgit v1.2.2


From c1ea8823be69ebebaface912142190e910711984 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 8 Oct 2009 16:55:47 -0700
Subject: ceph: fix osd request submission race

The osd request submission path registers the request, drops and retakes
the request_mutex, then sends it to the OSD.  A racing kick_requests could
sent it during that interval, causing the same msg to be sent twice and
BUGing in the msgr.

Fix by only sending the message if it hasn't been touched by other
threads.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 978593a4f466..d14019dd6868 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -837,7 +837,8 @@ static void kick_requests(struct ceph_osd_client *osdc,
 		}
 
 kick:
-		dout("kicking tid %llu osd%d\n", req->r_tid, req->r_osd->o_osd);
+		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+		     req->r_osd->o_osd);
 		req->r_flags |= CEPH_OSD_FLAG_RETRY;
 		err = __send_request(osdc, req);
 		if (err) {
@@ -1016,7 +1017,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 			    struct ceph_osd_request *req,
 			    bool nofail)
 {
-	int rc;
+	int rc = 0;
 
 	req->r_request->pages = req->r_pages;
 	req->r_request->nr_pages = req->r_num_pages;
@@ -1025,15 +1026,22 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 
 	down_read(&osdc->map_sem);
 	mutex_lock(&osdc->request_mutex);
-	rc = __send_request(osdc, req);
-	if (rc) {
-		if (nofail) {
-			dout("osdc_start_request failed send, marking %lld\n",
-			     req->r_tid);
-			req->r_resend = true;
-			rc = 0;
-		} else {
-			__unregister_request(osdc, req);
+	/*
+	 * a racing kick_requests() may have sent the message for us
+	 * while we dropped request_mutex above, so only send now if
+	 * the request still han't been touched yet.
+	 */
+	if (req->r_sent == 0) {
+		rc = __send_request(osdc, req);
+		if (rc) {
+			if (nofail) {
+				dout("osdc_start_request failed send, "
+				     " marking %lld\n", req->r_tid);
+				req->r_resend = true;
+				rc = 0;
+			} else {
+				__unregister_request(osdc, req);
+			}
 		}
 	}
 	mutex_unlock(&osdc->request_mutex);
-- 
cgit v1.2.2


From 0ba6478df7c6bef0f4b2625554545f941f89fb97 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 8 Oct 2009 16:57:16 -0700
Subject: ceph: revoke osd request message on request completion

If an osd has failed or returned and a request has been sent twice, it's
possible to get a reply and unregister the request while the request
message is queued for delivery.  Since the message references the caller's
page vector, we need to revoke it before completing.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index d14019dd6868..0aea8afaa072 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -469,10 +469,15 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 	rb_erase(&req->r_node, &osdc->requests);
 	osdc->num_requests--;
 
-	list_del_init(&req->r_osd_item);
-	if (list_empty(&req->r_osd->o_requests))
-		remove_osd(osdc, req->r_osd);
-	req->r_osd = NULL;
+	if (req->r_osd) {
+		/* make sure the original request isn't in flight. */
+		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+		list_del_init(&req->r_osd_item);
+		if (list_empty(&req->r_osd->o_requests))
+			remove_osd(osdc, req->r_osd);
+		req->r_osd = NULL;
+	}
 
 	ceph_osdc_put_request(req);
 
-- 
cgit v1.2.2


From 991abb6ecfc8edf9863aa6a3f43249e63f9d4d4e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 8 Oct 2009 22:22:37 -0700
Subject: ceph: fail gracefully on corrupt osdmap (bad pg_temp mapping)

Return an error and report a corrupt map instead of crying BUG().

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index e38fe6309b1c..342e5f80996b 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -366,8 +366,8 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 /*
  * Insert a new pg_temp mapping
  */
-static void __insert_pg_mapping(struct ceph_pg_mapping *new,
-				struct rb_root *root)
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+			       struct rb_root *root)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
@@ -381,11 +381,12 @@ static void __insert_pg_mapping(struct ceph_pg_mapping *new,
 		else if (new->pgid > pg->pgid)
 			p = &(*p)->rb_right;
 		else
-			BUG();
+			return -EEXIST;
 	}
 
 	rb_link_node(&new->node, parent, p);
 	rb_insert_color(&new->node, root);
+	return 0;
 }
 
 /*
@@ -481,7 +482,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		for (j = 0; j < n; j++)
 			ceph_decode_32(p, pg->osds[j]);
 
-		__insert_pg_mapping(pg, &map->pg_temp);
+		err = __insert_pg_mapping(pg, &map->pg_temp);
+		if (err)
+			goto bad;
 		dout(" added pg_temp %llx len %d\n", pgid, len);
 	}
 
@@ -681,7 +684,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			pg->len = pglen;
 			for (j = 0; j < len; j++)
 				ceph_decode_32(p, pg->osds[j]);
-			__insert_pg_mapping(pg, &map->pg_temp);
+			err = __insert_pg_mapping(pg, &map->pg_temp);
+			if (err)
+				goto bad;
 			dout(" added pg_temp %llx len %d\n", pgid, pglen);
 		}
 	}
-- 
cgit v1.2.2


From 81b024e70fed635a2cf5a4bf911db1649bb005f5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 9 Oct 2009 10:29:18 -0700
Subject: ceph: reset osd session on fault, not peer_reset

The peer_reset just takes longer (until we reconnect and discover the osd
dropped the session... which it will).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 0aea8afaa072..4a8b4f08d4ae 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -294,10 +294,7 @@ __lookup_request_ge(struct ceph_osd_client *osdc,
 
 
 /*
- * The messaging layer will reconnect to the osd as needed.  If the
- * session has dropped, the OSD will have dropped the session state,
- * and we'll get notified by the messaging layer.  If that happens, we
- * need to resubmit all requests for that osd.
+ * If the osd connection drops, we need to resubmit all requests.
  */
 static void osd_reset(struct ceph_connection *con)
 {
@@ -1301,7 +1298,7 @@ const static struct ceph_connection_operations osd_con_ops = {
 	.put = put_osd_con,
 	.dispatch = dispatch,
 	.alloc_msg = alloc_msg,
-	.peer_reset = osd_reset,
+	.fault = osd_reset,
 	.alloc_middle = ceph_alloc_middle,
 	.prepare_pages = prepare_pages,
 };
-- 
cgit v1.2.2


From 266673db423e6ab247170d6c6d72ec36e530a911 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 9 Oct 2009 10:31:32 -0700
Subject: ceph: cancel osd requests before resending them

This ensures we don't submit the same request twice if we are kicking a
specific osd (as with an osd_reset), or when we hit a transient error and
resend.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 4a8b4f08d4ae..8e33928647f4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -813,10 +813,13 @@ static void kick_requests(struct ceph_osd_client *osdc,
 
 		if (req->r_resend) {
 			dout(" r_resend set on tid %llu\n", req->r_tid);
+			__cancel_request(req);
 			goto kick;
 		}
-		if (req->r_osd && kickosd == req->r_osd)
+		if (req->r_osd && kickosd == req->r_osd) {
+			__cancel_request(req);
 			goto kick;
+		}
 
 		err = __map_osds(osdc, req);
 		if (err == 0)
-- 
cgit v1.2.2


From 13e38c8ae771d73bf6d1f0f98e35f99c0f0d48ff Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 9 Oct 2009 16:36:34 -0700
Subject: ceph: update to mon client protocol v15

The mon request headers now include session_mon information that must
be properly initialized.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h    | 19 ++++++++++++-------
 fs/ceph/messenger.c  |  2 +-
 fs/ceph/mon_client.c |  8 ++++++--
 3 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 21ed51b127f2..acf24c6944c7 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -37,10 +37,10 @@
  */
 #define CEPH_OSD_PROTOCOL     7 /* cluster internal */
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
-#define CEPH_MON_PROTOCOL     4 /* cluster internal */
+#define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   20 /* server/client */
 #define CEPH_MDSC_PROTOCOL   29 /* server/client */
-#define CEPH_MONC_PROTOCOL   14 /* server/client */
+#define CEPH_MONC_PROTOCOL   15 /* server/client */
 
 
 #define CEPH_INO_ROOT  1
@@ -118,9 +118,14 @@ struct ceph_file_layout {
 #define CEPH_MSG_OSD_OP           42
 #define CEPH_MSG_OSD_OPREPLY      43
 
+struct ceph_mon_request_header {
+	__le64 have_version;
+	__le16 session_mon;
+	__le64 session_mon_tid;
+} __attribute__ ((packed));
 
 struct ceph_mon_statfs {
-	__le64 have_version;
+	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
 	__le64 tid;
 } __attribute__ ((packed));
@@ -138,22 +143,22 @@ struct ceph_mon_statfs_reply {
 } __attribute__ ((packed));
 
 struct ceph_osd_getmap {
-	__le64 have_version;
+	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
 	__le32 start;
 } __attribute__ ((packed));
 
 struct ceph_mds_getmap {
-	__le64 have_version;
+	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
 } __attribute__ ((packed));
 
 struct ceph_client_mount {
-	__le64 have_version;
+	struct ceph_mon_request_header monhdr;
 } __attribute__ ((packed));
 
 struct ceph_mon_subscribe_item {
-	__le64 have;
+	__le64 have_version;	__le64 have;
 	__u8 onetime;
 } __attribute__ ((packed));
 
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 63f7f1359385..b48abc0b3be7 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -857,7 +857,7 @@ out:
 static int verify_hello(struct ceph_connection *con)
 {
 	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
-		pr_err("connect to/from %s has bad banner\n",
+		pr_err("connect to %s got bad banner\n",
 		       pr_addr(&con->peer_addr.in_addr));
 		con->error_msg = "protocol error, bad banner";
 		return -1;
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 9c34df17fa4b..dc698caccc42 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -273,7 +273,9 @@ static void __request_mount(struct ceph_mon_client *monc)
 	if (IS_ERR(msg))
 		return;
 	h = msg->front.iov_base;
-	h->have_version = 0;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
 	ceph_con_send(monc->con, msg);
 }
 
@@ -422,7 +424,9 @@ static int send_statfs(struct ceph_mon_client *monc,
 		return PTR_ERR(msg);
 	req->request = msg;
 	h = msg->front.iov_base;
-	h->have_version = 0;
+	h->monhdr.have_version = 0;
+	h->monhdr.session_mon = cpu_to_le16(-1);
+	h->monhdr.session_mon_tid = 0;
 	h->fsid = monc->monmap->fsid;
 	h->tid = cpu_to_le64(req->tid);
 	ceph_con_send(monc->con, msg);
-- 
cgit v1.2.2


From 752727a1b21a462d6ef634d552f180ae692f8947 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 9 Oct 2009 16:38:45 -0700
Subject: ceph: add file layout validation

This tracks updates to code shared with userspace.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.c | 24 ++++++++++++++++++++++++
 fs/ceph/ceph_fs.h |  2 ++
 2 files changed, 26 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
index 9371ff1c0002..a950b4083577 100644
--- a/fs/ceph/ceph_fs.c
+++ b/fs/ceph/ceph_fs.c
@@ -3,6 +3,30 @@
  */
 #include "types.h"
 
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+	__u32 su = le32_to_cpu(layout->fl_stripe_unit);
+	__u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	__u32 os = le32_to_cpu(layout->fl_object_size);
+
+	/* stripe unit, object size must be non-zero, 64k increment */
+	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+		return 0;
+	/* object size must be a multiple of stripe unit */
+	if (os < su || os % su)
+		return 0;
+	/* stripe count must be non-zero */
+	if (!sc)
+		return 0;
+	return 1;
+}
+
+
 int ceph_flags_to_mode(int flags)
 {
 #ifdef O_DIRECTORY  /* fixme */
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index acf24c6944c7..b3bbab1952c9 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -72,7 +72,9 @@ struct ceph_file_layout {
 	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
 } __attribute__ ((packed));
 
+#define CEPH_MIN_STRIPE_UNIT 65536
 
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 
 
 /*********************************************
-- 
cgit v1.2.2


From 8fc57da4d32767cc6096ecaed24636dabefd1dbc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 12 Oct 2009 10:28:13 -0700
Subject: ceph: ignore trailing data in monamp

This lets us extend the format more easily.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index dc698caccc42..d6c8e783467e 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -60,9 +60,6 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	m->num_mon = num_mon;
 	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
 
-	if (p != end)
-		goto bad;
-
 	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
 	     m->num_mon);
 	for (i = 0; i < m->num_mon; i++)
-- 
cgit v1.2.2


From 572033069dbc2cff8d4a2d2b34c576e1813fda70 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 9 Oct 2009 21:52:34 -0700
Subject: ceph: remove unused CEPH_MSG_{OSD,MDS}_GETMAP

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 2 --
 fs/ceph/super.c   | 2 --
 2 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index b3bbab1952c9..56af192cb430 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -100,7 +100,6 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_MSG_MON_SUBSCRIBE_ACK      16
 
 /* client <-> mds */
-#define CEPH_MSG_MDS_GETMAP             20
 #define CEPH_MSG_MDS_MAP                21
 
 #define CEPH_MSG_CLIENT_SESSION         22
@@ -115,7 +114,6 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_MSG_CLIENT_CAPRELEASE      0x313
 
 /* osd */
-#define CEPH_MSG_OSD_GETMAP       40
 #define CEPH_MSG_OSD_MAP          41
 #define CEPH_MSG_OSD_OP           42
 #define CEPH_MSG_OSD_OPREPLY      43
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b3404a319c22..442a9900317e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -231,7 +231,6 @@ const char *ceph_msg_type_name(int type)
 	case CEPH_MSG_CLIENT_MOUNT_ACK: return "client_mount_ack";
 	case CEPH_MSG_STATFS: return "statfs";
 	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
-	case CEPH_MSG_MDS_GETMAP: return "mds_getmap";
 	case CEPH_MSG_MDS_MAP: return "mds_map";
 	case CEPH_MSG_CLIENT_SESSION: return "client_session";
 	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
@@ -242,7 +241,6 @@ const char *ceph_msg_type_name(int type)
 	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
 	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
 	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
-	case CEPH_MSG_OSD_GETMAP: return "osd_getmap";
 	case CEPH_MSG_OSD_MAP: return "osd_map";
 	case CEPH_MSG_OSD_OP: return "osd_op";
 	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
-- 
cgit v1.2.2


From 535bbb530764b1b2b3b732837f0e61e1baae7109 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 13 Oct 2009 12:55:26 -0700
Subject: ceph: add version field to message header

This makes it easier for individual message types to indicate
their particular encoding, and make future changes backward
compatible.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/msgr.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 73921ae43faa..9abc879e25b1 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -21,7 +21,7 @@
  * whenever the wire protocol changes.  try to keep this string length
  * constant.
  */
-#define CEPH_BANNER "ceph v021"
+#define CEPH_BANNER "ceph v022"
 #define CEPH_BANNER_MAX_LEN 30
 
 
@@ -125,6 +125,7 @@ struct ceph_msg_header {
 	__le64 seq;       /* message seq# for this session */
 	__le16 type;      /* message type */
 	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
 
 	__le32 front_len; /* bytes in main payload */
 	__le32 middle_len;/* bytes in middle payload */
-- 
cgit v1.2.2


From c89136ea4253c73e89e97f5138bb22d97ad9f564 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 14 Oct 2009 09:59:09 -0700
Subject: ceph: convert encode/decode macros to inlines

This avoids the fugly pass by reference and makes the code a bit easier
to read.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/decode.h     | 129 ++++++++++++++++++++++++++-------------------------
 fs/ceph/mds_client.c |  20 ++++----
 fs/ceph/mdsmap.c     |  38 +++++++--------
 fs/ceph/mon_client.c |   4 +-
 fs/ceph/osd_client.c |   8 ++--
 fs/ceph/osdmap.c     |  70 ++++++++++++++--------------
 6 files changed, 137 insertions(+), 132 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index fc2769df062d..91179fb2cc3f 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -3,12 +3,44 @@
 
 #include <asm/unaligned.h>
 
+#include "types.h"
+
 /*
  * in all cases,
  *   void **p     pointer to position pointer
  *   void *end    pointer to end of buffer (last byte + 1)
  */
 
+static inline u64 ceph_decode_64(void **p)
+{
+	u64 v = get_unaligned_le64(*p);
+	*p += sizeof(u64);
+	return v;
+}
+static inline u32 ceph_decode_32(void **p)
+{
+	u32 v = get_unaligned_le32(*p);
+	*p += sizeof(u32);
+	return v;
+}
+static inline u16 ceph_decode_16(void **p)
+{
+	u16 v = get_unaligned_le16(*p);
+	*p += sizeof(u16);
+	return v;
+}
+static inline u8 ceph_decode_8(void **p)
+{
+	u8 v = *(u8 *)*p;
+	(*p)++;
+	return v;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+	memcpy(pv, *p, n);
+	*p += n;
+}
+
 /*
  * bounds check input.
  */
@@ -18,48 +50,20 @@
 			goto bad;			\
 	} while (0)
 
-#define ceph_decode_64(p, v)					\
-	do {							\
-		v = get_unaligned_le64(*(p));			\
-		*(p) += sizeof(u64);				\
-	} while (0)
-#define ceph_decode_32(p, v)					\
-	do {							\
-		v = get_unaligned_le32(*(p));			\
-		*(p) += sizeof(u32);				\
-	} while (0)
-#define ceph_decode_16(p, v)					\
-	do {							\
-		v = get_unaligned_le16(*(p));			\
-		*(p) += sizeof(u16);				\
-	} while (0)
-#define ceph_decode_8(p, v)				\
-	do {						\
-		v = *(u8 *)*(p);			\
-		(*p)++;					\
-	} while (0)
-
-#define ceph_decode_copy(p, pv, n)			\
-	do {						\
-		memcpy(pv, *(p), n);			\
-		*(p) += n;				\
-	} while (0)
-
-/* bounds check too */
 #define ceph_decode_64_safe(p, end, v, bad)			\
 	do {							\
 		ceph_decode_need(p, end, sizeof(u64), bad);	\
-		ceph_decode_64(p, v);				\
+		v = ceph_decode_64(p);				\
 	} while (0)
 #define ceph_decode_32_safe(p, end, v, bad)			\
 	do {							\
 		ceph_decode_need(p, end, sizeof(u32), bad);	\
-		ceph_decode_32(p, v);				\
+		v = ceph_decode_32(p);				\
 	} while (0)
 #define ceph_decode_16_safe(p, end, v, bad)			\
 	do {							\
 		ceph_decode_need(p, end, sizeof(u16), bad);	\
-		ceph_decode_16(p, v);				\
+		v = ceph_decode_16(p);				\
 	} while (0)
 
 #define ceph_decode_copy_safe(p, end, pv, n, bad)		\
@@ -71,41 +75,42 @@
 /*
  * struct ceph_timespec <-> struct timespec
  */
-#define ceph_decode_timespec(ts, tv)					\
-	do {								\
-		(ts)->tv_sec = le32_to_cpu((tv)->tv_sec);		\
-		(ts)->tv_nsec = le32_to_cpu((tv)->tv_nsec);		\
-	} while (0)
-#define ceph_encode_timespec(tv, ts)				\
-	do {							\
-		(tv)->tv_sec = cpu_to_le32((ts)->tv_sec);	\
-		(tv)->tv_nsec = cpu_to_le32((ts)->tv_nsec);	\
-	} while (0)
-
+static inline void ceph_decode_timespec(struct timespec *ts,
+					struct ceph_timespec *tv)
+{
+	ts->tv_sec = le32_to_cpu(tv->tv_sec);
+	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
+}
+static inline void ceph_encode_timespec(struct ceph_timespec *tv,
+					struct timespec *ts)
+{
+	tv->tv_sec = cpu_to_le32(ts->tv_sec);
+	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
+}
 
 /*
  * encoders
  */
-#define ceph_encode_64(p, v)						\
-	do {								\
-		put_unaligned_le64(v, (__le64 *)*(p));			\
-		*(p) += sizeof(u64);					\
-	} while (0)
-#define ceph_encode_32(p, v)					\
-	do {							\
-		put_unaligned_le32(v, (__le32 *)*(p));		\
-		*(p) += sizeof(u32);				\
-	} while (0)
-#define ceph_encode_16(p, v)					\
-	do {							\
-		put_unaligned_le16(v), (__le16 *)*(p));		\
-	*(p) += sizeof(u16);					\
-	} while (0)
-#define ceph_encode_8(p, v)			  \
-	do {					  \
-		*(u8 *)*(p) = v;		  \
-		(*(p))++;			  \
-	} while (0)
+static inline void ceph_encode_64(void **p, u64 v)
+{
+	put_unaligned_le64(v, (__le64 *)*p);
+	*p += sizeof(u64);
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+	put_unaligned_le32(v, (__le32 *)*p);
+	*p += sizeof(u32);
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+	put_unaligned_le16(v, (__le16 *)*p);
+	*p += sizeof(u16);
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+	*(u8 *)*p = v;
+	(*p)++;
+}
 
 /*
  * filepath, string encoders
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index de8ba4a242ca..2b19da31a8b3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -136,9 +136,9 @@ static int parse_reply_info_dir(void **p, void *end,
 		goto bad;
 
 	ceph_decode_need(p, end, sizeof(num) + 2, bad);
-	ceph_decode_32(p, num);
-	ceph_decode_8(p, info->dir_end);
-	ceph_decode_8(p, info->dir_complete);
+	num = ceph_decode_32(p);
+	info->dir_end = ceph_decode_8(p);
+	info->dir_complete = ceph_decode_8(p);
 	if (num == 0)
 		goto done;
 
@@ -160,7 +160,7 @@ static int parse_reply_info_dir(void **p, void *end,
 	while (num) {
 		/* dentry */
 		ceph_decode_need(p, end, sizeof(u32)*2, bad);
-		ceph_decode_32(p, info->dir_dname_len[i]);
+		info->dir_dname_len[i] = ceph_decode_32(p);
 		ceph_decode_need(p, end, info->dir_dname_len[i], bad);
 		info->dir_dname[i] = *p;
 		*p += info->dir_dname_len[i];
@@ -1791,10 +1791,10 @@ static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 	from_mds = le64_to_cpu(msg->hdr.src.name.num);
 
 	ceph_decode_need(&p, end, sizeof(u64)+2*sizeof(u32), bad);
-	ceph_decode_64(&p, tid);
-	ceph_decode_32(&p, next_mds);
-	ceph_decode_32(&p, fwd_seq);
-	ceph_decode_8(&p, must_resend);
+	tid = ceph_decode_64(&p);
+	next_mds = ceph_decode_32(&p);
+	fwd_seq = ceph_decode_32(&p);
+	must_resend = ceph_decode_8(&p);
 
 	WARN_ON(must_resend);  /* shouldn't happen. */
 
@@ -2783,8 +2783,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 		pr_err("got mdsmap with wrong fsid\n");
 		return;
 	}
-	ceph_decode_32(&p, epoch);
-	ceph_decode_32(&p, maplen);
+	epoch = ceph_decode_32(&p);
+	maplen = ceph_decode_32(&p);
 	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
 
 	/* do we need it? */
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 09180d8fafe4..80daea064470 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -60,21 +60,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 	ceph_decode_16_safe(p, end, version, bad);
 
 	ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
-	ceph_decode_32(p, m->m_epoch);
-	ceph_decode_32(p, m->m_client_epoch);
-	ceph_decode_32(p, m->m_last_failure);
-	ceph_decode_32(p, m->m_root);
-	ceph_decode_32(p, m->m_session_timeout);
-	ceph_decode_32(p, m->m_session_autoclose);
-	ceph_decode_64(p, m->m_max_file_size);
-	ceph_decode_32(p, m->m_max_mds);
+	m->m_epoch = ceph_decode_32(p);
+	m->m_client_epoch = ceph_decode_32(p);
+	m->m_last_failure = ceph_decode_32(p);
+	m->m_root = ceph_decode_32(p);
+	m->m_session_timeout = ceph_decode_32(p);
+	m->m_session_autoclose = ceph_decode_32(p);
+	m->m_max_file_size = ceph_decode_64(p);
+	m->m_max_mds = ceph_decode_32(p);
 
 	m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
 	if (m->m_info == NULL)
 		goto badmem;
 
 	/* pick out active nodes from mds_info (state > 0) */
-	ceph_decode_32(p, n);
+	n = ceph_decode_32(p);
 	for (i = 0; i < n; i++) {
 		u32 namelen;
 		s32 mds, inc, state;
@@ -86,18 +86,18 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 
 		ceph_decode_need(p, end, sizeof(addr) + 1 + sizeof(u32), bad);
 		ceph_decode_copy(p, &addr, sizeof(addr));
-		ceph_decode_8(p, infoversion);
-		ceph_decode_32(p, namelen);  /* skip mds name */
+		infoversion = ceph_decode_8(p);
+		namelen = ceph_decode_32(p);  /* skip mds name */
 		*p += namelen;
 
 		ceph_decode_need(p, end,
 				 4*sizeof(u32) + sizeof(u64) +
 				 sizeof(addr) + sizeof(struct ceph_timespec),
 				 bad);
-		ceph_decode_32(p, mds);
-		ceph_decode_32(p, inc);
-		ceph_decode_32(p, state);
-		ceph_decode_64(p, state_seq);
+		mds = ceph_decode_32(p);
+		inc = ceph_decode_32(p);
+		state = ceph_decode_32(p);
+		state_seq = ceph_decode_64(p);
 		*p += sizeof(addr);
 		*p += sizeof(struct ceph_timespec);
 		*p += sizeof(u32);
@@ -123,8 +123,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 					kcalloc(num_export_targets, sizeof(u32),
 						GFP_NOFS);
 				for (j = 0; j < num_export_targets; j++)
-					ceph_decode_32(&pexport_targets,
-					      m->m_info[mds].export_targets[j]);
+					m->m_info[mds].export_targets[j] =
+					       ceph_decode_32(&pexport_targets);
 			} else {
 				m->m_info[mds].export_targets = NULL;
 			}
@@ -139,8 +139,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		goto badmem;
 	ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
 	for (i = 0; i < n; i++)
-		ceph_decode_32(p, m->m_data_pg_pools[i]);
-	ceph_decode_32(p, m->m_cas_pg_pool);
+		m->m_data_pg_pools[i] = ceph_decode_32(p);
+	m->m_cas_pg_pool = ceph_decode_32(p);
 
 	/* ok, we don't care about the rest. */
 	dout("mdsmap_decode success epoch %u\n", m->m_epoch);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index d6c8e783467e..bea2be9077e4 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -45,9 +45,9 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 
 	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	ceph_decode_32(&p, epoch);
+	epoch = ceph_decode_32(&p);
 
-	ceph_decode_32(&p, num_mon);
+	num_mon = ceph_decode_32(&p);
 	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
 
 	if (num_mon >= CEPH_MAX_MON)
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 8e33928647f4..bbd9a5d23712 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -894,8 +894,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	dout(" %d inc maps\n", nr_maps);
 	while (nr_maps > 0) {
 		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-		ceph_decode_32(&p, epoch);
-		ceph_decode_32(&p, maplen);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
 		ceph_decode_need(&p, end, maplen, bad);
 		next = p + maplen;
 		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
@@ -927,8 +927,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	dout(" %d full maps\n", nr_maps);
 	while (nr_maps) {
 		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
-		ceph_decode_32(&p, epoch);
-		ceph_decode_32(&p, maplen);
+		epoch = ceph_decode_32(&p);
+		maplen = ceph_decode_32(&p);
 		ceph_decode_need(&p, end, maplen, bad);
 		if (nr_maps > 1) {
 			dout("skipping non-latest full map %u len %d\n",
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 342e5f80996b..6f0aeff4185a 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -67,7 +67,7 @@ static int crush_decode_uniform_bucket(void **p, void *end,
 {
 	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
 	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
-	ceph_decode_32(p, b->item_weight);
+	b->item_weight = ceph_decode_32(p);
 	return 0;
 bad:
 	return -EINVAL;
@@ -86,8 +86,8 @@ static int crush_decode_list_bucket(void **p, void *end,
 		return -ENOMEM;
 	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
 	for (j = 0; j < b->h.size; j++) {
-		ceph_decode_32(p, b->item_weights[j]);
-		ceph_decode_32(p, b->sum_weights[j]);
+		b->item_weights[j] = ceph_decode_32(p);
+		b->sum_weights[j] = ceph_decode_32(p);
 	}
 	return 0;
 bad:
@@ -105,7 +105,7 @@ static int crush_decode_tree_bucket(void **p, void *end,
 		return -ENOMEM;
 	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
 	for (j = 0; j < b->num_nodes; j++)
-		ceph_decode_32(p, b->node_weights[j]);
+		b->node_weights[j] = ceph_decode_32(p);
 	return 0;
 bad:
 	return -EINVAL;
@@ -124,8 +124,8 @@ static int crush_decode_straw_bucket(void **p, void *end,
 		return -ENOMEM;
 	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
 	for (j = 0; j < b->h.size; j++) {
-		ceph_decode_32(p, b->item_weights[j]);
-		ceph_decode_32(p, b->straws[j]);
+		b->item_weights[j] = ceph_decode_32(p);
+		b->straws[j] = ceph_decode_32(p);
 	}
 	return 0;
 bad:
@@ -148,15 +148,15 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		return ERR_PTR(-ENOMEM);
 
 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
-	ceph_decode_32(p, magic);
+	magic = ceph_decode_32(p);
 	if (magic != CRUSH_MAGIC) {
 		pr_err("crush_decode magic %x != current %x\n",
 		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
 		goto bad;
 	}
-	ceph_decode_32(p, c->max_buckets);
-	ceph_decode_32(p, c->max_rules);
-	ceph_decode_32(p, c->max_devices);
+	c->max_buckets = ceph_decode_32(p);
+	c->max_rules = ceph_decode_32(p);
+	c->max_devices = ceph_decode_32(p);
 
 	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
 	if (c->device_parents == NULL)
@@ -208,11 +208,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 			goto badmem;
 
 		ceph_decode_need(p, end, 4*sizeof(u32), bad);
-		ceph_decode_32(p, b->id);
-		ceph_decode_16(p, b->type);
-		ceph_decode_16(p, b->alg);
-		ceph_decode_32(p, b->weight);
-		ceph_decode_32(p, b->size);
+		b->id = ceph_decode_32(p);
+		b->type = ceph_decode_16(p);
+		b->alg = ceph_decode_16(p);
+		b->weight = ceph_decode_32(p);
+		b->size = ceph_decode_32(p);
 
 		dout("crush_decode bucket size %d off %x %p to %p\n",
 		     b->size, (int)(*p-start), *p, end);
@@ -227,7 +227,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 
 		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
 		for (j = 0; j < b->size; j++)
-			ceph_decode_32(p, b->items[j]);
+			b->items[j] = ceph_decode_32(p);
 
 		switch (b->alg) {
 		case CRUSH_BUCKET_UNIFORM:
@@ -290,9 +290,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
 		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
 		for (j = 0; j < r->len; j++) {
-			ceph_decode_32(p, r->steps[j].op);
-			ceph_decode_32(p, r->steps[j].arg1);
-			ceph_decode_32(p, r->steps[j].arg2);
+			r->steps[j].op = ceph_decode_32(p);
+			r->steps[j].arg1 = ceph_decode_32(p);
+			r->steps[j].arg2 = ceph_decode_32(p);
 		}
 	}
 
@@ -411,11 +411,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
 	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
 	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
-	ceph_decode_32(p, map->epoch);
+	map->epoch = ceph_decode_32(p);
 	ceph_decode_copy(p, &map->created, sizeof(map->created));
 	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
 
-	ceph_decode_32(p, map->num_pools);
+	map->num_pools = ceph_decode_32(p);
 	map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
 			       GFP_NOFS);
 	if (!map->pg_pool) {
@@ -425,7 +425,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
 		ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad);
-		ceph_decode_32(p, i);
+		i = ceph_decode_32(p);
 		if (i >= map->num_pools)
 			goto bad;
 		ceph_decode_copy(p, &map->pg_pool[i].v,
@@ -438,7 +438,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
 	ceph_decode_32_safe(p, end, map->flags, bad);
 
-	ceph_decode_32(p, max);
+	max = ceph_decode_32(p);
 
 	/* (re)alloc osd arrays */
 	err = osdmap_set_max_osd(map, max);
@@ -456,7 +456,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
 	*p += 4; /* skip length field (should match max) */
 	for (i = 0; i < map->max_osd; i++)
-		ceph_decode_32(p, map->osd_weight[i]);
+		map->osd_weight[i] = ceph_decode_32(p);
 
 	*p += 4; /* skip length field (should match max) */
 	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
@@ -469,8 +469,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		struct ceph_pg_mapping *pg;
 
 		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
-		ceph_decode_64(p, pgid);
-		ceph_decode_32(p, n);
+		pgid = ceph_decode_64(p);
+		n = ceph_decode_32(p);
 		ceph_decode_need(p, end, n * sizeof(u32), bad);
 		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
 		if (!pg) {
@@ -480,7 +480,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		pg->pgid = pgid;
 		pg->len = n;
 		for (j = 0; j < n; j++)
-			ceph_decode_32(p, pg->osds[j]);
+			pg->osds[j] = ceph_decode_32(p);
 
 		err = __insert_pg_mapping(pg, &map->pg_temp);
 		if (err)
@@ -537,10 +537,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
 			 bad);
 	ceph_decode_copy(p, &fsid, sizeof(fsid));
-	ceph_decode_32(p, epoch);
+	epoch = ceph_decode_32(p);
 	BUG_ON(epoch != map->epoch+1);
 	ceph_decode_copy(p, &modified, sizeof(modified));
-	ceph_decode_32(p, new_flags);
+	new_flags = ceph_decode_32(p);
 
 	/* full map? */
 	ceph_decode_32_safe(p, end, len, bad);
@@ -568,7 +568,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	ceph_decode_need(p, end, 5*sizeof(u32), bad);
 
 	/* new max? */
-	ceph_decode_32(p, max);
+	max = ceph_decode_32(p);
 	if (max >= 0) {
 		err = osdmap_set_max_osd(map, max);
 		if (err < 0)
@@ -641,8 +641,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	while (len--) {
 		u32 osd, off;
 		ceph_decode_need(p, end, sizeof(u32)*2, bad);
-		ceph_decode_32(p, osd);
-		ceph_decode_32(p, off);
+		osd = ceph_decode_32(p);
+		off = ceph_decode_32(p);
 		pr_info("osd%d weight 0x%x %s\n", osd, off,
 		     off == CEPH_OSD_IN ? "(in)" :
 		     (off == CEPH_OSD_OUT ? "(out)" : ""));
@@ -659,8 +659,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		u64 pgid;
 		u32 pglen;
 		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
-		ceph_decode_64(p, pgid);
-		ceph_decode_32(p, pglen);
+		pgid = ceph_decode_64(p);
+		pglen = ceph_decode_32(p);
 
 		/* remove any? */
 		while (rbp && rb_entry(rbp, struct ceph_pg_mapping,
@@ -683,7 +683,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			pg->pgid = pgid;
 			pg->len = pglen;
 			for (j = 0; j < len; j++)
-				ceph_decode_32(p, pg->osds[j]);
+				pg->osds[j] = ceph_decode_32(p);
 			err = __insert_pg_mapping(pg, &map->pg_temp);
 			if (err)
 				goto bad;
-- 
cgit v1.2.2


From f2cf418cec8d61df0651a0140a92a8c75246e14f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 14 Oct 2009 14:09:07 -0700
Subject: ceph: initialize sb->s_bdi, bdi_unregister after kill_anon_super

Writeback doesn't work without the bdi set, and writeback on
umount doesn't work if we unregister the bdi too early.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 442a9900317e..7f7d4759a443 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -97,6 +97,7 @@ static int ceph_syncfs(struct super_block *sb, int wait)
 	dout("sync_fs %d\n", wait);
 	ceph_osdc_sync(&ceph_client(sb)->osdc);
 	ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+	dout("sync_fs %d done\n", wait);
 	return 0;
 }
 
@@ -777,6 +778,7 @@ static int ceph_init_bdi(struct super_block *sb, struct ceph_client *client)
 	err = bdi_init(&client->backing_dev_info);
 	if (err < 0)
 		return err;
+	sb->s_bdi = &client->backing_dev_info;
 
 	/* set ra_pages based on rsize mount option? */
 	if (client->mount_args.rsize >= PAGE_CACHE_SIZE)
@@ -861,8 +863,8 @@ static void ceph_kill_sb(struct super_block *s)
 	struct ceph_client *client = ceph_sb_to_client(s);
 	dout("kill_sb %p\n", s);
 	ceph_mdsc_pre_umount(&client->mdsc);
-	bdi_unregister(&client->backing_dev_info);
 	kill_anon_super(s);    /* will call put_super after sb is r/o */
+	bdi_unregister(&client->backing_dev_info);
 	bdi_destroy(&client->backing_dev_info);
 	ceph_destroy_client(client);
 }
-- 
cgit v1.2.2


From cdc35f96277314bbfeefd0505410cabd69aebd8d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 14 Oct 2009 14:24:19 -0700
Subject: ceph: move generic flushing code into helper

Both callers of __mark_caps_flushing() do the same work; move it
into the helper.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 43 +++++++++++++++++++++----------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5c7d0e9bbb7b..111439d883d2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1272,16 +1272,30 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 /*
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
+ *
+ * Called under i_lock.
  */
-static void __mark_caps_flushing(struct inode *inode,
+static int __mark_caps_flushing(struct inode *inode,
 				 struct ceph_mds_session *session)
 {
 	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-
+	int flushing;
+	
+	BUG_ON(ci->i_dirty_caps == 0);
 	BUG_ON(list_empty(&ci->i_dirty_item));
+
+	flushing = ci->i_dirty_caps;
+	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+	     ceph_cap_string(flushing),
+	     ceph_cap_string(ci->i_flushing_caps),
+	     ceph_cap_string(ci->i_flushing_caps | flushing));
+	ci->i_flushing_caps |= flushing;
+	ci->i_dirty_caps = 0;
+
 	spin_lock(&mdsc->cap_dirty_lock);
 	if (list_empty(&ci->i_flushing_item)) {
+		list_del_init(&ci->i_dirty_item);
 		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
 		mdsc->num_cap_flushing++;
 		ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
@@ -1289,6 +1303,8 @@ static void __mark_caps_flushing(struct inode *inode,
 		     ci->i_cap_flush_seq);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
+
+	return flushing;
 }
 
 /*
@@ -1504,17 +1520,8 @@ ack:
 			took_snap_rwsem = 1;
 		}
 
-		if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
-			/* update dirty, flushing bits */
-			flushing = ci->i_dirty_caps;
-			dout(" flushing %s, flushing_caps %s -> %s\n",
-			     ceph_cap_string(flushing),
-			     ceph_cap_string(ci->i_flushing_caps),
-			     ceph_cap_string(ci->i_flushing_caps | flushing));
-			ci->i_flushing_caps |= flushing;
-			ci->i_dirty_caps = 0;
-			__mark_caps_flushing(inode, session);
-		}
+		if (cap == ci->i_auth_cap && ci->i_dirty_caps)
+			flushing = __mark_caps_flushing(inode, session);
 
 		mds = cap->mds;  /* remember mds, so we don't repeat */
 		sent++;
@@ -1605,15 +1612,7 @@ retry:
 		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
 			goto out;
 
-		__mark_caps_flushing(inode, session);
-
-		flushing = ci->i_dirty_caps;
-		dout(" flushing %s, flushing_caps %s -> %s\n",
-		     ceph_cap_string(flushing),
-		     ceph_cap_string(ci->i_flushing_caps),
-		     ceph_cap_string(ci->i_flushing_caps | flushing));
-		ci->i_flushing_caps |= flushing;
-		ci->i_dirty_caps = 0;
+		flushing = __mark_caps_flushing(inode, session);
 
 		/* __send_cap drops i_lock */
 		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
-- 
cgit v1.2.2


From afcdaea3f2a78ce4873bd7e98a6d603bda23d167 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 14 Oct 2009 14:27:38 -0700
Subject: ceph: flush dirty caps via the cap_dirty list

Previously we were flushing dirty caps by passing an extra flag
when traversing the delayed caps list.  Besides being a bit ugly,
that can also miss caps that are dirty but didn't result in a
cap requeue: notably, mark_caps_dirty().

Separate the flushing into a separate helper, and traverse the
cap_dirty list.

This also brings i_dirty_item in line with i_dirty_caps: we are
on the list IFF caps != 0.  We carry an inode ref IFF
dirty_caps|flushing_caps != 0.

Lose the unused return value from __ceph_mark_caps_dirty().

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 76 ++++++++++++++++++++++++++++++++++++----------------
 fs/ceph/mds_client.c |  6 ++---
 fs/ceph/super.h      |  6 ++---
 3 files changed, 59 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 111439d883d2..40b8d3471244 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -997,7 +997,7 @@ void ceph_queue_caps_release(struct inode *inode)
 		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
 			dout(" release msg %p full\n", msg);
 			list_move_tail(&msg->list_head,
-				      &session->s_cap_releases_done);
+				       &session->s_cap_releases_done);
 		} else {
 			dout(" release msg %p at %d/%d (%d)\n", msg,
 			     (int)le32_to_cpu(head->num),
@@ -1292,14 +1292,20 @@ static int __mark_caps_flushing(struct inode *inode,
 	     ceph_cap_string(ci->i_flushing_caps | flushing));
 	ci->i_flushing_caps |= flushing;
 	ci->i_dirty_caps = 0;
+	dout(" inode %p now !dirty\n", inode);
 
 	spin_lock(&mdsc->cap_dirty_lock);
+	list_del_init(&ci->i_dirty_item);
+
+	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
 	if (list_empty(&ci->i_flushing_item)) {
-		list_del_init(&ci->i_dirty_item);
 		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
 		mdsc->num_cap_flushing++;
-		ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
-		dout(" inode %p now flushing seq %lld\n", &ci->vfs_inode,
+		dout(" inode %p now flushing seq %lld\n", inode,
+		     ci->i_cap_flush_seq);
+	} else {
+		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+		dout(" inode %p now flushing (more) seq %lld\n", inode,
 		     ci->i_cap_flush_seq);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
@@ -1555,32 +1561,33 @@ ack:
  * Mark caps dirty.  If inode is newly dirty, add to the global dirty
  * list.
  */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
 	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
 	struct inode *inode = &ci->vfs_inode;
-	int was = __ceph_caps_dirty(ci);
+	int was_dirty = ci->i_dirty_caps;
 	int dirty = 0;
 
 	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
 	     ceph_cap_string(mask), ceph_cap_string(ci->i_dirty_caps),
 	     ceph_cap_string(ci->i_dirty_caps | mask));
 	ci->i_dirty_caps |= mask;
-	if (!was) {
+	if (!was_dirty) {
 		dout(" inode %p now dirty\n", &ci->vfs_inode);
 		spin_lock(&mdsc->cap_dirty_lock);
 		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
 		spin_unlock(&mdsc->cap_dirty_lock);
-		igrab(inode);
-		dirty |= I_DIRTY_SYNC;
+		if (ci->i_flushing_caps == 0) {
+			igrab(inode);
+			dirty |= I_DIRTY_SYNC;
+		}
 	}
-	if ((was & CEPH_CAP_FILE_BUFFER) &&
+	if (((was_dirty | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
 	    (mask & CEPH_CAP_FILE_BUFFER))
 		dirty |= I_DIRTY_DATASYNC;
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
 	__cap_delay_requeue(mdsc, ci);
-	return was;
 }
 
 /*
@@ -2327,7 +2334,7 @@ static void handle_cap_flush_ack(struct inode *inode,
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
 	u64 flush_tid = le64_to_cpu(m->client_tid);
-	int old_dirty = 0, new_dirty = 0;
+	int drop = 0;
 	int i;
 
 	for (i = 0; i < CEPH_CAP_BITS; i++)
@@ -2344,9 +2351,7 @@ static void handle_cap_flush_ack(struct inode *inode,
 	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
 		goto out;
 
-	old_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
 	ci->i_flushing_caps &= ~cleaned;
-	new_dirty = ci->i_dirty_caps | ci->i_flushing_caps;
 
 	spin_lock(&mdsc->cap_dirty_lock);
 	if (ci->i_flushing_caps == 0) {
@@ -2360,17 +2365,19 @@ static void handle_cap_flush_ack(struct inode *inode,
 		mdsc->num_cap_flushing--;
 		wake_up(&mdsc->cap_flushing_wq);
 		dout(" inode %p now !flushing\n", inode);
-	}
-	if (old_dirty && !new_dirty) {
-		dout(" inode %p now clean\n", inode);
-		list_del_init(&ci->i_dirty_item);
+
+		if (ci->i_dirty_caps == 0) {
+			dout(" inode %p now clean\n", inode);
+			BUG_ON(!list_empty(&ci->i_dirty_item));
+			drop = 1;
+		}
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
 	wake_up(&ci->i_cap_wq);
 
 out:
 	spin_unlock(&inode->i_lock);
-	if (old_dirty && !new_dirty)
+	if (drop)
 		iput(inode);
 }
 
@@ -2676,14 +2683,11 @@ bad:
 /*
  * Delayed work handler to process end of delayed cap release LRU list.
  */
-void ceph_check_delayed_caps(struct ceph_mds_client *mdsc, int flushdirty)
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 {
 	struct ceph_inode_info *ci;
 	int flags = CHECK_CAPS_NODELAY;
 
-	if (flushdirty)
-		flags |= CHECK_CAPS_FLUSH;
-
 	dout("check_delayed_caps\n");
 	while (1) {
 		spin_lock(&mdsc->cap_delay_lock);
@@ -2703,6 +2707,32 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc, int flushdirty)
 	spin_unlock(&mdsc->cap_delay_lock);
 }
 
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+	struct ceph_inode_info *ci;
+	struct inode *inode;
+
+	dout("flush_dirty_caps\n");
+	spin_lock(&mdsc->cap_dirty_lock);
+	while (!list_empty(&mdsc->cap_dirty)) {
+		ci = list_first_entry(&mdsc->cap_dirty,
+				      struct ceph_inode_info,
+				      i_dirty_item);
+		inode = igrab(&ci->vfs_inode);
+		spin_unlock(&mdsc->cap_dirty_lock);
+		if (inode) {
+			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+					NULL);
+			iput(inode);
+		}
+		spin_lock(&mdsc->cap_dirty_lock);
+	}
+	spin_unlock(&mdsc->cap_dirty_lock);
+}
+
 /*
  * Drop open file reference.  If we were the last open file,
  * we may need to release capabilities to the MDS (or schedule
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2b19da31a8b3..12d66c0572ac 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2504,7 +2504,7 @@ static void delayed_work(struct work_struct *work)
 	int renew_caps;
 
 	dout("mdsc delayed_work\n");
-	ceph_check_delayed_caps(mdsc, 0);
+	ceph_check_delayed_caps(mdsc);
 
 	mutex_lock(&mdsc->mutex);
 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
@@ -2627,7 +2627,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	mdsc->stopping = 1;
 
 	drop_leases(mdsc);
-	ceph_check_delayed_caps(mdsc, 1);
+	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
 }
 
@@ -2677,7 +2677,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 	mutex_unlock(&mdsc->mutex);
 	dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
 
-	ceph_check_delayed_caps(mdsc, 1);
+	ceph_flush_dirty_caps(mdsc);
 
 	wait_unsafe_requests(mdsc, want_tid);
 	wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cfd39ef4023e..0bbf58ab607e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -524,7 +524,7 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
 {
 	return ci->i_dirty_caps | ci->i_flushing_caps;
 }
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
 
 extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
 extern int __ceph_caps_used(struct ceph_inode_info *ci);
@@ -814,8 +814,8 @@ extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
 			       struct ceph_mds_session **psession);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 			    struct ceph_mds_session *session);
-extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc,
-				    int flushdirty);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
 
 extern int ceph_encode_inode_release(void **p, struct inode *inode,
 				     int mds, int drop, int unless, int force);
-- 
cgit v1.2.2


From 07bd10fb9853a41a7f0bb271721cca97d15eccae Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 14 Oct 2009 17:26:40 -0700
Subject: ceph: correct subscribe_ack msgpool payload size

Defined a struct for the SUBSCRIBE_ACK, and use that to size
the msgpool.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h    |  5 +++++
 fs/ceph/mon_client.c | 11 +++++++----
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 56af192cb430..9b16e2e06ea6 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -162,6 +162,11 @@ struct ceph_mon_subscribe_item {
 	__u8 onetime;
 } __attribute__ ((packed));
 
+struct ceph_mon_subscribe_ack {
+	__le32 duration;         /* seconds */
+	struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
 /*
  * mds states
  *   > 0 -> in
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index bea2be9077e4..d52e52968d01 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -199,10 +199,12 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 				 struct ceph_msg *msg)
 {
 	unsigned seconds;
-	void *p = msg->front.iov_base;
-	void *end = p + msg->front.iov_len;
+	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+	if (msg->front.iov_len < sizeof(*h))
+		goto bad;
+	seconds = le32_to_cpu(h->duration);
 
-	ceph_decode_32_safe(&p, end, seconds, bad);
 	mutex_lock(&monc->mutex);
 	if (monc->hunting) {
 		pr_info("mon%d %s session established\n",
@@ -541,7 +543,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	err = ceph_msgpool_init(&monc->msgpool_mount_ack, 4096, 1, false);
 	if (err < 0)
 		goto out;
-	err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 8, 1, false);
+	err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
+			       sizeof(struct ceph_mon_subscribe_ack), 1, false);
 	if (err < 0)
 		goto out;
 	err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
-- 
cgit v1.2.2


From 8f3bc053c610826a657714649ea596f07875db2e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 14 Oct 2009 17:36:07 -0700
Subject: ceph: warn on allocation from msgpool with larger front_len

Pass the front_len we need when pulling a message off a msgpool,
and WARN if it is greater than the pool's size.  Then try to
allocate a new message (to continue without failing).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c |  7 ++++---
 fs/ceph/msgpool.c    | 20 +++++++++++++++++---
 fs/ceph/msgpool.h    |  3 ++-
 fs/ceph/osd_client.c |  5 +++--
 4 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index d52e52968d01..e6e954cac6b9 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -639,14 +639,15 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 {
 	struct ceph_mon_client *monc = con->private;
 	int type = le16_to_cpu(hdr->type);
+	int front = le32_to_cpu(hdr->front_len);
 
 	switch (type) {
 	case CEPH_MSG_CLIENT_MOUNT_ACK:
-		return ceph_msgpool_get(&monc->msgpool_mount_ack);
+		return ceph_msgpool_get(&monc->msgpool_mount_ack, front);
 	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		return ceph_msgpool_get(&monc->msgpool_subscribe_ack);
+		return ceph_msgpool_get(&monc->msgpool_subscribe_ack, front);
 	case CEPH_MSG_STATFS_REPLY:
-		return ceph_msgpool_get(&monc->msgpool_statfs_reply);
+		return ceph_msgpool_get(&monc->msgpool_statfs_reply, front);
 	}
 	return ceph_alloc_msg(con, hdr);
 }
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index 39d4d7ed82ce..7599b3382076 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -101,14 +101,28 @@ int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
 	return ret;
 }
 
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
 {
 	wait_queue_t wait;
 	struct ceph_msg *msg;
 
+	if (front_len && front_len > pool->front_len) {
+		pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
+		       pool, front_len, pool->front_len);
+		WARN_ON(1);
+
+		/* try to alloc a fresh message */
+		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+		if (!IS_ERR(msg))
+			return msg;
+	}
+
+	if (!front_len)
+		front_len = pool->front_len;
+
 	if (pool->blocking) {
 		/* mempool_t behavior; first try to alloc */
-		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+		msg = ceph_msg_new(0, front_len, 0, 0, NULL);
 		if (!IS_ERR(msg))
 			return msg;
 	}
@@ -133,7 +147,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool)
 			WARN_ON(1);
 
 			/* maybe we can allocate it now? */
-			msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+			msg = ceph_msg_new(0, front_len, 0, 0, NULL);
 			if (!IS_ERR(msg))
 				return msg;
 
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index 07a2decaa6d8..bc834bfcd720 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -20,7 +20,8 @@ extern int ceph_msgpool_init(struct ceph_msgpool *pool,
 			     int front_len, int size, bool blocking);
 extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
 extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
+					 int front_len);
 extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
 
 #endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index bbd9a5d23712..0a254054a82a 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -161,7 +161,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (snapc)
 		msg_size += sizeof(u64) * snapc->num_snaps;
 	if (use_mempool)
-		msg = ceph_msgpool_get(&osdc->msgpool_op);
+		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
 	if (IS_ERR(msg)) {
@@ -1271,10 +1271,11 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 	struct ceph_osd *osd = con->private;
 	struct ceph_osd_client *osdc = osd->o_osdc;
 	int type = le16_to_cpu(hdr->type);
+	int front = le32_to_cpu(hdr->front_len);
 
 	switch (type) {
 	case CEPH_MSG_OSD_OPREPLY:
-		return ceph_msgpool_get(&osdc->msgpool_op_reply);
+		return ceph_msgpool_get(&osdc->msgpool_op_reply, front);
 	}
 	return ceph_alloc_msg(con, hdr);
 }
-- 
cgit v1.2.2


From 76e3b390d41db9d69e254a09dd1aedd3e6aac25f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 15 Oct 2009 18:13:53 -0700
Subject: ceph: move dirty caps code around

Cleanup only.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 70 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 37 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 40b8d3471244..7d166182e98d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1269,6 +1269,41 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 	spin_unlock(&inode->i_lock);
 }
 
+/*
+ * Mark caps dirty.  If inode is newly dirty, add to the global dirty
+ * list.
+ */
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	struct inode *inode = &ci->vfs_inode;
+	int was = ci->i_dirty_caps;
+	int dirty = 0;
+
+	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+	     ceph_cap_string(mask), ceph_cap_string(was),
+	     ceph_cap_string(was | mask));
+	ci->i_dirty_caps |= mask;
+	if (was == 0) {
+		dout(" inode %p now dirty\n", &ci->vfs_inode);
+		BUG_ON(!list_empty(&ci->i_dirty_item));
+		spin_lock(&mdsc->cap_dirty_lock);
+		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+		spin_unlock(&mdsc->cap_dirty_lock);
+		if (ci->i_flushing_caps == 0) {
+			igrab(inode);
+			dirty |= I_DIRTY_SYNC;
+		}
+	}
+	BUG_ON(list_empty(&ci->i_dirty_item));
+	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+	    (mask & CEPH_CAP_FILE_BUFFER))
+		dirty |= I_DIRTY_DATASYNC;
+	if (dirty)
+		__mark_inode_dirty(inode, dirty);
+	__cap_delay_requeue(mdsc, ci);
+}
+
 /*
  * Add dirty inode to the flushing list.  Assigned a seq number so we
  * can wait for caps to flush without starving.
@@ -1557,39 +1592,6 @@ ack:
 		up_read(&mdsc->snap_rwsem);
 }
 
-/*
- * Mark caps dirty.  If inode is newly dirty, add to the global dirty
- * list.
- */
-void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
-{
-	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
-	struct inode *inode = &ci->vfs_inode;
-	int was_dirty = ci->i_dirty_caps;
-	int dirty = 0;
-
-	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
-	     ceph_cap_string(mask), ceph_cap_string(ci->i_dirty_caps),
-	     ceph_cap_string(ci->i_dirty_caps | mask));
-	ci->i_dirty_caps |= mask;
-	if (!was_dirty) {
-		dout(" inode %p now dirty\n", &ci->vfs_inode);
-		spin_lock(&mdsc->cap_dirty_lock);
-		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
-		spin_unlock(&mdsc->cap_dirty_lock);
-		if (ci->i_flushing_caps == 0) {
-			igrab(inode);
-			dirty |= I_DIRTY_SYNC;
-		}
-	}
-	if (((was_dirty | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
-	    (mask & CEPH_CAP_FILE_BUFFER))
-		dirty |= I_DIRTY_DATASYNC;
-	if (dirty)
-		__mark_inode_dirty(inode, dirty);
-	__cap_delay_requeue(mdsc, ci);
-}
-
 /*
  * Try to flush dirty caps back to the auth mds.
  */
@@ -2370,6 +2372,8 @@ static void handle_cap_flush_ack(struct inode *inode,
 			dout(" inode %p now clean\n", inode);
 			BUG_ON(!list_empty(&ci->i_dirty_item));
 			drop = 1;
+		} else {
+			BUG_ON(list_empty(&ci->i_dirty_item));
 		}
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
-- 
cgit v1.2.2


From 8fa9765576875200a7412a5300b5f0537211f038 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 16 Oct 2009 14:44:35 -0700
Subject: ceph: enable readahead

Initialized bdi->ra_pages to enable readahead.  Use 512KB default.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 1 +
 fs/ceph/super.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7f7d4759a443..ab950fce4172 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -327,6 +327,7 @@ static int parse_mount_args(struct ceph_client *client,
 	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
 	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+	args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
 	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 	args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
 	args->max_readdir = 1024;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 0bbf58ab607e..75556e97e865 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -69,7 +69,7 @@ struct ceph_mount_args {
  * defaults
  */
 #define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_MOUNT_RSIZE_DEFAULT    (128*1024) /* readahead */
+#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
-- 
cgit v1.2.2


From ee7fdfaff7702bd209e3a013b2fc4643233f5465 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 19 Oct 2009 11:41:51 -0700
Subject: ceph: include preferred osd in placement seed

Mix the preferred osd (if any) into the placement seed that is fed into
the CRUSH object placement calculation.  This prevents all the placement
pgs from peering with the same osds.

Rev the osd client protocol with this change.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 2 +-
 fs/ceph/osdmap.c  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 9b16e2e06ea6..f8f27e28a6bf 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -38,7 +38,7 @@
 #define CEPH_OSD_PROTOCOL     7 /* cluster internal */
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL   20 /* server/client */
+#define CEPH_OSDC_PROTOCOL   21 /* server/client */
 #define CEPH_MDSC_PROTOCOL   29 /* server/client */
 #define CEPH_MONC_PROTOCOL   15 /* server/client */
 
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 6f0aeff4185a..72d75a239ac2 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -791,6 +791,8 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 	pgid.pg64 = 0;   /* start with it zeroed out */
 	pgid.pg.ps = ceph_full_name_hash(oid, strlen(oid));
 	pgid.pg.preferred = preferred;
+	if (preferred >= 0)
+		pgid.pg.ps += preferred;
 	pgid.pg.pool = le32_to_cpu(fl->fl_pg_pool);
 	if (preferred >= 0)
 		dout("calc_object_layout '%s' pgid %d.%xp%d (%llx)\n", oid,
-- 
cgit v1.2.2


From bb097ffaf833a40335b6dd5e4fa6f5ed0b223bdc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 19 Oct 2009 16:17:31 -0700
Subject: ceph: v0.17 of client

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index f8f27e28a6bf..ae523828c538 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,8 +19,8 @@
  * Ceph release version
  */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 16
-#define CEPH_VERSION_PATCH 1
+#define CEPH_VERSION_MINOR 17
+#define CEPH_VERSION_PATCH 0
 
 #define _CEPH_STRINGIFY(x) #x
 #define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
-- 
cgit v1.2.2


From 232d4b01319767b3ffa5d08962a81c805962be49 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 21 Oct 2009 11:21:49 -0700
Subject: ceph: move directory size logic to ceph_getattr

We can't fill i_size with rbytes at the fill_file_size stage without
adding additional checks for directories.  Notably, we want st_blocks
to remain 0 on directories so that 'du' still works.

Fill in i_blocks, i_size specially in ceph_getattr instead.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6097af790047..036873c42a78 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -568,8 +568,6 @@ static int fill_inode(struct inode *inode,
 	queue_trunc = ceph_fill_file_size(inode, issued,
 					  le32_to_cpu(info->truncate_seq),
 					  le64_to_cpu(info->truncate_size),
-					  S_ISDIR(inode->i_mode) ?
-					  ci->i_rbytes :
 					  le64_to_cpu(info->size));
 	ceph_fill_file_time(inode, issued,
 			    le32_to_cpu(info->time_warp_seq),
@@ -1603,6 +1601,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		 struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
 	int err;
 
 	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
@@ -1613,8 +1612,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
 			stat->dev = ceph_snap(inode);
 		else
 			stat->dev = 0;
-		if (S_ISDIR(inode->i_mode))
+		if (S_ISDIR(inode->i_mode)) {
+			stat->size = ci->i_rbytes;
+			stat->blocks = 0;
 			stat->blksize = 65536;
+		}
 	}
 	return err;
 }
-- 
cgit v1.2.2


From ecb19c4649d7396737eb0d91a475661fe9d7c028 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 22 Oct 2009 10:53:02 -0700
Subject: ceph: remove small mon addr limit; use CEPH_MAX_MON where appropriate

Get rid of separate max mon limit; use the system limit instead.  This
allows mounts when there are lots of mon addrs provided by mount.ceph (as
with a host with lots of A/AAAA records).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 4 ++--
 fs/ceph/super.h | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index ab950fce4172..81916250f0b6 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -314,7 +314,7 @@ static int parse_mount_args(struct ceph_client *client,
 	int err;
 	substring_t argstr[MAX_OPT_ARGS];
 	int num_mon;
-	struct ceph_entity_addr mon_addr[CEPH_MAX_MON_MOUNT_ADDR];
+	struct ceph_entity_addr mon_addr[CEPH_MAX_MON];
 	int i;
 
 	dout("parse_mount_args dev_name '%s'\n", dev_name);
@@ -344,7 +344,7 @@ static int parse_mount_args(struct ceph_client *client,
 
 	/* get mon ip(s) */
 	err = ceph_parse_ips(dev_name, *path, mon_addr,
-			     CEPH_MAX_MON_MOUNT_ADDR, &num_mon);
+			     CEPH_MAX_MON, &num_mon);
 	if (err < 0)
 		return err;
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 75556e97e865..3af42d9097ec 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -47,8 +47,6 @@
 	(!!((client)->mount_args.flags & CEPH_OPT_##opt))
 
 
-#define CEPH_MAX_MON_MOUNT_ADDR	5
-
 struct ceph_mount_args {
 	int sb_flags;
 	int flags;
-- 
cgit v1.2.2


From 7b813c46021e8f4909772a5bbfb5212bd140764c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 26 Oct 2009 22:07:53 -0700
Subject: ceph: reduce parse_mount_args stack usage

Since we've increased the max mon count, we shouldn't put the addr array
on the parse_mount_args stack.  Put it on the heap instead.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 81916250f0b6..deb51bd6ed83 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -314,12 +314,16 @@ static int parse_mount_args(struct ceph_client *client,
 	int err;
 	substring_t argstr[MAX_OPT_ARGS];
 	int num_mon;
-	struct ceph_entity_addr mon_addr[CEPH_MAX_MON];
+	struct ceph_entity_addr *mon_addr;
 	int i;
 
 	dout("parse_mount_args dev_name '%s'\n", dev_name);
 	memset(args, 0, sizeof(*args));
 
+	mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*mon_addr), GFP_KERNEL);
+	if (!mon_addr)
+		return -ENOMEM;
+
 	/* start with defaults */
 	args->sb_flags = flags;
 	args->flags = CEPH_OPT_DEFAULT;
@@ -333,27 +337,29 @@ static int parse_mount_args(struct ceph_client *client,
 	args->max_readdir = 1024;
 
 	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+	err = -EINVAL;
 	if (!dev_name)
-		return -EINVAL;
+		goto out;
 	*path = strstr(dev_name, ":/");
 	if (*path == NULL) {
 		pr_err("device name is missing path (no :/ in %s)\n",
 		       dev_name);
-		return -EINVAL;
+		goto out;
 	}
 
 	/* get mon ip(s) */
 	err = ceph_parse_ips(dev_name, *path, mon_addr,
 			     CEPH_MAX_MON, &num_mon);
 	if (err < 0)
-		return err;
+		goto out;
 
 	/* build initial monmap */
+	err = -ENOMEM;
 	client->monc.monmap = kzalloc(sizeof(*client->monc.monmap) +
 			       num_mon*sizeof(client->monc.monmap->mon_inst[0]),
 			       GFP_KERNEL);
 	if (!client->monc.monmap)
-		return -ENOMEM;
+		goto out;
 	for (i = 0; i < num_mon; i++) {
 		client->monc.monmap->mon_inst[i].addr = mon_addr[i];
 		client->monc.monmap->mon_inst[i].addr.erank = 0;
@@ -374,11 +380,11 @@ static int parse_mount_args(struct ceph_client *client,
 		int token, intval, ret;
 		if (!*c)
 			continue;
+		err = -EINVAL;
 		token = match_token((char *)c, arg_tokens, argstr);
 		if (token < 0) {
 			pr_err("bad mount option at '%s'\n", c);
-			return -EINVAL;
-
+			goto out;
 		}
 		if (token < Opt_ip) {
 			ret = match_int(&argstr[0], &intval);
@@ -468,8 +474,11 @@ static int parse_mount_args(struct ceph_client *client,
 			BUG_ON(token);
 		}
 	}
+	err = 0;
 
-	return 0;
+out:
+	kfree(mon_addr);
+	return err;
 }
 
 static void release_mount_args(struct ceph_mount_args *args)
-- 
cgit v1.2.2


From 6ca874e92d5e50beb8e351dfd8121947bafc79ec Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 26 Oct 2009 22:06:22 -0700
Subject: ceph: silence uninitialized variable warning

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index deb51bd6ed83..924e6cad0b66 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -808,7 +808,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 	struct ceph_client *client;
 	int err;
 	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
-	const char *path;
+	const char *path = 0;
 
 	dout("ceph_get_sb\n");
 
-- 
cgit v1.2.2


From e53c2fe075feda1fd4f009956ac026dc24c3a199 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 27 Oct 2009 10:19:28 -0700
Subject: ceph: fix, clean up string mount arg parsing

Clearly demark int and string argument options, and do not try to convert
string arguments to ints.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 924e6cad0b66..b094f5003ef8 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -264,9 +264,11 @@ enum {
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_readdir_max_entries,
+	Opt_last_int,
 	/* int args above */
 	Opt_snapdirname,
 	Opt_secret,
+	Opt_last_string,
 	/* string args above */
 	Opt_ip,
 	Opt_noshare,
@@ -386,14 +388,19 @@ static int parse_mount_args(struct ceph_client *client,
 			pr_err("bad mount option at '%s'\n", c);
 			goto out;
 		}
-		if (token < Opt_ip) {
+		if (token < Opt_last_int) {
 			ret = match_int(&argstr[0], &intval);
 			if (ret < 0) {
 				pr_err("bad mount option arg (not int) "
 				       "at '%s'\n", c);
 				continue;
 			}
-			dout("got token %d intval %d\n", token, intval);
+			dout("got int token %d val %d\n", token, intval);
+		} else if (token > Opt_last_int && token < Opt_last_string) {
+			dout("got string token %d val %s\n", token,
+			     argstr[0].from);
+		} else {
+			dout("got token %d\n", token);
 		}
 		switch (token) {
 		case Opt_fsidmajor:
-- 
cgit v1.2.2


From 6b8051855d983db8480ff1ea1b02ef2b49203c22 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 27 Oct 2009 11:50:50 -0700
Subject: ceph: allocate and parse mount args before client instance

This simplifies much of the error handling during mount.  It also means
that we have the mount args before client creation, and we can initialize
based on those options.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c       |   4 +-
 fs/ceph/caps.c       |   4 +-
 fs/ceph/dir.c        |   7 ++--
 fs/ceph/mds_client.c |   6 +--
 fs/ceph/mon_client.c |  38 ++++++++++++++++++
 fs/ceph/osd_client.c |   6 +--
 fs/ceph/super.c      | 106 +++++++++++++++++++++++----------------------------
 fs/ceph/super.h      |   8 ++--
 8 files changed, 105 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c7d673ffe023..bf535815592d 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -600,8 +600,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 		pr_warning("writepage_start %p on forced umount\n", inode);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
-	if (client->mount_args.wsize && client->mount_args.wsize < wsize)
-		wsize = client->mount_args.wsize;
+	if (client->mount_args->wsize && client->mount_args->wsize < wsize)
+		wsize = client->mount_args->wsize;
 	if (wsize < PAGE_CACHE_SIZE)
 		wsize = PAGE_CACHE_SIZE;
 	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7d166182e98d..8b863dbec70c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -270,7 +270,7 @@ static void put_cap(struct ceph_cap *cap,
 	 * lots of free/alloc churn.
 	 */
 	if (caps_avail_count >= caps_reserve_count +
-	    ceph_client(cap->ci->vfs_inode.i_sb)->mount_args.max_readdir) {
+	    ceph_client(cap->ci->vfs_inode.i_sb)->mount_args->max_readdir) {
 		caps_total_count--;
 		kmem_cache_free(ceph_cap_cachep, cap);
 	} else {
@@ -388,7 +388,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
 static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
 			       struct ceph_inode_info *ci)
 {
-	struct ceph_mount_args *ma = &mdsc->client->mount_args;
+	struct ceph_mount_args *ma = mdsc->client->mount_args;
 
 	ci->i_hold_caps_min = round_jiffies(jiffies +
 					    ma->caps_wanted_delay_min * HZ);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 7bb8db524e58..4f7467961b09 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -225,7 +225,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int err;
 	u32 ftype;
 	struct ceph_mds_reply_info_parsed *rinfo;
-	const int max_entries = client->mount_args.max_readdir;
+	const int max_entries = client->mount_args->max_readdir;
 
 	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
 	if (fi->at_end)
@@ -479,7 +479,8 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 	/* .snap dir? */
 	if (err == -ENOENT &&
 	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
-	    strcmp(dentry->d_name.name, client->mount_args.snapdir_name) == 0) {
+	    strcmp(dentry->d_name.name,
+		   client->mount_args->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -550,7 +551,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		spin_lock(&dir->i_lock);
 		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
 		if (strncmp(dentry->d_name.name,
-			    client->mount_args.snapdir_name,
+			    client->mount_args->snapdir_name,
 			    dentry->d_name.len) &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 12d66c0572ac..210cb6623ea2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -943,7 +943,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
 	int err = -ENOMEM;
 
 	if (extra < 0)
-		extra = mdsc->client->mount_args.cap_release_safety;
+		extra = mdsc->client->mount_args->cap_release_safety;
 
 	spin_lock(&session->s_cap_lock);
 
@@ -2601,7 +2601,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 		mutex_unlock(&mdsc->mutex);
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-				    client->mount_args.mount_timeout * HZ);
+				    client->mount_args->mount_timeout * HZ);
 		mutex_lock(&mdsc->mutex);
 
 		/* tear down remaining requests */
@@ -2693,7 +2693,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	int i;
 	int n;
 	struct ceph_client *client = mdsc->client;
-	unsigned long started, timeout = client->mount_args.mount_timeout * HZ;
+	unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index e6e954cac6b9..61263c99c6a8 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -527,6 +527,40 @@ static void delayed_work(struct work_struct *work)
 	mutex_unlock(&monc->mutex);
 }
 
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+	struct ceph_mount_args *args = monc->client->mount_args;
+	struct ceph_entity_addr *mon_addr = args->mon_addr;
+	int num_mon = args->num_mon;
+	int i;
+
+	/* build initial monmap */
+	monc->monmap = kzalloc(sizeof(*monc->monmap) +
+			       num_mon*sizeof(monc->monmap->mon_inst[0]),
+			       GFP_KERNEL);
+	if (!monc->monmap)
+		return -ENOMEM;
+	for (i = 0; i < num_mon; i++) {
+		monc->monmap->mon_inst[i].addr = mon_addr[i];
+		monc->monmap->mon_inst[i].addr.erank = 0;
+		monc->monmap->mon_inst[i].addr.nonce = 0;
+		monc->monmap->mon_inst[i].name.type =
+			CEPH_ENTITY_TYPE_MON;
+		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+	}
+	monc->monmap->num_mon = num_mon;
+
+	/* release addr memory */
+	kfree(args->mon_addr);
+	args->mon_addr = NULL;
+	args->num_mon = 0;
+	return 0;
+}
+
 int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 {
 	int err = 0;
@@ -537,6 +571,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->monmap = NULL;
 	mutex_init(&monc->mutex);
 
+	err = build_initial_monmap(monc);
+	if (err)
+		goto out;
+
 	monc->con = NULL;
 
 	/* msg pools */
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 0a254054a82a..7dc0f6299a52 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -444,7 +444,7 @@ static void register_request(struct ceph_osd_client *osdc,
 	osdc->num_requests++;
 
 	req->r_timeout_stamp =
-		jiffies + osdc->client->mount_args.osd_timeout*HZ;
+		jiffies + osdc->client->mount_args->osd_timeout*HZ;
 
 	if (osdc->num_requests == 1) {
 		osdc->timeout_tid = req->r_tid;
@@ -609,7 +609,7 @@ static int __send_request(struct ceph_osd_client *osdc,
 	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
 	reqhead->reassert_version = req->r_reassert_version;
 
-	req->r_timeout_stamp = jiffies+osdc->client->mount_args.osd_timeout*HZ;
+	req->r_timeout_stamp = jiffies+osdc->client->mount_args->osd_timeout*HZ;
 
 	ceph_msg_get(req->r_request); /* send consumes a ref */
 	ceph_con_send(&req->r_osd->o_con, req->r_request);
@@ -632,7 +632,7 @@ static void handle_timeout(struct work_struct *work)
 		container_of(work, struct ceph_osd_client, timeout_work.work);
 	struct ceph_osd_request *req;
 	struct ceph_osd *osd;
-	unsigned long timeout = osdc->client->mount_args.osd_timeout * HZ;
+	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
 	unsigned long next_timeout = timeout + jiffies;
 	struct rb_node *p;
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b094f5003ef8..9b7815dfc035 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -110,7 +110,7 @@ static int ceph_syncfs(struct super_block *sb, int wait)
 static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 {
 	struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
-	struct ceph_mount_args *args = &client->mount_args;
+	struct ceph_mount_args *args = client->mount_args;
 
 	if (args->flags & CEPH_OPT_FSID)
 		seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
@@ -307,24 +307,24 @@ static match_table_t arg_tokens = {
 };
 
 
-static int parse_mount_args(struct ceph_client *client,
-			    int flags, char *options, const char *dev_name,
-			    const char **path)
+static struct ceph_mount_args *parse_mount_args(int flags, char *options,
+						const char *dev_name,
+						const char **path)
 {
-	struct ceph_mount_args *args = &client->mount_args;
+	struct ceph_mount_args *args;
 	const char *c;
-	int err;
+	int err = -ENOMEM;
 	substring_t argstr[MAX_OPT_ARGS];
-	int num_mon;
-	struct ceph_entity_addr *mon_addr;
-	int i;
 
-	dout("parse_mount_args dev_name '%s'\n", dev_name);
-	memset(args, 0, sizeof(*args));
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return ERR_PTR(-ENOMEM);
+	args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
+				 GFP_KERNEL);
+	if (!args->mon_addr)
+		goto out;
 
-	mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*mon_addr), GFP_KERNEL);
-	if (!mon_addr)
-		return -ENOMEM;
+	dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
 
 	/* start with defaults */
 	args->sb_flags = flags;
@@ -350,29 +350,11 @@ static int parse_mount_args(struct ceph_client *client,
 	}
 
 	/* get mon ip(s) */
-	err = ceph_parse_ips(dev_name, *path, mon_addr,
-			     CEPH_MAX_MON, &num_mon);
+	err = ceph_parse_ips(dev_name, *path, args->mon_addr,
+			     CEPH_MAX_MON, &args->num_mon);
 	if (err < 0)
 		goto out;
 
-	/* build initial monmap */
-	err = -ENOMEM;
-	client->monc.monmap = kzalloc(sizeof(*client->monc.monmap) +
-			       num_mon*sizeof(client->monc.monmap->mon_inst[0]),
-			       GFP_KERNEL);
-	if (!client->monc.monmap)
-		goto out;
-	for (i = 0; i < num_mon; i++) {
-		client->monc.monmap->mon_inst[i].addr = mon_addr[i];
-		client->monc.monmap->mon_inst[i].addr.erank = 0;
-		client->monc.monmap->mon_inst[i].addr.nonce = 0;
-		client->monc.monmap->mon_inst[i].name.type =
-			CEPH_ENTITY_TYPE_MON;
-		client->monc.monmap->mon_inst[i].name.num = cpu_to_le64(i);
-	}
-	client->monc.monmap->num_mon = num_mon;
-	memset(&args->my_addr.in_addr, 0, sizeof(args->my_addr.in_addr));
-
 	/* path on server */
 	*path += 2;
 	dout("server path '%s'\n", *path);
@@ -415,7 +397,7 @@ static int parse_mount_args(struct ceph_client *client,
 					     &args->my_addr,
 					     1, NULL);
 			if (err < 0)
-				return err;
+				goto out;
 			args->flags |= CEPH_OPT_MYIP;
 			break;
 
@@ -481,25 +463,28 @@ static int parse_mount_args(struct ceph_client *client,
 			BUG_ON(token);
 		}
 	}
-	err = 0;
+	return args;
 
 out:
-	kfree(mon_addr);
-	return err;
+	kfree(args->mon_addr);
+	kfree(args);
+	return ERR_PTR(err);
 }
 
-static void release_mount_args(struct ceph_mount_args *args)
+static void destroy_mount_args(struct ceph_mount_args *args)
 {
+	dout("destroy_mount_args %p\n", args);
 	kfree(args->snapdir_name);
 	args->snapdir_name = NULL;
 	kfree(args->secret);
 	args->secret = NULL;
+	kfree(args);
 }
 
 /*
  * create a fresh client instance
  */
-static struct ceph_client *ceph_create_client(void)
+static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 {
 	struct ceph_client *client;
 	int err = -ENOMEM;
@@ -515,6 +500,7 @@ static struct ceph_client *ceph_create_client(void)
 	client->sb = NULL;
 	client->mount_state = CEPH_MOUNT_MOUNTING;
 	client->whoami = -1;
+	client->mount_args = args;
 
 	client->msgr = NULL;
 
@@ -577,7 +563,7 @@ static void ceph_destroy_client(struct ceph_client *client)
 	if (client->wb_pagevec_pool)
 		mempool_destroy(client->wb_pagevec_pool);
 
-	release_mount_args(&client->mount_args);
+	destroy_mount_args(client->mount_args);
 
 	kfree(client);
 	dout("destroy_client %p done\n", client);
@@ -613,7 +599,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
 	req->r_ino1.ino = CEPH_INO_ROOT;
 	req->r_ino1.snap = CEPH_NOSNAP;
 	req->r_started = started;
-	req->r_timeout = client->mount_args.mount_timeout * HZ;
+	req->r_timeout = client->mount_args->mount_timeout * HZ;
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -641,7 +627,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 {
 	struct ceph_entity_addr *myaddr = NULL;
 	int err;
-	unsigned long timeout = client->mount_args.mount_timeout * HZ;
+	unsigned long timeout = client->mount_args->mount_timeout * HZ;
 	unsigned long started = jiffies;  /* note the start time */
 	struct dentry *root;
 
@@ -651,7 +637,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 	/* initialize the messenger */
 	if (client->msgr == NULL) {
 		if (ceph_test_opt(client, MYIP))
-			myaddr = &client->mount_args.my_addr;
+			myaddr = &client->mount_args->my_addr;
 		client->msgr = ceph_messenger_create(myaddr);
 		if (IS_ERR(client->msgr)) {
 			err = PTR_ERR(client->msgr);
@@ -727,7 +713,7 @@ static int ceph_set_super(struct super_block *s, void *data)
 
 	dout("set_super %p data %p\n", s, data);
 
-	s->s_flags = client->mount_args.sb_flags;
+	s->s_flags = client->mount_args->sb_flags;
 	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
 
 	s->s_fs_info = client;
@@ -756,7 +742,7 @@ fail:
 static int ceph_compare_super(struct super_block *sb, void *data)
 {
 	struct ceph_client *new = data;
-	struct ceph_mount_args *args = &new->mount_args;
+	struct ceph_mount_args *args = new->mount_args;
 	struct ceph_client *other = ceph_sb_to_client(sb);
 	int i;
 
@@ -778,7 +764,7 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 		}
 		dout("mon ip matches existing sb %p\n", sb);
 	}
-	if (args->sb_flags != other->mount_args.sb_flags) {
+	if (args->sb_flags != other->mount_args->sb_flags) {
 		dout("flags differ\n");
 		return 0;
 	}
@@ -798,9 +784,9 @@ static int ceph_init_bdi(struct super_block *sb, struct ceph_client *client)
 	sb->s_bdi = &client->backing_dev_info;
 
 	/* set ra_pages based on rsize mount option? */
-	if (client->mount_args.rsize >= PAGE_CACHE_SIZE)
+	if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
 		client->backing_dev_info.ra_pages =
-			(client->mount_args.rsize + PAGE_CACHE_SIZE - 1)
+			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
 
 	err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
@@ -816,19 +802,23 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 	int err;
 	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
 	const char *path = 0;
+	struct ceph_mount_args *args;
 
 	dout("ceph_get_sb\n");
+	args = parse_mount_args(flags, data, dev_name, &path);
+	if (IS_ERR(args)) {
+		err = PTR_ERR(args);
+		goto out_final;
+	}
 
 	/* create client (which we may/may not use) */
-	client = ceph_create_client();
-	if (IS_ERR(client))
-		return PTR_ERR(client);
-
-	err = parse_mount_args(client, flags, data, dev_name, &path);
-	if (err < 0)
-		goto out;
+	client = ceph_create_client(args);
+	if (IS_ERR(client)) {
+		err = PTR_ERR(client);
+		goto out_final;
+	}
 
-	if (client->mount_args.flags & CEPH_OPT_NOSHARE)
+	if (client->mount_args->flags & CEPH_OPT_NOSHARE)
 		compare_super = NULL;
 	sb = sget(fs_type, compare_super, ceph_set_super, client);
 	if (IS_ERR(sb)) {
@@ -846,7 +836,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 		/* set up mempools */
 		err = -ENOMEM;
 		client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-			      client->mount_args.wsize >> PAGE_CACHE_SHIFT);
+			      client->mount_args->wsize >> PAGE_CACHE_SHIFT);
 		if (!client->wb_pagevec_pool)
 			goto out_splat;
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3af42d9097ec..a3d4943581d0 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -42,13 +42,15 @@
 #define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)
 
 #define ceph_set_opt(client, opt) \
-	(client)->mount_args.flags |= CEPH_OPT_##opt;
+	(client)->mount_args->flags |= CEPH_OPT_##opt;
 #define ceph_test_opt(client, opt) \
-	(!!((client)->mount_args.flags & CEPH_OPT_##opt))
+	(!!((client)->mount_args->flags & CEPH_OPT_##opt))
 
 
 struct ceph_mount_args {
 	int sb_flags;
+	int num_mon;
+	struct ceph_entity_addr *mon_addr;
 	int flags;
 	int mount_timeout;
 	int caps_wanted_delay_min, caps_wanted_delay_max;
@@ -115,7 +117,7 @@ struct ceph_client {
 	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
 
 	struct mutex mount_mutex;       /* serialize mount attempts */
-	struct ceph_mount_args mount_args;
+	struct ceph_mount_args *mount_args;
 	struct ceph_fsid fsid;
 
 	struct super_block *sb;
-- 
cgit v1.2.2


From fbbccec9c6218cbc9ff47c6d88bfc6b52079e3ea Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@noahdesu.com>
Date: Wed, 28 Oct 2009 11:54:49 -0700
Subject: ceph: replace list_entry with container_of

Usage of non-list.h list_entry function for container_of
functionality replaced with direct use of container_of.

Signed-off-by: Noah Watkins <noah@noahdesu.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a3d4943581d0..05947b96c524 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -364,7 +364,7 @@ struct ceph_inode_info {
 
 static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
 {
-	return list_entry(inode, struct ceph_inode_info, vfs_inode);
+	return container_of(inode, struct ceph_inode_info, vfs_inode);
 }
 
 static inline void ceph_i_clear(struct inode *inode, unsigned mask)
-- 
cgit v1.2.2


From 35e054a66e07f508aa7cfabc7db1757379093689 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@noahdesu.com>
Date: Wed, 28 Oct 2009 14:04:48 -0700
Subject: ceph: remove redundant use of le32_to_cpu

Using stripe unit size calculated and saved on the stack to avoid
a redundant call to le32_to_cpu.

Signed-off-by: Noah Watkins <noah@noahdesu.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 72d75a239ac2..60012e05bdfd 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -735,7 +735,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
 	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
 	     osize, su);
-	su_per_object = osize / le32_to_cpu(layout->fl_stripe_unit);
+	su_per_object = osize / su;
 	dout("osize %u / su %u = su_per_object %u\n", osize, su,
 	     su_per_object);
 
-- 
cgit v1.2.2


From 5600f5ebd318f7af6f4b19a29f08d18bb85264e5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 28 Oct 2009 14:57:25 -0700
Subject: ceph: correct comment to match striping calculation

The object extent offset is the file offset _modulo_ the stripe unit.
The code was correct, the comment was wrong.

Reported-by: Noah Watkins <jayhawk@soe.ucsc.edu>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 60012e05bdfd..a9a4143234fa 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -752,7 +752,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
 	*bno = objsetno * sc + stripepos;
 	dout("objset %u * sc %u = bno %u\n", objsetno, sc, (unsigned)*bno);
-	/* *oxoff = *off / layout->fl_stripe_unit; */
+	/* *oxoff = *off % layout->fl_stripe_unit; */
 	t = off;
 	*oxoff = do_div(t, su);
 	*oxlen = min_t(u64, *plen, su - *oxoff);
-- 
cgit v1.2.2


From 645a102581b3639836b17d147c35d574fd6e8267 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 28 Oct 2009 15:15:05 -0700
Subject: ceph: fix object striping calculation for non-default striping
 schemes

We were incorrectly calculationing of object offset.  If we have multiple
stripe units per object, we need to shift to the start of the current
su in addition to the offset within the su.

Also rename bno to ono (object number) to avoid some variable naming
confusion.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index a9a4143234fa..5a5520c5a2b3 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -723,7 +723,7 @@ bad:
  */
 void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 				   u64 off, u64 *plen,
-				   u64 *bno,
+				   u64 *ono,
 				   u64 *oxoff, u64 *oxlen)
 {
 	u32 osize = le32_to_cpu(layout->fl_object_size);
@@ -750,11 +750,14 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 	stripepos = bl % sc;
 	objsetno = stripeno / su_per_object;
 
-	*bno = objsetno * sc + stripepos;
-	dout("objset %u * sc %u = bno %u\n", objsetno, sc, (unsigned)*bno);
-	/* *oxoff = *off % layout->fl_stripe_unit; */
+	*ono = objsetno * sc + stripepos;
+	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
 	t = off;
 	*oxoff = do_div(t, su);
+	*oxoff += (stripeno % su_per_object) * su;
+
 	*oxlen = min_t(u64, *plen, su - *oxoff);
 	*plen = *oxlen;
 
-- 
cgit v1.2.2


From ff1d1f7179363209b7f1493ea39b666f50d05cf4 Mon Sep 17 00:00:00 2001
From: Noah Watkins <noah@noahdesu.com>
Date: Fri, 30 Oct 2009 12:57:30 -0700
Subject: ceph: fix intra strip unit length calculation

Commit 645a102581b3639836b17d147c35d574fd6e8267 fixes calculation of object
offset for layouts with multiple stripes per object. This updates the
calculation of the length written to take into account multiple stripes per
object.

Signed-off-by: Noah Watkins <noah@noahdesu.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 5a5520c5a2b3..d62e111b8a34 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -731,7 +731,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 	u32 sc = le32_to_cpu(layout->fl_stripe_count);
 	u32 bl, stripeno, stripepos, objsetno;
 	u32 su_per_object;
-	u64 t;
+	u64 t, su_offset;
 
 	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
 	     osize, su);
@@ -755,10 +755,15 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 
 	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
 	t = off;
-	*oxoff = do_div(t, su);
-	*oxoff += (stripeno % su_per_object) * su;
-
-	*oxlen = min_t(u64, *plen, su - *oxoff);
+	su_offset = do_div(t, su);
+	*oxoff = su_offset + (stripeno % su_per_object) * su;
+
+	/*
+	 * Calculate the length of the extent being written to the selected
+	 * object. This is the minimum of the full length requested (plen) or
+	 * the remainder of the current stripe being written to.
+	 */
+	*oxlen = min_t(u64, *plen, su - su_offset);
 	*plen = *oxlen;
 
 	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
-- 
cgit v1.2.2


From 63ff78b25c4b204075b5b98afcac6ad3639d43fe Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 1 Nov 2009 17:51:15 -0800
Subject: ceph: fix uninitialized err variable

Fixes warning
fs/ceph/xattr.c: In function '__build_xattrs':
fs/ceph/xattr.c:353: warning: 'err' may be used uninitialized in this function

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 65b3a84bbb2e..1a48a55c5109 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -350,7 +350,7 @@ static int __build_xattrs(struct inode *inode)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int xattr_version;
 	struct ceph_inode_xattr **xattrs = NULL;
-	int err;
+	int err = 0;
 	int i;
 
 	dout("__build_xattrs() len=%d\n",
-- 
cgit v1.2.2


From 33aa96e7430d215e2ee779f65cdad0f6d4571fe1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 1 Nov 2009 17:53:24 -0800
Subject: crush: always return a value from crush_bucket_choose

Even when we encounter a corrupt bucket.  We still BUG().  This fixes
the warning

fs/ceph/crush/mapper.c: In function 'crush_choose':
fs/ceph/crush/mapper.c:352: warning: control may reach end of non-void function
'crush_bucket_choose' being inlined

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/crush/mapper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index c268393adfcb..54f3f402af60 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -253,7 +253,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 					   x, r);
 	default:
 		BUG_ON(1);
-/* 		return in->items[0] */;
+ 		return in->items[0];
 	}
 }
 
-- 
cgit v1.2.2


From 859e7b149362475672e2a996f29b8f45cbb34d82 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 2 Nov 2009 09:32:47 -0800
Subject: ceph: init/destroy bdi in client create/destroy helpers

This keeps bdi setup/teardown in line with client life cycle.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9b7815dfc035..0ae40bad53c4 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -508,10 +508,14 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 	client->signed_ticket = NULL;
 	client->signed_ticket_len = 0;
 
+	err = bdi_init(&client->backing_dev_info);
+	if (err < 0)
+		goto fail;
+
 	err = -ENOMEM;
 	client->wb_wq = create_workqueue("ceph-writeback");
 	if (client->wb_wq == NULL)
-		goto fail;
+		goto fail_bdi;
 	client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
 	if (client->pg_inv_wq == NULL)
 		goto fail_wb_wq;
@@ -537,6 +541,8 @@ fail_pg_inv_wq:
 	destroy_workqueue(client->pg_inv_wq);
 fail_wb_wq:
 	destroy_workqueue(client->wb_wq);
+fail_bdi:
+	bdi_destroy(&client->backing_dev_info);
 fail:
 	kfree(client);
 	return ERR_PTR(err);
@@ -774,13 +780,10 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 /*
  * construct our own bdi so we can control readahead, etc.
  */
-static int ceph_init_bdi(struct super_block *sb, struct ceph_client *client)
+static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 {
 	int err;
 
-	err = bdi_init(&client->backing_dev_info);
-	if (err < 0)
-		return err;
 	sb->s_bdi = &client->backing_dev_info;
 
 	/* set ra_pages based on rsize mount option? */
@@ -840,7 +843,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 		if (!client->wb_pagevec_pool)
 			goto out_splat;
 
-		err = ceph_init_bdi(sb, client);
+		err = ceph_register_bdi(sb, client);
 		if (err < 0)
 			goto out_splat;
 	}
-- 
cgit v1.2.2


From 63f2d211954b790fea0a9caeae605c7956535af6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 3 Nov 2009 15:17:56 -0800
Subject: ceph: use fixed endian encoding for ceph_entity_addr

We exchange struct ceph_entity_addr over the wire and store it on disk.
The sockaddr_storage.ss_family field, however, is host endianness.  So,
fix ss_family endianness to big endian when sending/receiving over the
wire.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/decode.h     | 16 ++++++++++++++--
 fs/ceph/mdsmap.c     |  1 +
 fs/ceph/messenger.c  | 23 ++++++++++++++++++-----
 fs/ceph/messenger.h  |  1 +
 fs/ceph/mon_client.c |  2 ++
 fs/ceph/msgr.h       |  2 +-
 fs/ceph/osdmap.c     |  3 +++
 7 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index 91179fb2cc3f..a382aecc55bb 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -76,18 +76,30 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
  * struct ceph_timespec <-> struct timespec
  */
 static inline void ceph_decode_timespec(struct timespec *ts,
-					struct ceph_timespec *tv)
+					const struct ceph_timespec *tv)
 {
 	ts->tv_sec = le32_to_cpu(tv->tv_sec);
 	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
 }
 static inline void ceph_encode_timespec(struct ceph_timespec *tv,
-					struct timespec *ts)
+					const struct timespec *ts)
 {
 	tv->tv_sec = cpu_to_le32(ts->tv_sec);
 	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
 }
 
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+{
+	a->in_addr.ss_family = htons(a->in_addr.ss_family);
+}
+static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+{
+	a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
+}
+
 /*
  * encoders
  */
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 80daea064470..4226c810ce22 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -86,6 +86,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 
 		ceph_decode_need(p, end, sizeof(addr) + 1 + sizeof(u32), bad);
 		ceph_decode_copy(p, &addr, sizeof(addr));
+		ceph_decode_addr(&addr);
 		infoversion = ceph_decode_8(p);
 		namelen = ceph_decode_32(p);  /* skip mds name */
 		*p += namelen;
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index b48abc0b3be7..6ff44bbddf67 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -12,6 +12,7 @@
 
 #include "super.h"
 #include "messenger.h"
+#include "decode.h"
 
 /*
  * Ceph uses the messenger to exchange ceph_msg messages with other
@@ -97,6 +98,12 @@ const char *pr_addr(const struct sockaddr_storage *ss)
 	return s;
 }
 
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+	ceph_encode_addr(&msgr->my_enc_addr);
+}
+
 /*
  * work queue for all reading and writing to/from the socket.
  */
@@ -590,12 +597,12 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 
 	con->out_kvec[0].iov_base = CEPH_BANNER;
 	con->out_kvec[0].iov_len = len;
-	con->out_kvec[1].iov_base = &msgr->inst.addr;
-	con->out_kvec[1].iov_len = sizeof(msgr->inst.addr);
+	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
 	con->out_kvec[2].iov_base = &con->out_connect;
 	con->out_kvec[2].iov_len = sizeof(con->out_connect);
 	con->out_kvec_left = 3;
-	con->out_kvec_bytes = len + sizeof(msgr->inst.addr) +
+	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr) +
 		sizeof(con->out_connect);
 	con->out_kvec_cur = con->out_kvec;
 	con->out_more = 0;
@@ -976,6 +983,9 @@ static int process_connect(struct ceph_connection *con)
 	if (verify_hello(con) < 0)
 		return -1;
 
+	ceph_decode_addr(&con->actual_peer_addr);
+	ceph_decode_addr(&con->peer_addr_for_me);
+
 	/*
 	 * Make sure the other end is who we wanted.  note that the other
 	 * end may not yet know their ip address, so if it's 0.0.0.0, give
@@ -1005,6 +1015,7 @@ static int process_connect(struct ceph_connection *con)
 		       &con->peer_addr_for_me.in_addr,
 		       sizeof(con->peer_addr_for_me.in_addr));
 		addr_set_port(&con->msgr->inst.addr.in_addr, port);
+		encode_my_addr(con->msgr);
 		dout("process_connect learned my addr is %s\n",
 		     pr_addr(&con->msgr->inst.addr.in_addr));
 	}
@@ -1780,6 +1791,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
 	/* select a random nonce */
 	get_random_bytes(&msgr->inst.addr.nonce,
 			 sizeof(msgr->inst.addr.nonce));
+	encode_my_addr(msgr);
 
 	dout("messenger_create %p\n", msgr);
 	return msgr;
@@ -1806,8 +1818,9 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	}
 
 	/* set src+dst */
-	msg->hdr.src = con->msgr->inst;
-	msg->hdr.orig_src = con->msgr->inst;
+	msg->hdr.src.name = con->msgr->inst.name;
+	msg->hdr.src.addr = con->msgr->my_enc_addr;
+	msg->hdr.orig_src = msg->hdr.src;
 	msg->hdr.dst_erank = con->peer_addr.erank;
 
 	/* queue */
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index dcd98b64dca9..e016fa7cf970 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -53,6 +53,7 @@ extern const char *ceph_name_type_str(int t);
 
 struct ceph_messenger {
 	struct ceph_entity_inst inst;    /* my name+address */
+	struct ceph_entity_addr my_enc_addr;
 	struct page *zero_page;          /* used in certain error cases */
 
 	bool nocrc;
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 61263c99c6a8..95b76e761e18 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -59,6 +59,8 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	m->epoch = epoch;
 	m->num_mon = num_mon;
 	ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+	for (i = 0; i < num_mon; i++)
+		ceph_decode_addr(&m->mon_inst[i].addr);
 
 	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
 	     m->num_mon);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 9abc879e25b1..8e3ea2eb1bf5 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -21,7 +21,7 @@
  * whenever the wire protocol changes.  try to keep this string length
  * constant.
  */
-#define CEPH_BANNER "ceph v022"
+#define CEPH_BANNER "ceph v023"
 #define CEPH_BANNER_MAX_LEN 30
 
 
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index d62e111b8a34..cd7bb265d789 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -460,6 +460,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 
 	*p += 4; /* skip length field (should match max) */
 	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+	for (i = 0; i < map->max_osd; i++)
+		ceph_decode_addr(&map->osd_addr[i]);
 
 	/* pg_temp */
 	ceph_decode_32_safe(p, end, len, bad);
@@ -619,6 +621,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		struct ceph_entity_addr addr;
 		ceph_decode_32_safe(p, end, osd, bad);
 		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+		ceph_decode_addr(&addr);
 		pr_info("osd%d up\n", osd);
 		BUG_ON(osd >= map->max_osd);
 		map->osd_state[osd] |= CEPH_OSD_UP;
-- 
cgit v1.2.2


From 51042122d4f85e0f8ee577a4230f172fcc57c456 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 4 Nov 2009 11:39:12 -0800
Subject: ceph: fix endian conversions for ceph_pg

The endian conversions don't quite work with the old union ceph_pg.  Just
make it a regular struct, and make each field __le.  This is simpler and it
has the added bonus of actually working.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ioctl.c      |   4 +--
 fs/ceph/osd_client.c |   8 ++---
 fs/ceph/osdmap.c     | 100 +++++++++++++++++++++++++++++++--------------------
 fs/ceph/osdmap.h     |   5 +--
 fs/ceph/rados.h      |  13 +++----
 5 files changed, 75 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index e4f99eff5d93..4c33e19fc241 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -99,7 +99,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	u64 len = 1, olen;
 	u64 tmp;
 	struct ceph_object_layout ol;
-	union ceph_pg pgid;
+	struct ceph_pg pgid;
 
 	/* copy and validate */
 	if (copy_from_user(&dl, arg, sizeof(dl)))
@@ -121,7 +121,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
 				osdc->osdmap);
 
-	pgid.pg64 = le64_to_cpu(ol.ol_pgid);
+	pgid = ol.ol_pgid;
 	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
 	if (dl.osd >= 0) {
 		struct ceph_entity_addr *a =
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 7dc0f6299a52..7db14ba6261c 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -520,7 +520,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
 		      struct ceph_osd_request *req)
 {
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
-	union ceph_pg pgid;
+	struct ceph_pg pgid;
 	int o = -1;
 	int err;
 	struct ceph_osd *newosd = NULL;
@@ -530,7 +530,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
 				      &req->r_file_layout, osdc->osdmap);
 	if (err)
 		return err;
-	pgid.pg64 = le64_to_cpu(reqhead->layout.ol_pgid);
+	pgid = reqhead->layout.ol_pgid;
 	o = ceph_calc_pg_primary(osdc->osdmap, pgid);
 
 	if ((req->r_osd && req->r_osd->o_osd == o &&
@@ -538,8 +538,8 @@ static int __map_osds(struct ceph_osd_client *osdc,
 	    (req->r_osd == NULL && o == -1))
 		return 0;  /* no change */
 
-	dout("map_osds tid %llu pgid %llx pool %d osd%d (was osd%d)\n",
-	     req->r_tid, pgid.pg64, pgid.pg.pool, o,
+	dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
 	     req->r_osd ? req->r_osd->o_osd : -1);
 
 	if (req->r_osd) {
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cd7bb265d789..8b0cd1107507 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -366,19 +366,33 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 /*
  * Insert a new pg_temp mapping
  */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+	u64 a = *(u64 *)&l;
+	u64 b = *(u64 *)&r;
+
+	if (a < b)
+		return -1;
+	if (a > b)
+		return 1;
+	return 0;
+}
+
 static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 			       struct rb_root *root)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
 	struct ceph_pg_mapping *pg = NULL;
+	int c;
 
 	while (*p) {
 		parent = *p;
 		pg = rb_entry(parent, struct ceph_pg_mapping, node);
-		if (new->pgid < pg->pgid)
+		c = pgid_cmp(new->pgid, pg->pgid);
+		if (c < 0)
 			p = &(*p)->rb_left;
-		else if (new->pgid > pg->pgid)
+		else if (c > 0)
 			p = &(*p)->rb_right;
 		else
 			return -EEXIST;
@@ -467,11 +481,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, len, bad);
 	for (i = 0; i < len; i++) {
 		int n, j;
-		u64 pgid;
+		struct ceph_pg pgid;
 		struct ceph_pg_mapping *pg;
 
 		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
-		pgid = ceph_decode_64(p);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
 		n = ceph_decode_32(p);
 		ceph_decode_need(p, end, n * sizeof(u32), bad);
 		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
@@ -487,7 +501,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		err = __insert_pg_mapping(pg, &map->pg_temp);
 		if (err)
 			goto bad;
-		dout(" added pg_temp %llx len %d\n", pgid, len);
+		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
 	}
 
 	/* crush */
@@ -659,19 +673,20 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	while (len--) {
 		struct ceph_pg_mapping *pg;
 		int j;
-		u64 pgid;
+		struct ceph_pg pgid;
 		u32 pglen;
 		ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
-		pgid = ceph_decode_64(p);
+		ceph_decode_copy(p, &pgid, sizeof(pgid));
 		pglen = ceph_decode_32(p);
 
 		/* remove any? */
-		while (rbp && rb_entry(rbp, struct ceph_pg_mapping,
-				       node)->pgid <= pgid) {
+		while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+						node)->pgid, pgid) <= 0) {
 			struct rb_node *cur = rbp;
 			rbp = rb_next(rbp);
 			dout(" removed pg_temp %llx\n",
-			     rb_entry(cur, struct ceph_pg_mapping, node)->pgid);
+			     *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+					       node)->pgid);
 			rb_erase(cur, &map->pg_temp);
 		}
 
@@ -690,14 +705,16 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			err = __insert_pg_mapping(pg, &map->pg_temp);
 			if (err)
 				goto bad;
-			dout(" added pg_temp %llx len %d\n", pgid, pglen);
+			dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+			     pglen);
 		}
 	}
 	while (rbp) {
 		struct rb_node *cur = rbp;
 		rbp = rb_next(rbp);
 		dout(" removed pg_temp %llx\n",
-		     rb_entry(cur, struct ceph_pg_mapping, node)->pgid);
+		     *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+				       node)->pgid);
 		rb_erase(cur, &map->pg_temp);
 	}
 
@@ -782,16 +799,19 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 			    struct ceph_osdmap *osdmap)
 {
 	unsigned num, num_mask;
-	union ceph_pg pgid;
+	struct ceph_pg pgid;
 	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
 	int poolid = le32_to_cpu(fl->fl_pg_pool);
 	struct ceph_pg_pool_info *pool;
+	unsigned ps;
 
 	if (poolid >= osdmap->num_pools)
 		return -EIO;
-	pool = &osdmap->pg_pool[poolid];
 
+	pool = &osdmap->pg_pool[poolid];
+	ps = ceph_full_name_hash(oid, strlen(oid));
 	if (preferred >= 0) {
+		ps += preferred;
 		num = le32_to_cpu(pool->v.lpg_num);
 		num_mask = pool->lpg_num_mask;
 	} else {
@@ -799,22 +819,17 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 		num_mask = pool->pg_num_mask;
 	}
 
-	pgid.pg64 = 0;   /* start with it zeroed out */
-	pgid.pg.ps = ceph_full_name_hash(oid, strlen(oid));
-	pgid.pg.preferred = preferred;
-	if (preferred >= 0)
-		pgid.pg.ps += preferred;
-	pgid.pg.pool = le32_to_cpu(fl->fl_pg_pool);
+	pgid.ps = cpu_to_le16(ps);
+	pgid.preferred = cpu_to_le16(preferred);
+	pgid.pool = fl->fl_pg_pool;
 	if (preferred >= 0)
-		dout("calc_object_layout '%s' pgid %d.%xp%d (%llx)\n", oid,
-		     pgid.pg.pool, pgid.pg.ps, (int)preferred, pgid.pg64);
+		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+		     (int)preferred);
 	else
-		dout("calc_object_layout '%s' pgid %d.%x (%llx)\n", oid,
-		     pgid.pg.pool, pgid.pg.ps, pgid.pg64);
+		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
 
-	ol->ol_pgid = cpu_to_le64(pgid.pg64);
+	ol->ol_pgid = pgid;
 	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
-
 	return 0;
 }
 
@@ -822,21 +837,24 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
  * Calculate raw osd vector for the given pgid.  Return pointer to osd
  * array, or NULL on failure.
  */
-static int *calc_pg_raw(struct ceph_osdmap *osdmap, union ceph_pg pgid,
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 			int *osds, int *num)
 {
 	struct rb_node *n = osdmap->pg_temp.rb_node;
 	struct ceph_pg_mapping *pg;
 	struct ceph_pg_pool_info *pool;
 	int ruleno;
-	unsigned pps; /* placement ps */
+	unsigned poolid, ps, pps;
+	int preferred;
+	int c;
 
 	/* pg_temp? */
 	while (n) {
 		pg = rb_entry(n, struct ceph_pg_mapping, node);
-		if (pgid.pg64 < pg->pgid)
+		c = pgid_cmp(pgid, pg->pgid);
+		if (c < 0)
 			n = n->rb_left;
-		else if (pgid.pg64 > pg->pgid)
+		else if (c > 0)
 			n = n->rb_right;
 		else {
 			*num = pg->len;
@@ -845,36 +863,40 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, union ceph_pg pgid,
 	}
 
 	/* crush */
-	if (pgid.pg.pool >= osdmap->num_pools)
+	poolid = le32_to_cpu(pgid.pool);
+	ps = le16_to_cpu(pgid.ps);
+	preferred = (s16)le16_to_cpu(pgid.preferred);
+
+	if (poolid >= osdmap->num_pools)
 		return NULL;
-	pool = &osdmap->pg_pool[pgid.pg.pool];
+	pool = &osdmap->pg_pool[poolid];
 	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
 				 pool->v.type, pool->v.size);
 	if (ruleno < 0) {
 		pr_err("no crush rule pool %d type %d size %d\n",
-		       pgid.pg.pool, pool->v.type, pool->v.size);
+		       poolid, pool->v.type, pool->v.size);
 		return NULL;
 	}
 
-	if (pgid.pg.preferred >= 0)
-		pps = ceph_stable_mod(pgid.pg.ps,
+	if (preferred >= 0)
+		pps = ceph_stable_mod(ps,
 				      le32_to_cpu(pool->v.lpgp_num),
 				      pool->lpgp_num_mask);
 	else
-		pps = ceph_stable_mod(pgid.pg.ps,
+		pps = ceph_stable_mod(ps,
 				      le32_to_cpu(pool->v.pgp_num),
 				      pool->pgp_num_mask);
-	pps += pgid.pg.pool;
+	pps += poolid;
 	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
 			     min_t(int, pool->v.size, *num),
-			     pgid.pg.preferred, osdmap->osd_weight);
+			     preferred, osdmap->osd_weight);
 	return osds;
 }
 
 /*
  * Return primary osd for given pgid, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid)
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 {
 	int rawosds[10], *osds;
 	int i, num = ARRAY_SIZE(rawosds);
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 07127c6fb134..c4af8418aa00 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -25,7 +25,7 @@ struct ceph_pg_pool_info {
 
 struct ceph_pg_mapping {
 	struct rb_node node;
-	u64 pgid;
+	struct ceph_pg pgid;
 	int len;
 	int osds[];
 };
@@ -118,6 +118,7 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
 				   const char *oid,
 				   struct ceph_file_layout *fl,
 				   struct ceph_osdmap *osdmap);
-extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
+				struct ceph_pg pgid);
 
 #endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index a48cf4ae391e..85bdef78d142 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -55,13 +55,10 @@ struct ceph_timespec {
  * placement group.
  * we encode this into one __le64.
  */
-union ceph_pg {
-	__u64 pg64;
-	struct {
-		__s16 preferred; /* preferred primary osd */
-		__u16 ps;        /* placement seed */
-		__u32 pool;      /* object pool */
-	} __attribute__ ((packed)) pg;
+struct ceph_pg {
+	__le16 preferred; /* preferred primary osd */
+	__le16 ps;        /* placement seed */
+	__le32 pool;      /* object pool */
 } __attribute__ ((packed));
 
 /*
@@ -117,7 +114,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask)
  * object layout - how a given object should be stored.
  */
 struct ceph_object_layout {
-	__le64 ol_pgid;           /* raw pg, with _full_ ps precision. */
+	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
 	__le32 ol_stripe_unit;    /* for per-object parity, if any */
 } __attribute__ ((packed));
 
-- 
cgit v1.2.2


From 6a18be16f7513ea8a4923c161ce073987932cbdb Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 4 Nov 2009 11:40:05 -0800
Subject: ceph: fix sparse endian warning

Use the __le macro, even though for -1 it doesn't matter.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c  | 2 +-
 fs/ceph/super.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1bd57c8953bf..fc8aff4767d3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -52,7 +52,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
 	req->r_fmode = ceph_flags_to_mode(flags);
 	req->r_args.open.flags = cpu_to_le32(flags);
 	req->r_args.open.mode = cpu_to_le32(create_mode);
-	req->r_args.open.preferred = -1;
+	req->r_args.open.preferred = cpu_to_le32(-1);
 out:
 	return req;
 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0ae40bad53c4..1ac7b07214f3 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -804,7 +804,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 	struct ceph_client *client;
 	int err;
 	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
-	const char *path = 0;
+	const char *path = NULL;
 	struct ceph_mount_args *args;
 
 	dout("ceph_get_sb\n");
-- 
cgit v1.2.2


From f28bcfbe660a3246621a367020054d4f1a179cd9 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 4 Nov 2009 11:46:35 -0800
Subject: ceph: convert port endianness

The port is informational only, but we should make it correct.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 6ff44bbddf67..5cc374850d8f 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -891,9 +891,9 @@ static int addr_port(struct sockaddr_storage *ss)
 {
 	switch (ss->ss_family) {
 	case AF_INET:
-		return ((struct sockaddr_in *)ss)->sin_port;
+		return ntohs(((struct sockaddr_in *)ss)->sin_port);
 	case AF_INET6:
-		return ((struct sockaddr_in6 *)ss)->sin6_port;
+		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
 	}
 	return 0;
 }
-- 
cgit v1.2.2


From 1bdb70e59026838a79f77c440f8fe480a66e65e8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 6 Nov 2009 13:57:49 -0800
Subject: ceph: clean up 'osd%d down' console msg

No ceph prefix.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 8b0cd1107507..a025555b70af 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -648,7 +648,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		u32 osd;
 		ceph_decode_32_safe(p, end, osd, bad);
 		(*p)++;  /* clean flag */
-		pr_info("ceph osd%d down\n", osd);
+		pr_info("osd%d down\n", osd);
 		if (osd < map->max_osd)
 			map->osd_state[osd] &= ~CEPH_OSD_UP;
 	}
-- 
cgit v1.2.2


From c6cf726316abd613cfb7c325d950f3629f964ec6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 6 Nov 2009 16:39:26 -0800
Subject: ceph: make CRUSH hash functions non-inline

These are way to big to be inline.  I missed crush/* when doing the inline
audit for akpm's review.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/Makefile      |  2 +-
 fs/ceph/README        |  1 +
 fs/ceph/crush/crush.c | 11 ++++++
 fs/ceph/crush/crush.h | 11 +-----
 fs/ceph/crush/hash.c  | 86 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/crush/hash.h  | 92 ++++-----------------------------------------------
 6 files changed, 107 insertions(+), 96 deletions(-)
 create mode 100644 fs/ceph/crush/hash.c

(limited to 'fs')

diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 7da6d69dba29..8bad70aac363 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -11,7 +11,7 @@ ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
 	messenger.o msgpool.o buffer.o \
 	mds_client.o mdsmap.o \
 	mon_client.o \
-	osd_client.o osdmap.o crush/crush.o crush/mapper.o \
+	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
 	debugfs.o \
 	ceph_fs.o ceph_strings.o ceph_frag.o
 
diff --git a/fs/ceph/README b/fs/ceph/README
index 231a1df5eafe..660e00074d59 100644
--- a/fs/ceph/README
+++ b/fs/ceph/README
@@ -15,3 +15,4 @@ src/crush/crush.h	    fs/ceph/crush/crush.h
 src/crush/mapper.c	    fs/ceph/crush/mapper.c
 src/crush/mapper.h	    fs/ceph/crush/mapper.h
 src/crush/hash.h	    fs/ceph/crush/hash.h
+src/crush/hash.c	    fs/ceph/crush/hash.c
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
index 13755cdc4fb3..fabd302e5779 100644
--- a/fs/ceph/crush/crush.c
+++ b/fs/ceph/crush/crush.c
@@ -10,6 +10,17 @@
 
 #include "crush.h"
 
+const char *crush_bucket_alg_name(int alg)
+{
+	switch (alg) {
+	case CRUSH_BUCKET_UNIFORM: return "uniform";
+	case CRUSH_BUCKET_LIST: return "list";
+	case CRUSH_BUCKET_TREE: return "tree";
+	case CRUSH_BUCKET_STRAW: return "straw";
+	default: return "unknown";
+	}
+}
+
 /**
  * crush_get_bucket_item_weight - Get weight of an item in given bucket
  * @b: bucket pointer
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
index 9ac7e091126f..92c6b3c3a571 100644
--- a/fs/ceph/crush/crush.h
+++ b/fs/ceph/crush/crush.h
@@ -97,16 +97,7 @@ enum {
 	CRUSH_BUCKET_TREE = 3,
 	CRUSH_BUCKET_STRAW = 4
 };
-static inline const char *crush_bucket_alg_name(int alg)
-{
-	switch (alg) {
-	case CRUSH_BUCKET_UNIFORM: return "uniform";
-	case CRUSH_BUCKET_LIST: return "list";
-	case CRUSH_BUCKET_TREE: return "tree";
-	case CRUSH_BUCKET_STRAW: return "straw";
-	default: return "unknown";
-	}
-}
+extern const char *crush_bucket_alg_name(int alg);
 
 struct crush_bucket {
 	__s32 id;        /* this'll be negative */
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..b438c5d27816
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,86 @@
+
+#include <linux/types.h>
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do {			\
+		a = a-b;  a = a-c;  a = a^(c>>13);	\
+		b = b-c;  b = b-a;  b = b^(a<<8);	\
+		c = c-a;  c = c-b;  c = c^(b>>13);	\
+		a = a-b;  a = a-c;  a = a^(c>>12);	\
+		b = b-c;  b = b-a;  b = b^(a<<16);	\
+		c = c-a;  c = c-b;  c = c^(b>>5);	\
+		a = a-b;  a = a-c;  a = a^(c>>3);	\
+		b = b-c;  b = b-a;  b = b^(a<<10);	\
+		c = c-a;  c = c-b;  c = c^(b>>15);	\
+	} while (0)
+
+#define crush_hash_seed 1315423911
+
+__u32 crush_hash32(__u32 a)
+{
+	__u32 hash = crush_hash_seed ^ a;
+	__u32 b = a;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, a, hash);
+	return hash;
+}
+
+__u32 crush_hash32_2(__u32 a, __u32 b)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(x, a, hash);
+	crush_hashmix(b, y, hash);
+	return hash;
+}
+
+__u32 crush_hash32_3(__u32 a, __u32 b, __u32 c)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	return hash;
+}
+
+__u32 crush_hash32_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(a, x, hash);
+	crush_hashmix(y, b, hash);
+	crush_hashmix(c, x, hash);
+	crush_hashmix(y, d, hash);
+	return hash;
+}
+
+__u32 crush_hash32_5(__u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+	__u32 x = 231232;
+	__u32 y = 1232;
+	crush_hashmix(a, b, hash);
+	crush_hashmix(c, d, hash);
+	crush_hashmix(e, x, hash);
+	crush_hashmix(y, a, hash);
+	crush_hashmix(b, x, hash);
+	crush_hashmix(y, c, hash);
+	crush_hashmix(d, x, hash);
+	crush_hashmix(y, e, hash);
+	return hash;
+}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
index 42f3312783a1..9ce89f85dc7d 100644
--- a/fs/ceph/crush/hash.h
+++ b/fs/ceph/crush/hash.h
@@ -1,90 +1,12 @@
 #ifndef _CRUSH_HASH_H
 #define _CRUSH_HASH_H
 
-/*
- * Robert Jenkins' function for mixing 32-bit values
- * http://burtleburtle.net/bob/hash/evahash.html
- * a, b = random bits, c = input and output
- */
-#define crush_hashmix(a, b, c) do {			\
-		a = a-b;  a = a-c;  a = a^(c>>13);	\
-		b = b-c;  b = b-a;  b = b^(a<<8);	\
-		c = c-a;  c = c-b;  c = c^(b>>13);	\
-		a = a-b;  a = a-c;  a = a^(c>>12);	\
-		b = b-c;  b = b-a;  b = b^(a<<16);	\
-		c = c-a;  c = c-b;  c = c^(b>>5);	\
-		a = a-b;  a = a-c;  a = a^(c>>3);	\
-		b = b-c;  b = b-a;  b = b^(a<<10);	\
-		c = c-a;  c = c-b;  c = c^(b>>15);	\
-	} while (0)
-
-#define crush_hash_seed 1315423911
-
-static inline __u32 crush_hash32(__u32 a)
-{
-	__u32 hash = crush_hash_seed ^ a;
-	__u32 b = a;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, a, hash);
-	return hash;
-}
-
-static inline __u32 crush_hash32_2(__u32 a, __u32 b)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(x, a, hash);
-	crush_hashmix(b, y, hash);
-	return hash;
-}
-
-static inline __u32 crush_hash32_3(__u32 a, __u32 b, __u32 c)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, x, hash);
-	crush_hashmix(y, a, hash);
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, c, hash);
-	return hash;
-}
-
-static inline __u32 crush_hash32_4(__u32 a, __u32 b, __u32 c,
-				   __u32 d)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, d, hash);
-	crush_hashmix(a, x, hash);
-	crush_hashmix(y, b, hash);
-	crush_hashmix(c, x, hash);
-	crush_hashmix(y, d, hash);
-	return hash;
-}
-
-static inline __u32 crush_hash32_5(__u32 a, __u32 b, __u32 c,
-				   __u32 d, __u32 e)
-{
-	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
-	__u32 x = 231232;
-	__u32 y = 1232;
-	crush_hashmix(a, b, hash);
-	crush_hashmix(c, d, hash);
-	crush_hashmix(e, x, hash);
-	crush_hashmix(y, a, hash);
-	crush_hashmix(b, x, hash);
-	crush_hashmix(y, c, hash);
-	crush_hashmix(d, x, hash);
-	crush_hashmix(y, e, hash);
-	return hash;
-}
+extern __u32 crush_hash32(__u32 a);
+extern __u32 crush_hash32_2(__u32 a, __u32 b);
+extern __u32 crush_hash32_3(__u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(__u32 a, __u32 b, __u32 c,
+			    __u32 d);
+extern __u32 crush_hash32_5(__u32 a, __u32 b, __u32 c,
+			    __u32 d, __u32 e);
 
 #endif
-- 
cgit v1.2.2


From cfbbcd24a6bfd794295ee7ad76dfbff40ad6b934 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 6 Nov 2009 16:44:05 -0800
Subject: ceph: use strong hash function for mapping objects to pgs

We were using the (weak) dcache hash function, but it was leaving lower
bits consecutive for consecutive (inode) objects.  We really want to make
the object to pg mapping random and uniform, so use a proper hash function
here.

This is Robert Jenkin's public domain hash function (with some minor
cleanup):
	http://burtleburtle.net/bob/hash/evahash.html

This is a protocol revision.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.c | 93 +++++++++++++++++++++++++++++++++++++++++--------------
 fs/ceph/ceph_fs.h |  2 +-
 2 files changed, 71 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
index a950b4083577..b3ecf1b07521 100644
--- a/fs/ceph/ceph_fs.c
+++ b/fs/ceph/ceph_fs.c
@@ -73,32 +73,79 @@ int ceph_caps_for_mode(int mode)
 	return 0;
 }
 
-/* Name hashing routines. Initial hash value */
-/* Hash courtesy of the R5 hash in reiserfs modulo sign bits */
-#define ceph_init_name_hash()		0
-
-/* partial hash update function. Assume roughly 4 bits per character */
-static unsigned long ceph_partial_name_hash(unsigned long c,
-					    unsigned long prevhash)
-{
-	return (prevhash + (c << 4) + (c >> 4)) * 11;
-}
-
 /*
- * Finally: cut down the number of bits to a int value (and try to avoid
- * losing bits)
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
  */
-static unsigned long ceph_end_name_hash(unsigned long hash)
-{
-	return hash & 0xffffffff;
-}
+#define mix(a, b, c)						\
+	do {							\
+		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+	} while (0)
 
-/* Compute the hash for a name string. */
-unsigned int ceph_full_name_hash(const char *name, unsigned int len)
+unsigned int ceph_full_name_hash(const char *str, unsigned int length)
 {
-	unsigned long hash = ceph_init_name_hash();
-	while (len--)
-		hash = ceph_partial_name_hash(*name++, hash);
-	return ceph_end_name_hash(hash);
+	const unsigned char *k = (const unsigned char *)str;
+	__u32 a, b, c;  /* the internal state */
+	__u32 len;      /* how many key bytes still need mixing */
+
+	/* Set up the internal state */
+	len = length;
+	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+	b = a;
+	c = 0;               /* variable initialization of internal state */
+
+	/* handle most of the key */
+	while (len >= 12) {
+		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+			 ((__u32)k[3] << 24));
+		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+			 ((__u32)k[7] << 24));
+		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+			 ((__u32)k[11] << 24));
+		mix(a, b, c);
+		k = k + 12;
+		len = len - 12;
+	}
+
+	/* handle the last 11 bytes */
+	c = c + length;
+	switch (len) {            /* all the case statements fall through */
+	case 11:
+		c = c + ((__u32)k[10] << 24);
+	case 10:
+		c = c + ((__u32)k[9] << 16);
+	case 9:
+		c = c + ((__u32)k[8] << 8);
+		/* the first byte of c is reserved for the length */
+	case 8:
+		b = b + ((__u32)k[7] << 24);
+	case 7:
+		b = b + ((__u32)k[6] << 16);
+	case 6:
+		b = b + ((__u32)k[5] << 8);
+	case 5:
+		b = b + k[4];
+	case 4:
+		a = a + ((__u32)k[3] << 24);
+	case 3:
+		a = a + ((__u32)k[2] << 16);
+	case 2:
+		a = a + ((__u32)k[1] << 8);
+	case 1:
+		a = a + k[0];
+		/* case 0: nothing left to add */
+	}
+	mix(a, b, c);
+
+	return c;
 }
 
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index ae523828c538..25fc537f4140 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -38,7 +38,7 @@
 #define CEPH_OSD_PROTOCOL     7 /* cluster internal */
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL   21 /* server/client */
+#define CEPH_OSDC_PROTOCOL   22 /* server/client */
 #define CEPH_MDSC_PROTOCOL   29 /* server/client */
 #define CEPH_MONC_PROTOCOL   15 /* server/client */
 
-- 
cgit v1.2.2


From 1654dd0cf5ee1827322aca156af7d96d757201c7 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 6 Nov 2009 21:55:25 -0800
Subject: ceph: make object hash a pg_pool property

The object will be hashed to a placement seed (ps) based on the pg_pool's
hash function.  This allows new hashes to be introduced into an existing
object store, or selection of a hash appropriate to the objects that
will be stored in a particular pool.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/Makefile    |   2 +-
 fs/ceph/README      |   2 +
 fs/ceph/ceph_fs.c   |  77 ----------------------------------
 fs/ceph/ceph_fs.h   |   2 -
 fs/ceph/ceph_hash.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/ceph_hash.h |  13 ++++++
 fs/ceph/osdmap.c    |   2 +-
 fs/ceph/rados.h     |   1 +
 fs/ceph/types.h     |   1 +
 9 files changed, 137 insertions(+), 81 deletions(-)
 create mode 100644 fs/ceph/ceph_hash.c
 create mode 100644 fs/ceph/ceph_hash.h

(limited to 'fs')

diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 8bad70aac363..bdd3e6fc1609 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -13,7 +13,7 @@ ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
 	mon_client.o \
 	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
 	debugfs.o \
-	ceph_fs.o ceph_strings.o ceph_frag.o
+	ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
 
 else
 #Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
index 660e00074d59..18352fab37c0 100644
--- a/fs/ceph/README
+++ b/fs/ceph/README
@@ -10,6 +10,8 @@ src/include/rados.h	    fs/ceph/rados.h
 src/include/ceph_strings.cc fs/ceph/ceph_strings.c
 src/include/ceph_frag.h	    fs/ceph/ceph_frag.h
 src/include/ceph_frag.cc    fs/ceph/ceph_frag.c
+src/include/ceph_hash.h	    fs/ceph/ceph_hash.h
+src/include/ceph_hash.cc    fs/ceph/ceph_hash.c
 src/crush/crush.c	    fs/ceph/crush/crush.c
 src/crush/crush.h	    fs/ceph/crush/crush.h
 src/crush/mapper.c	    fs/ceph/crush/mapper.c
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
index b3ecf1b07521..79d76bc4303f 100644
--- a/fs/ceph/ceph_fs.c
+++ b/fs/ceph/ceph_fs.c
@@ -72,80 +72,3 @@ int ceph_caps_for_mode(int mode)
 	}
 	return 0;
 }
-
-/*
- * Robert Jenkin's hash function.
- * http://burtleburtle.net/bob/hash/evahash.html
- * This is in the public domain.
- */
-#define mix(a, b, c)						\
-	do {							\
-		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
-		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
-		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
-		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
-		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
-	} while (0)
-
-unsigned int ceph_full_name_hash(const char *str, unsigned int length)
-{
-	const unsigned char *k = (const unsigned char *)str;
-	__u32 a, b, c;  /* the internal state */
-	__u32 len;      /* how many key bytes still need mixing */
-
-	/* Set up the internal state */
-	len = length;
-	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
-	b = a;
-	c = 0;               /* variable initialization of internal state */
-
-	/* handle most of the key */
-	while (len >= 12) {
-		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
-			 ((__u32)k[3] << 24));
-		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
-			 ((__u32)k[7] << 24));
-		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
-			 ((__u32)k[11] << 24));
-		mix(a, b, c);
-		k = k + 12;
-		len = len - 12;
-	}
-
-	/* handle the last 11 bytes */
-	c = c + length;
-	switch (len) {            /* all the case statements fall through */
-	case 11:
-		c = c + ((__u32)k[10] << 24);
-	case 10:
-		c = c + ((__u32)k[9] << 16);
-	case 9:
-		c = c + ((__u32)k[8] << 8);
-		/* the first byte of c is reserved for the length */
-	case 8:
-		b = b + ((__u32)k[7] << 24);
-	case 7:
-		b = b + ((__u32)k[6] << 16);
-	case 6:
-		b = b + ((__u32)k[5] << 8);
-	case 5:
-		b = b + k[4];
-	case 4:
-		a = a + ((__u32)k[3] << 24);
-	case 3:
-		a = a + ((__u32)k[2] << 16);
-	case 2:
-		a = a + ((__u32)k[1] << 8);
-	case 1:
-		a = a + k[0];
-		/* case 0: nothing left to add */
-	}
-	mix(a, b, c);
-
-	return c;
-}
-
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 25fc537f4140..36becb024788 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -49,8 +49,6 @@
 #define CEPH_MAX_MON   31
 
 
-unsigned int ceph_full_name_hash(const char *name, unsigned int len);
-
 
 /*
  * ceph_file_layout - describe data layout for a file/inode
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..ac8be54631fe
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include "types.h"
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c)						\
+	do {							\
+		a = a - b;  a = a - c;  a = a ^ (c >> 13);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 8);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 13);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 12);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 16);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 5);	\
+		a = a - b;  a = a - c;  a = a ^ (c >> 3);	\
+		b = b - c;  b = b - a;  b = b ^ (a << 10);	\
+		c = c - a;  c = c - b;  c = c ^ (b >> 15);	\
+	} while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+	const unsigned char *k = (const unsigned char *)str;
+	__u32 a, b, c;  /* the internal state */
+	__u32 len;      /* how many key bytes still need mixing */
+
+	/* Set up the internal state */
+	len = length;
+	a = 0x9e3779b9;      /* the golden ratio; an arbitrary value */
+	b = a;
+	c = 0;               /* variable initialization of internal state */
+
+	/* handle most of the key */
+	while (len >= 12) {
+		a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+			 ((__u32)k[3] << 24));
+		b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+			 ((__u32)k[7] << 24));
+		c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+			 ((__u32)k[11] << 24));
+		mix(a, b, c);
+		k = k + 12;
+		len = len - 12;
+	}
+
+	/* handle the last 11 bytes */
+	c = c + length;
+	switch (len) {            /* all the case statements fall through */
+	case 11:
+		c = c + ((__u32)k[10] << 24);
+	case 10:
+		c = c + ((__u32)k[9] << 16);
+	case 9:
+		c = c + ((__u32)k[8] << 8);
+		/* the first byte of c is reserved for the length */
+	case 8:
+		b = b + ((__u32)k[7] << 24);
+	case 7:
+		b = b + ((__u32)k[6] << 16);
+	case 6:
+		b = b + ((__u32)k[5] << 8);
+	case 5:
+		b = b + k[4];
+	case 4:
+		a = a + ((__u32)k[3] << 24);
+	case 3:
+		a = a + ((__u32)k[2] << 16);
+	case 2:
+		a = a + ((__u32)k[1] << 8);
+	case 1:
+		a = a + k[0];
+		/* case 0: nothing left to add */
+	}
+	mix(a, b, c);
+
+	return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+        unsigned long hash = 0;
+	unsigned char c;
+
+        while (length-- > 0) {
+		c = *str++;
+		hash = (hash + (c << 4) + (c >> 4)) * 11;
+	}
+        return hash;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+	switch (type) {
+	case CEPH_STR_HASH_LINUX:
+		return ceph_str_hash_linux(s, len);
+	case CEPH_STR_HASH_RJENKINS:
+		return ceph_str_hash_rjenkins(s, len);
+	default:
+		return -1;
+	}
+}
+
+const char *ceph_str_hash_name(int type) 
+{
+	switch (type) {
+	case CEPH_STR_HASH_LINUX:
+		return "linux";
+	case CEPH_STR_HASH_RJENKINS:
+		return "rjenkins";
+	default:
+		return "unknown";
+	}
+}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
+#ifndef _FS_CEPH_HASH_H
+#define _FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index a025555b70af..68478270ad70 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -809,7 +809,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 		return -EIO;
 
 	pool = &osdmap->pg_pool[poolid];
-	ps = ceph_full_name_hash(oid, strlen(oid));
+	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
 	if (preferred >= 0) {
 		ps += preferred;
 		num = le32_to_cpu(pool->v.lpg_num);
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 85bdef78d142..fb23ff9297c9 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -84,6 +84,7 @@ struct ceph_pg_pool {
 	__u8 type;                /* CEPH_PG_TYPE_* */
 	__u8 size;                /* number of osds in each pg */
 	__u8 crush_ruleset;       /* crush placement rule */
+	__u8 object_hash;         /* hash mapping object name to ps */
 	__le32 pg_num, pgp_num;   /* number of pg's */
 	__le32 lpg_num, lpgp_num; /* number of localized pg's */
 	__le32 last_change;       /* most recent epoch changed */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
index 8a514568cab2..28b35a005ec2 100644
--- a/fs/ceph/types.h
+++ b/fs/ceph/types.h
@@ -9,6 +9,7 @@
 
 #include "ceph_fs.h"
 #include "ceph_frag.h"
+#include "ceph_hash.h"
 
 /*
  * Identify inodes by both their ino AND snapshot id (a u64).
-- 
cgit v1.2.2


From fb690390e305ea51e1883b105c7d3c52d7100ba5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 7 Nov 2009 20:18:22 -0800
Subject: ceph: make CRUSH hash function a bucket property

Make the integer hash function a property of the bucket it is used on.  This
allows us to gracefully add support for new hash functions without starting
from scatch.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/crush/crush.h  |  3 ++-
 fs/ceph/crush/hash.c   | 73 ++++++++++++++++++++++++++++++++++++++++++++++----
 fs/ceph/crush/hash.h   | 19 ++++++++-----
 fs/ceph/crush/mapper.c | 16 ++++++-----
 fs/ceph/osdmap.c       |  3 ++-
 5 files changed, 93 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
index 92c6b3c3a571..dcd7e7523700 100644
--- a/fs/ceph/crush/crush.h
+++ b/fs/ceph/crush/crush.h
@@ -102,7 +102,8 @@ extern const char *crush_bucket_alg_name(int alg);
 struct crush_bucket {
 	__s32 id;        /* this'll be negative */
 	__u16 type;      /* non-zero; type=0 is reserved for devices */
-	__u16 alg;       /* one of CRUSH_BUCKET_* */
+	__u8 alg;        /* one of CRUSH_BUCKET_* */
+	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
 	__u32 weight;    /* 16-bit fixed point */
 	__u32 size;      /* num items */
 	__s32 *items;
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
index b438c5d27816..5873aed694bf 100644
--- a/fs/ceph/crush/hash.c
+++ b/fs/ceph/crush/hash.c
@@ -1,5 +1,6 @@
 
 #include <linux/types.h>
+#include "hash.h"
 
 /*
  * Robert Jenkins' function for mixing 32-bit values
@@ -20,7 +21,7 @@
 
 #define crush_hash_seed 1315423911
 
-__u32 crush_hash32(__u32 a)
+static __u32 crush_hash32_rjenkins1(__u32 a)
 {
 	__u32 hash = crush_hash_seed ^ a;
 	__u32 b = a;
@@ -31,7 +32,7 @@ __u32 crush_hash32(__u32 a)
 	return hash;
 }
 
-__u32 crush_hash32_2(__u32 a, __u32 b)
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
 {
 	__u32 hash = crush_hash_seed ^ a ^ b;
 	__u32 x = 231232;
@@ -42,7 +43,7 @@ __u32 crush_hash32_2(__u32 a, __u32 b)
 	return hash;
 }
 
-__u32 crush_hash32_3(__u32 a, __u32 b, __u32 c)
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
 {
 	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
 	__u32 x = 231232;
@@ -55,7 +56,7 @@ __u32 crush_hash32_3(__u32 a, __u32 b, __u32 c)
 	return hash;
 }
 
-__u32 crush_hash32_4(__u32 a, __u32 b, __u32 c, __u32 d)
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
 {
 	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
 	__u32 x = 231232;
@@ -69,7 +70,8 @@ __u32 crush_hash32_4(__u32 a, __u32 b, __u32 c, __u32 d)
 	return hash;
 }
 
-__u32 crush_hash32_5(__u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+				      __u32 e)
 {
 	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
 	__u32 x = 231232;
@@ -84,3 +86,64 @@ __u32 crush_hash32_5(__u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
 	crush_hashmix(y, e, hash);
 	return hash;
 }
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1(a);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_2(a, b);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_3(a, b, c);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_4(a, b, c, d);
+	default:
+		return 0;
+	}
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return crush_hash32_rjenkins1_5(a, b, c, d, e);
+	default:
+		return 0;
+	}
+}
+
+const char *crush_hash_name(int type)
+{
+	switch (type) {
+	case CRUSH_HASH_RJENKINS1:
+		return "rjenkins1";
+	default:
+		return "unknown";
+	}
+}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
index 9ce89f85dc7d..ff48e110e4bb 100644
--- a/fs/ceph/crush/hash.h
+++ b/fs/ceph/crush/hash.h
@@ -1,12 +1,17 @@
 #ifndef _CRUSH_HASH_H
 #define _CRUSH_HASH_H
 
-extern __u32 crush_hash32(__u32 a);
-extern __u32 crush_hash32_2(__u32 a, __u32 b);
-extern __u32 crush_hash32_3(__u32 a, __u32 b, __u32 c);
-extern __u32 crush_hash32_4(__u32 a, __u32 b, __u32 c,
-			    __u32 d);
-extern __u32 crush_hash32_5(__u32 a, __u32 b, __u32 c,
-			    __u32 d, __u32 e);
+#define CRUSH_HASH_RJENKINS1   0
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);
+
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+			    __u32 e);
 
 #endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 54f3f402af60..2523d448445c 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -78,7 +78,7 @@ static int bucket_perm_choose(struct crush_bucket *bucket,
 
 		/* optimize common r=0 case */
 		if (pr == 0) {
-			s = crush_hash32_3(x, bucket->id, 0) %
+			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
 				bucket->size;
 			bucket->perm[0] = s;
 			bucket->perm_n = 0xffff;   /* magic value, see below */
@@ -103,7 +103,7 @@ static int bucket_perm_choose(struct crush_bucket *bucket,
 		unsigned p = bucket->perm_n;
 		/* no point in swapping the final entry */
 		if (p < bucket->size - 1) {
-			i = crush_hash32_3(x, bucket->id, p) %
+			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
 				(bucket->size - p);
 			if (i) {
 				unsigned t = bucket->perm[p + i];
@@ -138,8 +138,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
 	int i;
 
 	for (i = bucket->h.size-1; i >= 0; i--) {
-		__u64 w = crush_hash32_4(x, bucket->h.items[i], r,
-					 bucket->h.id);
+		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+					 r, bucket->h.id);
 		w &= 0xffff;
 		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
 			"sw %x rand %llx",
@@ -198,7 +198,8 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
 	while (!terminal(n)) {
 		/* pick point in [0, w) */
 		w = bucket->node_weights[n];
-		t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w;
+		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+					  bucket->h.id) * (__u64)w;
 		t = t >> 32;
 
 		/* descend to the left or right? */
@@ -224,7 +225,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 	__u64 draw;
 
 	for (i = 0; i < bucket->h.size; i++) {
-		draw = crush_hash32_3(x, bucket->h.items[i], r);
+		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
 		draw &= 0xffff;
 		draw *= bucket->straws[i];
 		if (i == 0 || draw > high_draw) {
@@ -267,7 +268,8 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
 		return 0;
 	if (weight[item] == 0)
 		return 1;
-	if ((crush_hash32_2(x, item) & 0xffff) < weight[item])
+	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+	    < weight[item])
 		return 0;
 	return 1;
 }
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 68478270ad70..8c994c714781 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -210,7 +210,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		ceph_decode_need(p, end, 4*sizeof(u32), bad);
 		b->id = ceph_decode_32(p);
 		b->type = ceph_decode_16(p);
-		b->alg = ceph_decode_16(p);
+		b->alg = ceph_decode_8(p);
+		b->hash = ceph_decode_8(p);
 		b->weight = ceph_decode_32(p);
 		b->size = ceph_decode_32(p);
 
-- 
cgit v1.2.2


From 685f9a5d14194fc35db73e5e7370740ccc14b64a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 9 Nov 2009 12:05:48 -0800
Subject: ceph: do not confuse stale and dead (unreconnected) caps

We were using the cap_gen to track both stale caps (caps that timed out
due to temporarily losing touch with the mds) and dead caps that did not
reconnect after an MDS failure.  Introduce a recon_gen counter to track
reconnections to restarted MDSs and kill dead caps based on that instead.

Rename gen to cap_gen while we're at it to make it more clear which is
which.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 20 +++++++++++++++-----
 fs/ceph/mds_client.c |  9 ++++++---
 fs/ceph/mds_client.h |  2 ++
 fs/ceph/super.h      |  4 +++-
 4 files changed, 26 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8b863dbec70c..775e6f6fc970 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -609,7 +609,8 @@ retry:
 	cap->seq = seq;
 	cap->issue_seq = seq;
 	cap->mseq = mseq;
-	cap->gen = session->s_cap_gen;
+	cap->cap_gen = session->s_cap_gen;
+	cap->recon_gen = session->s_recon_gen;
 
 	if (fmode >= 0)
 		__ceph_get_fmode(ci, fmode);
@@ -626,17 +627,25 @@ retry:
 static int __cap_is_valid(struct ceph_cap *cap)
 {
 	unsigned long ttl;
-	u32 gen;
+	u32 gen, recon_gen;
 
 	spin_lock(&cap->session->s_cap_lock);
 	gen = cap->session->s_cap_gen;
+	recon_gen = cap->session->s_recon_gen;
 	ttl = cap->session->s_cap_ttl;
 	spin_unlock(&cap->session->s_cap_lock);
 
-	if (cap->gen < gen || time_after_eq(jiffies, ttl)) {
+	if (cap->recon_gen != recon_gen) {
+		dout("__cap_is_valid %p cap %p issued %s "
+		     "but DEAD (recon_gen %u vs %u)\n", &cap->ci->vfs_inode,
+		     cap, ceph_cap_string(cap->issued), cap->recon_gen,
+		     recon_gen);
+		return 0;
+	}
+	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
 		dout("__cap_is_valid %p cap %p issued %s "
 		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
-		     cap, ceph_cap_string(cap->issued), cap->gen, gen);
+		     cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
 		return 0;
 	}
 
@@ -2203,7 +2212,8 @@ restart:
 	issued = __ceph_caps_issued(ci, &implemented);
 	issued |= implemented | __ceph_caps_dirty(ci);
 
-	cap->gen = session->s_cap_gen;
+	cap->cap_gen = session->s_cap_gen;
+	cap->recon_gen = session->s_recon_gen;
 
 	__check_cap_issue(ci, cap, newcaps);
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 210cb6623ea2..828417ae16f9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -329,6 +329,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
 
 	spin_lock_init(&s->s_cap_lock);
+	s->s_recon_gen = 0;
 	s->s_cap_gen = 0;
 	s->s_cap_ttl = 0;
 	s->s_renew_requested = 0;
@@ -738,10 +739,11 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 	struct ceph_mds_session *session = arg;
 
 	spin_lock(&inode->i_lock);
-	if (cap->gen != session->s_cap_gen) {
+	if (cap->recon_gen != session->s_recon_gen) {
 		pr_err("failed reconnect %p %llx.%llx cap %p "
-		       "(gen %d < session %d)\n", inode, ceph_vinop(inode),
-		       cap, cap->gen, session->s_cap_gen);
+		       "(recon_gen %d < session %d)\n", inode,
+		       ceph_vinop(inode), cap,
+		       cap->recon_gen, session->s_recon_gen);
 		__ceph_remove_cap(cap, NULL);
 	}
 	wake_up(&ceph_inode(inode)->i_cap_wq);
@@ -2050,6 +2052,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 
 		session->s_state = CEPH_MDS_SESSION_RECONNECTING;
 		session->s_seq = 0;
+		session->s_recon_gen++;
 
 		ceph_con_open(&session->s_con,
 			      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f566e9c84295..c0846b1c482b 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -98,6 +98,8 @@ struct ceph_mds_session {
 	u64               s_seq;      /* incoming msg seq # */
 	struct mutex      s_mutex;    /* serialize session messages */
 
+	int               s_recon_gen; /* inc on reconnect to recovered mds */
+
 	struct ceph_connection s_con;
 
 	/* protected by s_cap_lock */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 05947b96c524..25793559a2e5 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -169,7 +169,9 @@ struct ceph_cap {
 	int issued;       /* latest, from the mds */
 	int implemented;  /* implemented superset of issued (for revocation) */
 	int mds_wanted;
-	u32 seq, issue_seq, mseq, gen;
+	u32 seq, issue_seq, mseq;
+	u32 cap_gen;      /* active/stale cycle */
+	u32 recon_gen;    /* mds restart reconnect cycle */
 	unsigned long last_used;
 	struct list_head caps_item;
 };
-- 
cgit v1.2.2


From eed0ef2caf928327332da54d23579debe629d5bc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 10 Nov 2009 14:34:36 -0800
Subject: ceph: separate banner and connect during handshake into distinct
 stages

We need to make sure we only swab the address during the banner once.  So
break process_banner out of process_connect, and clean up the surrounding
code so that these are distinct phases of the handshake.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 117 +++++++++++++++++++++++++++++++++-------------------
 fs/ceph/messenger.h |   4 +-
 2 files changed, 77 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 5cc374850d8f..e389656b2a6b 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -564,10 +564,26 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 /*
  * We connected to a peer and are saying hello.
  */
-static void prepare_write_connect(struct ceph_messenger *msgr,
-				  struct ceph_connection *con)
+static void prepare_write_banner(struct ceph_messenger *msgr,
+				 struct ceph_connection *con)
 {
 	int len = strlen(CEPH_BANNER);
+
+	con->out_kvec[0].iov_base = CEPH_BANNER;
+	con->out_kvec[0].iov_len = len;
+	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+	con->out_kvec_left = 2;
+	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+	con->out_kvec_cur = con->out_kvec;
+	con->out_more = 0;
+	set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect(struct ceph_messenger *msgr,
+				  struct ceph_connection *con,
+				  int after_banner)
+{
 	unsigned global_seq = get_global_seq(con->msgr, 0);
 	int proto;
 
@@ -595,32 +611,14 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 	if (test_bit(LOSSYTX, &con->state))
 		con->out_connect.flags = CEPH_MSG_CONNECT_LOSSY;
 
-	con->out_kvec[0].iov_base = CEPH_BANNER;
-	con->out_kvec[0].iov_len = len;
-	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
-	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
-	con->out_kvec[2].iov_base = &con->out_connect;
-	con->out_kvec[2].iov_len = sizeof(con->out_connect);
-	con->out_kvec_left = 3;
-	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr) +
-		sizeof(con->out_connect);
-	con->out_kvec_cur = con->out_kvec;
-	con->out_more = 0;
-	set_bit(WRITE_PENDING, &con->state);
-}
-
-static void prepare_write_connect_retry(struct ceph_messenger *msgr,
-					struct ceph_connection *con)
-{
-	dout("prepare_write_connect_retry %p\n", con);
-	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-	con->out_connect.global_seq =
-		cpu_to_le32(get_global_seq(con->msgr, 0));
-
-	con->out_kvec[0].iov_base = &con->out_connect;
-	con->out_kvec[0].iov_len = sizeof(con->out_connect);
-	con->out_kvec_left = 1;
-	con->out_kvec_bytes = sizeof(con->out_connect);
+	if (!after_banner) {
+		con->out_kvec_left = 0;
+		con->out_kvec_bytes = 0;
+	}
+	con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+	con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+	con->out_kvec_left++;
+	con->out_kvec_bytes += sizeof(con->out_connect);
 	con->out_kvec_cur = con->out_kvec;
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
@@ -778,6 +776,12 @@ out:
 /*
  * Prepare to read connection handshake, or an ack.
  */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+	dout("prepare_read_banner %p\n", con);
+	con->in_base_pos = 0;
+}
+
 static void prepare_read_connect(struct ceph_connection *con)
 {
 	dout("prepare_read_connect %p\n", con);
@@ -829,11 +833,11 @@ static int read_partial(struct ceph_connection *con,
 /*
  * Read all or part of the connect-side handshake on a new connection
  */
-static int read_partial_connect(struct ceph_connection *con)
+static int read_partial_banner(struct ceph_connection *con)
 {
 	int ret, to = 0;
 
-	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
 
 	/* peer's banner */
 	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
@@ -847,6 +851,16 @@ static int read_partial_connect(struct ceph_connection *con)
 			   &con->peer_addr_for_me);
 	if (ret <= 0)
 		goto out;
+out:
+	return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+	int ret, to = 0;
+
+	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
 	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
 	if (ret <= 0)
 		goto out;
@@ -856,6 +870,7 @@ static int read_partial_connect(struct ceph_connection *con)
 	     le32_to_cpu(con->in_reply.global_seq));
 out:
 	return ret;
+
 }
 
 /*
@@ -976,9 +991,9 @@ bad:
 	return -EINVAL;
 }
 
-static int process_connect(struct ceph_connection *con)
+static int process_banner(struct ceph_connection *con)
 {
-	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+	dout("process_banner on %p\n", con);
 
 	if (verify_hello(con) < 0)
 		return -1;
@@ -1016,10 +1031,19 @@ static int process_connect(struct ceph_connection *con)
 		       sizeof(con->peer_addr_for_me.in_addr));
 		addr_set_port(&con->msgr->inst.addr.in_addr, port);
 		encode_my_addr(con->msgr);
-		dout("process_connect learned my addr is %s\n",
+		dout("process_banner learned my addr is %s\n",
 		     pr_addr(&con->msgr->inst.addr.in_addr));
 	}
 
+	set_bit(NEGOTIATING, &con->state);
+	prepare_read_connect(con);
+	return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
 	switch (con->in_reply.tag) {
 	case CEPH_MSGR_TAG_BADPROTOVER:
 		dout("process_connect got BADPROTOVER my %d != their %d\n",
@@ -1053,7 +1077,7 @@ static int process_connect(struct ceph_connection *con)
 		       ENTITY_NAME(con->peer_name),
 		       pr_addr(&con->peer_addr.in_addr));
 		reset_connection(con);
-		prepare_write_connect_retry(con->msgr, con);
+		prepare_write_connect(con->msgr, con, 0);
 		prepare_read_connect(con);
 
 		/* Tell ceph about it. */
@@ -1071,7 +1095,7 @@ static int process_connect(struct ceph_connection *con)
 		     le32_to_cpu(con->out_connect.connect_seq),
 		     le32_to_cpu(con->in_connect.connect_seq));
 		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
-		prepare_write_connect_retry(con->msgr, con);
+		prepare_write_connect(con->msgr, con, 0);
 		prepare_read_connect(con);
 		break;
 
@@ -1080,19 +1104,17 @@ static int process_connect(struct ceph_connection *con)
 		 * If we sent a smaller global_seq than the peer has, try
 		 * again with a larger value.
 		 */
-		dout("process_connect got RETRY_GLOBAL my %u, peer_gseq = %u\n",
+		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
 		     con->peer_global_seq,
 		     le32_to_cpu(con->in_connect.global_seq));
 		get_global_seq(con->msgr,
 			       le32_to_cpu(con->in_connect.global_seq));
-		prepare_write_connect_retry(con->msgr, con);
+		prepare_write_connect(con->msgr, con, 0);
 		prepare_read_connect(con);
 		break;
 
 	case CEPH_MSGR_TAG_READY:
 		clear_bit(CONNECTING, &con->state);
-		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			set_bit(LOSSYRX, &con->state);
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
 		dout("process_connect got READY gseq %d cseq %d (%d)\n",
@@ -1420,9 +1442,11 @@ more:
 		if (test_and_clear_bit(STANDBY, &con->state))
 			con->connect_seq++;
 
-		prepare_write_connect(msgr, con);
-		prepare_read_connect(con);
+		prepare_write_banner(msgr, con);
+		prepare_write_connect(msgr, con, 1);
+		prepare_read_banner(con);
 		set_bit(CONNECTING, &con->state);
+		clear_bit(NEGOTIATING, &con->state);
 
 		con->in_tag = CEPH_MSGR_TAG_READY;
 		dout("try_write initiating connect on %p new state %lu\n",
@@ -1521,7 +1545,16 @@ more:
 	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
 	     con->in_base_pos);
 	if (test_bit(CONNECTING, &con->state)) {
-		dout("try_read connecting\n");
+		if (!test_bit(NEGOTIATING, &con->state)) {
+			dout("try_read connecting\n");
+			ret = read_partial_banner(con);
+			if (ret <= 0)
+				goto done;
+			if (process_banner(con) < 0) {
+				ret = -1;
+				goto out;
+			}
+		}
 		ret = read_partial_connect(con);
 		if (ret <= 0)
 			goto done;
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index e016fa7cf970..80f7e1e94448 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -104,8 +104,8 @@ struct ceph_msg_pos {
  * thread is currently opening, reading or writing data to the socket.
  */
 #define LOSSYTX         0  /* we can close channel or drop messages on errors */
-#define LOSSYRX         1  /* peer may reset/drop messages */
-#define CONNECTING	2
+#define CONNECTING	1
+#define NEGOTIATING	2
 #define KEEPALIVE_PENDING      3
 #define WRITE_PENDING	4  /* we have data ready to send */
 #define QUEUED          5  /* there is work queued on this connection */
-- 
cgit v1.2.2


From cdac830313fa6bf2831693af80fefe4aaac11b7d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 10 Nov 2009 16:02:23 -0800
Subject: ceph: remove recon_gen logic

We don't get an explicit affirmative confirmation that our caps reconnect,
nor do we necessarily want to pay that cost.  So, take all this code out
for now.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 12 +-----------
 fs/ceph/mds_client.c | 15 +--------------
 fs/ceph/mds_client.h |  2 --
 fs/ceph/super.h      |  1 -
 4 files changed, 2 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 775e6f6fc970..d8132b6e770d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -610,7 +610,6 @@ retry:
 	cap->issue_seq = seq;
 	cap->mseq = mseq;
 	cap->cap_gen = session->s_cap_gen;
-	cap->recon_gen = session->s_recon_gen;
 
 	if (fmode >= 0)
 		__ceph_get_fmode(ci, fmode);
@@ -627,21 +626,13 @@ retry:
 static int __cap_is_valid(struct ceph_cap *cap)
 {
 	unsigned long ttl;
-	u32 gen, recon_gen;
+	u32 gen;
 
 	spin_lock(&cap->session->s_cap_lock);
 	gen = cap->session->s_cap_gen;
-	recon_gen = cap->session->s_recon_gen;
 	ttl = cap->session->s_cap_ttl;
 	spin_unlock(&cap->session->s_cap_lock);
 
-	if (cap->recon_gen != recon_gen) {
-		dout("__cap_is_valid %p cap %p issued %s "
-		     "but DEAD (recon_gen %u vs %u)\n", &cap->ci->vfs_inode,
-		     cap, ceph_cap_string(cap->issued), cap->recon_gen,
-		     recon_gen);
-		return 0;
-	}
 	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
 		dout("__cap_is_valid %p cap %p issued %s "
 		     "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
@@ -2213,7 +2204,6 @@ restart:
 	issued |= implemented | __ceph_caps_dirty(ci);
 
 	cap->cap_gen = session->s_cap_gen;
-	cap->recon_gen = session->s_recon_gen;
 
 	__check_cap_issue(ci, cap, newcaps);
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 828417ae16f9..aad10d90feee 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -329,7 +329,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
 
 	spin_lock_init(&s->s_cap_lock);
-	s->s_recon_gen = 0;
 	s->s_cap_gen = 0;
 	s->s_cap_ttl = 0;
 	s->s_renew_requested = 0;
@@ -736,25 +735,14 @@ static void remove_session_caps(struct ceph_mds_session *session)
 static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 			      void *arg)
 {
-	struct ceph_mds_session *session = arg;
-
-	spin_lock(&inode->i_lock);
-	if (cap->recon_gen != session->s_recon_gen) {
-		pr_err("failed reconnect %p %llx.%llx cap %p "
-		       "(recon_gen %d < session %d)\n", inode,
-		       ceph_vinop(inode), cap,
-		       cap->recon_gen, session->s_recon_gen);
-		__ceph_remove_cap(cap, NULL);
-	}
 	wake_up(&ceph_inode(inode)->i_cap_wq);
-	spin_unlock(&inode->i_lock);
 	return 0;
 }
 
 static void wake_up_session_caps(struct ceph_mds_session *session)
 {
 	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
-	iterate_session_caps(session, wake_up_session_cb, session);
+	iterate_session_caps(session, wake_up_session_cb, NULL);
 }
 
 /*
@@ -2052,7 +2040,6 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 
 		session->s_state = CEPH_MDS_SESSION_RECONNECTING;
 		session->s_seq = 0;
-		session->s_recon_gen++;
 
 		ceph_con_open(&session->s_con,
 			      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c0846b1c482b..f566e9c84295 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -98,8 +98,6 @@ struct ceph_mds_session {
 	u64               s_seq;      /* incoming msg seq # */
 	struct mutex      s_mutex;    /* serialize session messages */
 
-	int               s_recon_gen; /* inc on reconnect to recovered mds */
-
 	struct ceph_connection s_con;
 
 	/* protected by s_cap_lock */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 25793559a2e5..06b62c02f513 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -171,7 +171,6 @@ struct ceph_cap {
 	int mds_wanted;
 	u32 seq, issue_seq, mseq;
 	u32 cap_gen;      /* active/stale cycle */
-	u32 recon_gen;    /* mds restart reconnect cycle */
 	unsigned long last_used;
 	struct list_head caps_item;
 };
-- 
cgit v1.2.2


From 09b8a7d2af83ae96dc052f9708e50140d06a9b6c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 11 Nov 2009 15:21:27 -0800
Subject: ceph: exclude snapdir from readdir results

It was hidden from sync readdir, but not the cached dcache version.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4f7467961b09..32ef54367224 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -131,6 +131,7 @@ more:
 			goto out_unlock;
 		}
 		if (!d_unhashed(dentry) && dentry->d_inode &&
+		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
 		    filp->f_pos <= di->offset)
 			break;
 		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
-- 
cgit v1.2.2


From b377ff13b31778c19203f3089d14080beb40a692 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 11 Nov 2009 15:22:37 -0800
Subject: ceph: initialize i_size/i_rbytes on snapdir

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 036873c42a78..074ee42bd344 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -62,6 +62,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
 		.snap = CEPH_SNAPDIR,
 	};
 	struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+	struct ceph_inode_info *ci = ceph_inode(inode);
 
 	BUG_ON(!S_ISDIR(parent->i_mode));
 	if (IS_ERR(inode))
@@ -71,7 +72,8 @@ struct inode *ceph_get_snapdir(struct inode *parent)
 	inode->i_gid = parent->i_gid;
 	inode->i_op = &ceph_dir_iops;
 	inode->i_fop = &ceph_dir_fops;
-	ceph_inode(inode)->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+	ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+	ci->i_rbytes = 0;
 	return inode;
 }
 
-- 
cgit v1.2.2


From fef320ff8887c702cde7ca6b8dbfff3a341d49fe Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 11 Nov 2009 15:50:12 -0800
Subject: ceph: pr_info when mds reconnect completes

This helps the user know what's going on during the (involved) reconnect
process.  They already see when the mds fails and reconnect starts.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index aad10d90feee..44cac576f15e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2234,6 +2234,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 		 */
 		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
 		    newstate >= CEPH_MDS_STATE_ACTIVE) {
+			pr_info("mds%d reconnect completed\n", s->s_mds);
 			kick_requests(mdsc, i, 1);
 			ceph_kick_flushing_caps(mdsc, s);
 		}
-- 
cgit v1.2.2


From 039934b895c89c2bb40aa5132efe00e60b70efca Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 12 Nov 2009 15:05:52 -0800
Subject: ceph: build cleanly without CONFIG_DEBUG_FS

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c    | 23 +++++++++++++++++++++++
 fs/ceph/mds_client.h |  2 ++
 fs/ceph/mon_client.h |  2 ++
 fs/ceph/osd_client.h |  2 ++
 fs/ceph/super.h      |  2 ++
 5 files changed, 31 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 9edbad32f116..9b2020696680 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -8,6 +8,8 @@
 #include "super.h"
 #include "mds_client.h"
 
+#ifdef CONFIG_DEBUG_FS
+
 /*
  * Implement /sys/kernel/debug/ceph fun
  *
@@ -423,3 +425,24 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
 	debugfs_remove(client->debugfs_dir);
 }
 
+#else  // CONFIG_DEBUG_FS
+
+int __init ceph_debugfs_init(void)
+{
+	return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+	return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif  // CONFIG_DEBUG_FS
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f566e9c84295..0751b821f231 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -256,7 +256,9 @@ struct ceph_mds_client {
 	spinlock_t        cap_dirty_lock;   /* protects above items */
 	wait_queue_head_t cap_flushing_wq;
 
+#ifdef CONFIG_DEBUG_FS
 	struct dentry 	  *debugfs_file;
+#endif
 
 	spinlock_t	  dentry_lru_lock;
 	struct list_head  dentry_lru;
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 5258c5693b03..9f6db45bf469 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -78,7 +78,9 @@ struct ceph_mon_client {
 	int want_next_osdmap; /* 1 = want, 2 = want+asked */
 	u32 have_osdmap, have_mdsmap;
 
+#ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs_file;
+#endif
 };
 
 extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 9a4addf7d651..766c8dc80aff 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -83,7 +83,9 @@ struct ceph_osd_client {
 	struct rb_root         requests;      /* pending requests */
 	int                    num_requests;
 	struct delayed_work    timeout_work;
+#ifdef CONFIG_DEBUG_FS
 	struct dentry 	       *debugfs_file;
+#endif
 
 	mempool_t              *req_mempool;
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 06b62c02f513..8aa1ffba6c0d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -112,9 +112,11 @@ static inline unsigned long time_sub(unsigned long a, unsigned long b)
  */
 struct ceph_client {
 	__s64 whoami;                   /* my client number */
+#ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs_monmap;
 	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
 	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+#endif
 
 	struct mutex mount_mutex;       /* serialize mount attempts */
 	struct ceph_mount_args *mount_args;
-- 
cgit v1.2.2


From 11ea8eda064aa4dc6e44a6dade1891b69ebd5255 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 12 Nov 2009 15:07:24 -0800
Subject: ceph: fix page invalidation deadlock

We occasionally want to make a best-effort attempt to invalidate cache
pages without fear of blocking.  If this fails, we fall back to an async
invalidate in another thread.

Use invalidate_mapping_pages instead of invalidate_inode_page2, as that
will skip locked pages, and not deadlock.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d8132b6e770d..9dd110602cda 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1440,7 +1440,7 @@ retry_locked:
 
 		dout("check_caps trying to invalidate on %p\n", inode);
 		spin_unlock(&inode->i_lock);
-		ret = invalidate_inode_pages2(&inode->i_data);
+		ret = invalidate_mapping_pages(&inode->i_data, 0, -1);
 		spin_lock(&inode->i_lock);
 		if (ret == 0 && invalidating_gen == ci->i_rdcache_gen) {
 			/* success. */
@@ -2180,7 +2180,7 @@ restart:
 		spin_unlock(&inode->i_lock);
 		tried_invalidate = 1;
 
-		ret = invalidate_inode_pages2(&inode->i_data);
+		ret = invalidate_mapping_pages(&inode->i_data, 0, -1);
 		spin_lock(&inode->i_lock);
 		if (ret < 0) {
 			/* there were locked pages.. invalidate later
-- 
cgit v1.2.2


From 42ce56e50d948fc7c1c8c3a749ee5a21a7e134f6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 18 Nov 2009 11:22:36 -0800
Subject: ceph: remove bad calls to ceph_con_shutdown

We want to ceph_con_close when we're done with the connection, before
the ref count reaches 0.  Once it does, do not call ceph_con_shutdown,
as that takes the con mutex and may sleep, and besides that is
unnecessary.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 30 ++++++++++++++++++------------
 fs/ceph/messenger.h  |  1 -
 fs/ceph/osd_client.c |  4 +---
 3 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 44cac576f15e..fdecf9984180 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -274,10 +274,8 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 {
 	dout("mdsc put_session %p %d -> %d\n", s,
 	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
-	if (atomic_dec_and_test(&s->s_ref)) {
-		ceph_con_shutdown(&s->s_con);
+	if (atomic_dec_and_test(&s->s_ref))
 		kfree(s);
-	}
 }
 
 /*
@@ -326,7 +324,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_con.ops = &mds_con_ops;
 	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
 	s->s_con.peer_name.num = cpu_to_le64(mds);
-	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
 
 	spin_lock_init(&s->s_cap_lock);
 	s->s_cap_gen = 0;
@@ -352,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 		dout("register_session realloc to %d\n", newmax);
 		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
 		if (sa == NULL)
-			return ERR_PTR(-ENOMEM);
+			goto fail_realloc;
 		if (mdsc->sessions) {
 			memcpy(sa, mdsc->sessions,
 			       mdsc->max_sessions * sizeof(void *));
@@ -363,17 +360,26 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	}
 	mdsc->sessions[mds] = s;
 	atomic_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
+
+	ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
 	return s;
+
+fail_realloc:
+	kfree(s);
+	return ERR_PTR(-ENOMEM);
 }
 
 /*
  * called under mdsc->mutex
  */
-static void unregister_session(struct ceph_mds_client *mdsc, int mds)
+static void unregister_session(struct ceph_mds_client *mdsc,
+			       struct ceph_mds_session *s)
 {
-	dout("unregister_session mds%d %p\n", mds, mdsc->sessions[mds]);
-	ceph_put_mds_session(mdsc->sessions[mds]);
-	mdsc->sessions[mds] = NULL;
+	dout("unregister_session mds%d %p\n", s->s_mds, s);
+	mdsc->sessions[s->s_mds] = NULL;
+	ceph_con_close(&s->s_con);
+	ceph_put_mds_session(s);
 }
 
 /*
@@ -1870,7 +1876,7 @@ static void handle_session(struct ceph_mds_session *session,
 		break;
 
 	case CEPH_SESSION_CLOSE:
-		unregister_session(mdsc, mds);
+		unregister_session(mdsc, session);
 		remove_session_caps(session);
 		wake = 1; /* for good measure */
 		complete(&mdsc->session_close_waiters);
@@ -2199,7 +2205,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 				/* the session never opened, just close it
 				 * out now */
 				__wake_requests(mdsc, &s->s_waiting);
-				unregister_session(mdsc, i);
+				unregister_session(mdsc, s);
 			} else {
 				/* just close it */
 				mutex_unlock(&mdsc->mutex);
@@ -2724,7 +2730,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	for (i = 0; i < mdsc->max_sessions; i++) {
 		if (mdsc->sessions[i]) {
 			session = get_session(mdsc->sessions[i]);
-			unregister_session(mdsc, i);
+			unregister_session(mdsc, session);
 			mutex_unlock(&mdsc->mutex);
 			mutex_lock(&session->s_mutex);
 			remove_session_caps(session);
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 80f7e1e94448..4bd85c36308e 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -212,7 +212,6 @@ extern void ceph_messenger_destroy(struct ceph_messenger *);
 
 extern void ceph_con_init(struct ceph_messenger *msgr,
 			  struct ceph_connection *con);
-extern void ceph_con_shutdown(struct ceph_connection *con);
 extern void ceph_con_open(struct ceph_connection *con,
 			  struct ceph_entity_addr *addr);
 extern void ceph_con_close(struct ceph_connection *con);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 7db14ba6261c..bcb9fe693076 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -350,10 +350,8 @@ static void put_osd(struct ceph_osd *osd)
 {
 	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
 	     atomic_read(&osd->o_ref) - 1);
-	if (atomic_dec_and_test(&osd->o_ref)) {
-		ceph_con_shutdown(&osd->o_con);
+	if (atomic_dec_and_test(&osd->o_ref))
 		kfree(osd);
-	}
 }
 
 /*
-- 
cgit v1.2.2


From 71ececdacae24be333c534869cb1b06357f0e215 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 18 Nov 2009 11:27:06 -0800
Subject: ceph: remove unnecessary ceph_con_shutdown

We require that ceph_con_close be called before we drop the connection,
so this is unneeded.  Just BUG if con->sock != NULL.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index e389656b2a6b..d8a6a56a1571 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -339,17 +339,6 @@ void ceph_con_close(struct ceph_connection *con)
 	queue_con(con);
 }
 
-/*
- * clean up connection state
- */
-void ceph_con_shutdown(struct ceph_connection *con)
-{
-	dout("con_shutdown %p\n", con);
-	reset_connection(con);
-	set_bit(DEAD, &con->state);
-	con_close_socket(con); /* silently ignore errors */
-}
-
 /*
  * Reopen a closed connection, with a new peer address.
  */
@@ -380,7 +369,7 @@ void ceph_con_put(struct ceph_connection *con)
 	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
 	BUG_ON(atomic_read(&con->nref) == 0);
 	if (atomic_dec_and_test(&con->nref)) {
-		ceph_con_shutdown(con);
+		BUG_ON(con->sock);
 		kfree(con);
 	}
 }
-- 
cgit v1.2.2


From 5f44f142601bf94c448e2d463f0f18fd159da164 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 18 Nov 2009 14:52:18 -0800
Subject: ceph: handle errors during osd client init

Unwind initializing if we get ENOMEM during client initialization.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c |  3 ++-
 fs/ceph/mds_client.h |  2 +-
 fs/ceph/osd_client.c | 15 +++++++++++----
 fs/ceph/super.c      |  6 +++++-
 4 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fdecf9984180..69feeb1c9819 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2552,7 +2552,7 @@ static void delayed_work(struct work_struct *work)
 }
 
 
-void ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
+int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 {
 	mdsc->client = client;
 	mutex_init(&mdsc->mutex);
@@ -2582,6 +2582,7 @@ void ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
 	spin_lock_init(&mdsc->dentry_lru_lock);
 	INIT_LIST_HEAD(&mdsc->dentry_lru);
+	return 0;
 }
 
 /*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 0751b821f231..7c439488cfab 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -282,7 +282,7 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
 			     struct ceph_msg *msg, int mds);
 
-extern void ceph_mdsc_init(struct ceph_mds_client *mdsc,
+extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
 			   struct ceph_client *client);
 extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
 extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index bcb9fe693076..0a16c4f951f9 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1127,19 +1127,26 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->num_requests = 0;
 	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
 
+	err = -ENOMEM;
 	osdc->req_mempool = mempool_create_kmalloc_pool(10,
 					sizeof(struct ceph_osd_request));
 	if (!osdc->req_mempool)
-		return -ENOMEM;
+		goto out;
 
 	err = ceph_msgpool_init(&osdc->msgpool_op, 4096, 10, true);
 	if (err < 0)
-		return -ENOMEM;
+		goto out_mempool;
 	err = ceph_msgpool_init(&osdc->msgpool_op_reply, 512, 0, false);
 	if (err < 0)
-		return -ENOMEM;
-
+		goto out_msgpool;
 	return 0;
+
+out_msgpool:
+	ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+	mempool_destroy(osdc->req_mempool);
+out:
+	return err;
 }
 
 void ceph_osdc_stop(struct ceph_osd_client *osdc)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 1ac7b07214f3..fe0a5962a082 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -530,9 +530,13 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 	err = ceph_osdc_init(&client->osdc, client);
 	if (err < 0)
 		goto fail_monc;
-	ceph_mdsc_init(&client->mdsc, client);
+	err = ceph_mdsc_init(&client->mdsc, client);
+	if (err < 0)
+		goto fail_osdc;
 	return client;
 
+fail_osdc:
+	ceph_osdc_stop(&client->osdc);
 fail_monc:
 	ceph_monc_stop(&client->monc);
 fail_trunc_wq:
-- 
cgit v1.2.2


From 4e7a5dcd1bbab6560fbc8ada29a840e7a20ed7bc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 18 Nov 2009 16:19:57 -0800
Subject: ceph: negotiate authentication protocol; implement AUTH_NONE protocol

When we open a monitor session, we send an initial AUTH message listing
the auth protocols we support, our entity name, and (possibly) a previously
assigned global_id.  The monitor chooses a protocol and responds with an
initial message.

Initially implement AUTH_NONE, a dummy protocol that provides no security,
but works within the new framework.  It generates 'authorizers' that are
used when connecting to (mds, osd) services that simply state our entity
name and global_id.

This is a wire protocol change.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/Makefile       |   1 +
 fs/ceph/auth.c         | 220 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/auth.h         |  77 +++++++++++++++++
 fs/ceph/auth_none.c    | 120 +++++++++++++++++++++++++++
 fs/ceph/auth_none.h    |  28 +++++++
 fs/ceph/ceph_fs.h      |  14 +++-
 fs/ceph/ceph_strings.c |  13 +++
 fs/ceph/decode.h       |   6 ++
 fs/ceph/mds_client.c   |  69 +++++++++++++++-
 fs/ceph/mds_client.h   |   4 +
 fs/ceph/messenger.c    |  54 +++++++++++-
 fs/ceph/messenger.h    |  10 +++
 fs/ceph/mon_client.c   | 210 +++++++++++++++++++++++++++++++++++++---------
 fs/ceph/mon_client.h   |  14 ++--
 fs/ceph/msgr.h         |  21 +++--
 fs/ceph/osd_client.c   |  63 +++++++++++++-
 fs/ceph/osd_client.h   |   4 +
 fs/ceph/rados.h        |   2 +-
 fs/ceph/super.c        |  32 ++++---
 fs/ceph/super.h        |   2 +
 20 files changed, 888 insertions(+), 76 deletions(-)
 create mode 100644 fs/ceph/auth.c
 create mode 100644 fs/ceph/auth.h
 create mode 100644 fs/ceph/auth_none.c
 create mode 100644 fs/ceph/auth_none.h

(limited to 'fs')

diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index bdd3e6fc1609..827629c85768 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -13,6 +13,7 @@ ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
 	mon_client.o \
 	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
 	debugfs.o \
+	auth.o auth_none.o \
 	ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
 
 else
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..c4d1eee827a3
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,220 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/err.h>
+
+#include "types.h"
+#include "auth_none.h"
+#include "decode.h"
+#include "super.h"
+
+#include "messenger.h"
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+	CEPH_AUTH_NONE
+};
+
+int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+	switch (protocol) {
+	case CEPH_AUTH_NONE:
+		return ceph_auth_none_init(ac);
+	default:
+		return -ENOENT;
+	}
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+	struct ceph_auth_client *ac;
+	int ret;
+
+	dout("auth_init name '%s' secret '%s'\n", name, secret);
+
+	ret = -ENOMEM;
+	ac = kzalloc(sizeof(*ac), GFP_NOFS);
+	if (!ac)
+		goto out;
+
+	ac->negotiating = true;
+	if (name)
+		ac->name = name;
+	else
+		ac->name = CEPH_AUTH_NAME_DEFAULT;
+	dout("auth_init name %s secret %s\n", ac->name, secret);
+	ac->secret = secret;
+	return ac;
+
+out:
+	return ERR_PTR(ret);
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+	dout("auth_destroy %p\n", ac);
+	if (ac->ops)
+		ac->ops->destroy(ac);
+	kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+	dout("auth_reset %p\n", ac);
+	if (ac->ops && !ac->negotiating)
+		ac->ops->reset(ac);
+	ac->negotiating = true;
+}
+
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+	int len = strlen(name);
+
+	if (*p + 2*sizeof(u32) + len > end)
+		return -ERANGE;
+	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+	ceph_encode_32(p, len);
+	ceph_encode_copy(p, name, len);
+	return 0;
+}
+
+/*
+ * Initiate protocol negotiation with monitor.  Include entity name
+ * and list supported protocols.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+	struct ceph_mon_request_header *monhdr = buf;
+	void *p = monhdr + 1, *end = buf + len, *lenp;
+	int i, num;
+	int ret;
+
+	dout("auth_build_hello\n");
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, 0);  /* no protocol, yet */
+
+	lenp = p;
+	p += sizeof(u32);
+
+	num = ARRAY_SIZE(supported_protocols);
+	ceph_encode_32(&p, num);
+	for (i = 0; i < num; i++)
+		ceph_encode_32(&p, supported_protocols[i]);
+
+	ret = ceph_entity_name_encode(ac->name, &p, end);
+	if (ret < 0)
+		return ret;
+	ceph_decode_need(&p, end, sizeof(u64), bad);
+	ceph_encode_64(&p, ac->global_id);
+
+	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+	return p - buf;
+
+bad:
+	return -ERANGE;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+			   void *buf, size_t len,
+			   void *reply_buf, size_t reply_len)
+{
+	void *p = buf;
+	void *end = buf + len;
+	int protocol;
+	s32 result;
+	u64 global_id;
+	void *payload, *payload_end;
+	int payload_len;
+	char *result_msg;
+	int result_msg_len;
+	int ret = -EINVAL;
+
+	dout("handle_auth_reply %p %p\n", p, end);
+	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+	protocol = ceph_decode_32(&p);
+	result = ceph_decode_32(&p);
+	global_id = ceph_decode_64(&p);
+	payload_len = ceph_decode_32(&p);
+	payload = p;
+	p += payload_len;
+	ceph_decode_need(&p, end, sizeof(u32), bad);
+	result_msg_len = ceph_decode_32(&p);
+	result_msg = p;
+	p += result_msg_len;
+	if (p != end)
+		goto bad;
+
+	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+	     result_msg, global_id, payload_len);
+
+	payload_end = payload + payload_len;
+
+	if (global_id && ac->global_id != global_id) {
+		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+		ac->global_id = global_id;
+	}
+
+	if (ac->negotiating) {
+		/* set up (new) protocol handler? */
+		if (ac->protocol && ac->protocol != protocol) {
+			ac->ops->destroy(ac);
+			ac->protocol = 0;
+			ac->ops = NULL;
+		}
+		if (ac->protocol != protocol) {
+			ret = ceph_auth_init_protocol(ac, protocol);
+			if (ret) {
+				pr_err("error %d on auth protocol %d init\n",
+				       ret, protocol);
+				goto out;
+			}
+		}
+	}
+
+	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+	if (ret == -EAGAIN) {
+		struct ceph_mon_request_header *monhdr = reply_buf;
+		void *p = reply_buf + 1;
+		void *end = reply_buf + reply_len;
+
+		monhdr->have_version = 0;
+		monhdr->session_mon = cpu_to_le16(-1);
+		monhdr->session_mon_tid = 0;
+
+		ceph_encode_32(&p, ac->protocol);
+
+		ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+		if (ret < 0) {
+			pr_err("error %d building request\n", ret);
+			goto out;
+		}
+		dout(" built request %d bytes\n", ret);
+		ceph_encode_32(&p, ret);
+		return p + ret - reply_buf;
+	} else if (ret) {
+		pr_err("authentication error %d\n", ret);
+		return ret;
+	}
+	return 0;
+
+bad:
+	pr_err("failed to decode auth msg\n");
+out:
+	return ret;
+}
+
+
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..4d8cdf6bb3b6
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,77 @@
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include "types.h"
+#include "buffer.h"
+
+/*
+ * Abstract interface for communicating with the authenticate module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys.  These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_authorizer;
+
+struct ceph_auth_client_ops {
+	/*
+	 * true if we are authenticated and can connect to
+	 * services.
+	 */
+	int (*is_authenticated)(struct ceph_auth_client *ac);
+
+	/*
+	 * build requests and process replies during monitor
+	 * handshake.  if handle_reply returns -EAGAIN, we build
+	 * another request.
+	 */
+	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+	int (*handle_reply)(struct ceph_auth_client *ac, int result,
+			    void *buf, void *end);
+
+	/*
+	 * Create authorizer for connecting to a service, and verify
+	 * the response to authenticate the service.
+	 */
+	int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+				 struct ceph_authorizer **a,
+				 void **buf, size_t *len,
+				 void **reply_buf, size_t *reply_len);
+	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+				       struct ceph_authorizer *a, size_t len);
+	void (*destroy_authorizer)(struct ceph_auth_client *ac,
+				   struct ceph_authorizer *a);
+
+	/* reset when we (re)connect to a monitor */
+	void (*reset)(struct ceph_auth_client *ac);
+
+	void (*destroy)(struct ceph_auth_client *ac);
+};
+
+struct ceph_auth_client {
+	u32 protocol;           /* CEPH_AUTH_* */
+	void *private;          /* for use by protocol implementation */
+	const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */
+
+	bool negotiating;       /* true if negotiating protocol */
+	const char *name;       /* entity name */
+	u64 global_id;          /* our unique id in system */
+	const char *secret;     /* our secret key */
+	unsigned want_keys;     /* which services we want */
+};
+
+extern struct ceph_auth_client *ceph_auth_init(const char *name,
+					       const char *secret);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+				 void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+				  void *buf, size_t len,
+				  void *reply_buf, size_t reply_len);
+extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+
+#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..631017eb7117
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,120 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "auth_none.h"
+#include "auth.h"
+#include "decode.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = true;
+	xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	return !xi->starting;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+			void *buf, void *end)
+{
+	struct ceph_auth_none_info *xi = ac->private;
+
+	xi->starting = false;
+	return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id.  we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_auth_none_info *ai = ac->private;
+	struct ceph_none_authorizer *au = &ai->au;
+	void *p, *end;
+	int ret;
+
+	if (!ai->built_authorizer) {
+		p = au->buf;
+		end = p + sizeof(au->buf);
+		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+		if (ret < 0)
+			goto bad;
+		ceph_decode_need(&p, end, sizeof(u64), bad2);
+		ceph_encode_64(&p, ac->global_id);
+		au->buf_len = p - (void *)au->buf;
+		ai->built_authorizer = true;
+		dout("built authorizer len %d\n", au->buf_len);
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf;
+	*len = au->buf_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+
+bad2:
+	ret = -ERANGE;
+bad:
+	return ret;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	/* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+	.reset = reset,
+	.destroy = destroy,
+	.is_authenticated = is_authenticated,
+	.handle_reply = handle_reply,
+	.create_authorizer = ceph_auth_none_create_authorizer,
+	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+	struct ceph_auth_none_info *xi;
+
+	dout("ceph_auth_none_init %p\n", ac);
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		return -ENOMEM;
+
+	xi->starting = true;
+	xi->built_authorizer = false;
+
+	ac->protocol = CEPH_AUTH_NONE;
+	ac->private = xi;
+	ac->ops = &ceph_auth_none_ops;
+	return 0;
+}
+
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include "auth.h"
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+	char buf[128];
+	int buf_len;
+	char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+	bool starting;
+	bool built_authorizer;
+	struct ceph_none_authorizer au;   /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 36becb024788..1e96a9a87d8d 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -75,6 +75,16 @@ struct ceph_file_layout {
 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 
 
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES  0x1
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN	0x0
+#define CEPH_AUTH_NONE	 	0x1
+#define CEPH_AUTH_CEPHX	 	0x2
+
+
 /*********************************************
  * message layer
  */
@@ -90,12 +100,12 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 /* client <-> monitor */
 #define CEPH_MSG_MON_MAP                4
 #define CEPH_MSG_MON_GET_MAP            5
-#define CEPH_MSG_CLIENT_MOUNT           10
-#define CEPH_MSG_CLIENT_MOUNT_ACK       11
 #define CEPH_MSG_STATFS                 13
 #define CEPH_MSG_STATFS_REPLY           14
 #define CEPH_MSG_MON_SUBSCRIBE          15
 #define CEPH_MSG_MON_SUBSCRIBE_ACK      16
+#define CEPH_MSG_AUTH			17
+#define CEPH_MSG_AUTH_REPLY		18
 
 /* client <-> mds */
 #define CEPH_MSG_MDS_MAP                21
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 90d19d9d8d8f..8e4be6a80c62 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -3,6 +3,19 @@
  */
 #include "types.h"
 
+const char *ceph_entity_type_name(int type)
+{
+	switch (type) {
+	case CEPH_ENTITY_TYPE_MDS: return "mds";
+	case CEPH_ENTITY_TYPE_OSD: return "osd";
+	case CEPH_ENTITY_TYPE_MON: return "mon";
+	case CEPH_ENTITY_TYPE_CLIENT: return "client";
+	case CEPH_ENTITY_TYPE_ADMIN: return "admin";
+	case CEPH_ENTITY_TYPE_AUTH: return "auth";
+	default: return "unknown";
+	}
+}
+
 const char *ceph_osd_op_name(int op)
 {
 	switch (op) {
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index a382aecc55bb..10de84896244 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -98,6 +98,7 @@ static inline void ceph_encode_addr(struct ceph_entity_addr *a)
 static inline void ceph_decode_addr(struct ceph_entity_addr *a)
 {
 	a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
+	WARN_ON(a->in_addr.ss_family == 512);
 }
 
 /*
@@ -123,6 +124,11 @@ static inline void ceph_encode_8(void **p, u8 v)
 	*(u8 *)*p = v;
 	(*p)++;
 }
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+	memcpy(*p, s, len);
+	*p += len;
+}
 
 /*
  * filepath, string encoders
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 69feeb1c9819..8a285158aecc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -8,6 +8,7 @@
 #include "super.h"
 #include "messenger.h"
 #include "decode.h"
+#include "auth.h"
 
 /*
  * A cluster of MDS (metadata server) daemons is responsible for
@@ -274,8 +275,12 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
 {
 	dout("mdsc put_session %p %d -> %d\n", s,
 	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
-	if (atomic_dec_and_test(&s->s_ref))
+	if (atomic_dec_and_test(&s->s_ref)) {
+		if (s->s_authorizer)
+			s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
+				s->s_mdsc->client->monc.auth, s->s_authorizer);
 		kfree(s);
+	}
 }
 
 /*
@@ -2777,9 +2782,15 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_fsid_compare(&fsid, &mdsc->client->monc.monmap->fsid)) {
-		pr_err("got mdsmap with wrong fsid\n");
-		return;
+        if (mdsc->client->monc.have_fsid) {
+		if (ceph_fsid_compare(&fsid,
+				      &mdsc->client->monc.monmap->fsid)) {
+			pr_err("got mdsmap with wrong fsid\n");
+			return;
+		}
+	} else {
+		ceph_fsid_set(&mdsc->client->monc.monmap->fsid, &fsid);
+		mdsc->client->monc.have_fsid = true;
 	}
 	epoch = ceph_decode_32(&p);
 	maplen = ceph_decode_32(&p);
@@ -2895,10 +2906,60 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 	ceph_msg_put(msg);
 }
 
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+			  void **buf, int *len, int *proto,
+			  void **reply_buf, int *reply_len, int force_new)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+	int ret = 0;
+
+	if (force_new && s->s_authorizer) {
+		ac->ops->destroy_authorizer(ac, s->s_authorizer);
+		s->s_authorizer = NULL;
+	}
+	if (s->s_authorizer == NULL) {
+		if (ac->ops->create_authorizer) {
+			ret = ac->ops->create_authorizer(
+				ac, CEPH_ENTITY_TYPE_MDS,
+				&s->s_authorizer,
+				&s->s_authorizer_buf,
+				&s->s_authorizer_buf_len,
+				&s->s_authorizer_reply_buf,
+				&s->s_authorizer_reply_buf_len);
+			if (ret)
+				return ret;
+		}
+	}
+
+	*proto = ac->protocol;
+	*buf = s->s_authorizer_buf;
+	*len = s->s_authorizer_buf_len;
+	*reply_buf = s->s_authorizer_reply_buf;
+	*reply_len = s->s_authorizer_reply_buf_len;
+	return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+
+	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
+}
+
 const static struct ceph_connection_operations mds_con_ops = {
 	.get = con_get,
 	.put = con_put,
 	.dispatch = dispatch,
+	.get_authorizer = get_authorizer,
+	.verify_authorizer_reply = verify_authorizer_reply,
 	.peer_reset = peer_reset,
 	.alloc_msg = ceph_alloc_msg,
 	.alloc_middle = ceph_alloc_middle,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 7c439488cfab..9faa1b2f79a7 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -100,6 +100,10 @@ struct ceph_mds_session {
 
 	struct ceph_connection s_con;
 
+	struct ceph_authorizer *s_authorizer;
+	void             *s_authorizer_buf, *s_authorizer_reply_buf;
+	size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len;
+
 	/* protected by s_cap_lock */
 	spinlock_t        s_cap_lock;
 	u32               s_cap_gen;  /* inc each time we get mds stale msg */
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index d8a6a56a1571..0b1674854b25 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -550,6 +550,27 @@ static void prepare_write_keepalive(struct ceph_connection *con)
  * Connection negotiation.
  */
 
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+	void *auth_buf;
+	int auth_len = 0;
+	int auth_protocol = 0;
+
+	if (con->ops->get_authorizer)
+		con->ops->get_authorizer(con, &auth_buf, &auth_len,
+					 &auth_protocol, &con->auth_reply_buf,
+					 &con->auth_reply_buf_len,
+					 con->auth_retry);
+
+	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+	con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+	con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+	con->out_kvec_left++;
+	con->out_kvec_bytes += auth_len;
+}
+
 /*
  * We connected to a peer and are saying hello.
  */
@@ -592,6 +613,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 
 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
 	     con->connect_seq, global_seq, proto);
+
 	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
 	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
 	con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -611,6 +633,8 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 	con->out_kvec_cur = con->out_kvec;
 	con->out_more = 0;
 	set_bit(WRITE_PENDING, &con->state);
+
+	prepare_connect_authorizer(con);
 }
 
 
@@ -777,6 +801,13 @@ static void prepare_read_connect(struct ceph_connection *con)
 	con->in_base_pos = 0;
 }
 
+static void prepare_read_connect_retry(struct ceph_connection *con)
+{
+	dout("prepare_read_connect_retry %p\n", con);
+	con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
+		+ sizeof(con->peer_addr_for_me);
+}
+
 static void prepare_read_ack(struct ceph_connection *con)
 {
 	dout("prepare_read_ack %p\n", con);
@@ -853,9 +884,14 @@ static int read_partial_connect(struct ceph_connection *con)
 	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
 	if (ret <= 0)
 		goto out;
+	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+			   con->auth_reply_buf);
+	if (ret <= 0)
+		goto out;
 
-	dout("read_partial_connect %p connect_seq = %u, global_seq = %u\n",
-	     con, le32_to_cpu(con->in_reply.connect_seq),
+	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+	     con, (int)con->in_reply.tag,
+	     le32_to_cpu(con->in_reply.connect_seq),
 	     le32_to_cpu(con->in_reply.global_seq));
 out:
 	return ret;
@@ -1051,6 +1087,20 @@ static int process_connect(struct ceph_connection *con)
 		set_bit(CLOSED, &con->state);  /* in case there's queued work */
 		return -1;
 
+	case CEPH_MSGR_TAG_BADAUTHORIZER:
+		con->auth_retry++;
+		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+		     con->auth_retry);
+		if (con->auth_retry == 2) {
+			con->error_msg = "connect authorization failure";
+			reset_connection(con);
+			set_bit(CLOSED, &con->state);
+			return -1;
+		}
+		con->auth_retry = 1;
+		prepare_write_connect(con->msgr, con, 0);
+		prepare_read_connect_retry(con);
+		break;
 
 	case CEPH_MSGR_TAG_RESETSESSION:
 		/*
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 4bd85c36308e..f9c9f6487302 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -26,6 +26,12 @@ struct ceph_connection_operations {
 	/* handle an incoming message. */
 	void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
 
+	/* authorize an outgoing connection */
+	int (*get_authorizer) (struct ceph_connection *con,
+			       void **buf, int *len, int *proto,
+			       void **reply_buf, int *reply_len, int force_new);
+	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+
 	/* protocol version mismatch */
 	void (*bad_proto) (struct ceph_connection *con);
 
@@ -144,6 +150,10 @@ struct ceph_connection {
 				 attempt for this connection, client */
 	u32 peer_global_seq;  /* peer's global seq for this connection */
 
+	int auth_retry;       /* true if we need a newer authorizer */
+	void *auth_reply_buf;   /* where to put the authorizer reply */
+	int auth_reply_buf_len;
+
 	/* out queue */
 	struct mutex out_mutex;
 	struct list_head out_queue;
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 95b76e761e18..017d5aef8834 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -6,6 +6,7 @@
 
 #include "mon_client.h"
 #include "super.h"
+#include "auth.h"
 #include "decode.h"
 
 /*
@@ -38,6 +39,10 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
 	struct ceph_fsid fsid;
 	u32 epoch, num_mon;
 	u16 version;
+	u32 len;
+
+	ceph_decode_32_safe(&p, end, len, bad);
+	ceph_decode_need(&p, end, len, bad);
 
 	dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
 
@@ -95,8 +100,10 @@ static void __close_session(struct ceph_mon_client *monc)
 {
 	if (monc->con) {
 		dout("__close_session closing mon%d\n", monc->cur_mon);
+		ceph_con_revoke(monc->con, monc->m_auth);
 		ceph_con_close(monc->con);
 		monc->cur_mon = -1;
+		ceph_auth_reset(monc->auth);
 	}
 }
 
@@ -106,6 +113,7 @@ static void __close_session(struct ceph_mon_client *monc)
 static int __open_session(struct ceph_mon_client *monc)
 {
 	char r;
+	int ret;
 
 	if (monc->cur_mon < 0) {
 		get_random_bytes(&r, 1);
@@ -121,6 +129,15 @@ static int __open_session(struct ceph_mon_client *monc)
 		monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
 		ceph_con_open(monc->con,
 			      &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+		/* initiatiate authentication handshake */
+		ret = ceph_auth_build_hello(monc->auth,
+					    monc->m_auth->front.iov_base,
+					    monc->m_auth->front_max);
+		monc->m_auth->front.iov_len = ret;
+		monc->m_auth->hdr.front_len = cpu_to_le32(ret);
+		ceph_msg_get(monc->m_auth);  /* keep our ref */
+		ceph_con_send(monc->con, monc->m_auth);
 	} else {
 		dout("open_session mon%d already open\n", monc->cur_mon);
 	}
@@ -139,7 +156,7 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
 {
 	unsigned delay;
 
-	if (monc->cur_mon < 0 || monc->want_mount || __sub_expired(monc))
+	if (monc->cur_mon < 0 || __sub_expired(monc))
 		delay = 10 * HZ;
 	else
 		delay = 20 * HZ;
@@ -161,7 +178,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 		struct ceph_mon_subscribe_item *i;
 		void *p, *end;
 
-		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 64, 0, 0, NULL);
+		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
 		if (!msg)
 			return;
 
@@ -173,7 +190,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 		if (monc->want_next_osdmap) {
 			dout("__send_subscribe to 'osdmap' %u\n",
 			     (unsigned)monc->have_osdmap);
-			ceph_encode_32(&p, 2);
+			ceph_encode_32(&p, 3);
 			ceph_encode_string(&p, end, "osdmap", 6);
 			i = p;
 			i->have = cpu_to_le64(monc->have_osdmap);
@@ -181,13 +198,18 @@ static void __send_subscribe(struct ceph_mon_client *monc)
 			p += sizeof(*i);
 			monc->want_next_osdmap = 2;  /* requested */
 		} else {
-			ceph_encode_32(&p, 1);
+			ceph_encode_32(&p, 2);
 		}
 		ceph_encode_string(&p, end, "mdsmap", 6);
 		i = p;
 		i->have = cpu_to_le64(monc->have_mdsmap);
 		i->onetime = 0;
 		p += sizeof(*i);
+		ceph_encode_string(&p, end, "monmap", 6);
+		i = p;
+		i->have = 0;
+		i->onetime = 0;
+		p += sizeof(*i);
 
 		msg->front.iov_len = p - msg->front.iov_base;
 		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -256,7 +278,7 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 	mutex_unlock(&monc->mutex);
 }
 
-
+#if 0
 /*
  * mount
  */
@@ -264,12 +286,8 @@ static void __request_mount(struct ceph_mon_client *monc)
 {
 	struct ceph_msg *msg;
 	struct ceph_client_mount *h;
-	int err;
 
 	dout("__request_mount\n");
-	err = __open_session(monc);
-	if (err)
-		return;
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_MOUNT, sizeof(*h), 0, 0, NULL);
 	if (IS_ERR(msg))
 		return;
@@ -279,8 +297,12 @@ static void __request_mount(struct ceph_mon_client *monc)
 	h->monhdr.session_mon_tid = 0;
 	ceph_con_send(monc->con, msg);
 }
+#endif
 
-int ceph_monc_request_mount(struct ceph_mon_client *monc)
+/*
+ * 
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
 {
 	if (!monc->con) {
 		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
@@ -292,12 +314,14 @@ int ceph_monc_request_mount(struct ceph_mon_client *monc)
 	}
 
 	mutex_lock(&monc->mutex);
-	__request_mount(monc);
+	__open_session(monc);
 	__schedule_delayed(monc);
 	mutex_unlock(&monc->mutex);
 	return 0;
 }
 
+#if 0
+
 /*
  * The monitor responds with mount ack indicate mount success.  The
  * included client ticket allows the client to talk to MDSs and OSDs.
@@ -372,9 +396,65 @@ out:
 	mutex_unlock(&monc->mutex);
 	wake_up(&client->mount_wq);
 }
+#endif
+
+/*
+ * The monitor responds with mount ack indicate mount success.  The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg)
+{
+	struct ceph_client *client = monc->client;
+	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+	void *p, *end;
+
+	mutex_lock(&monc->mutex);
+
+	dout("handle_monmap\n");
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	monmap = ceph_monmap_decode(p, end);
+	if (IS_ERR(monmap)) {
+		pr_err("problem decoding monmap, %d\n",
+		       (int)PTR_ERR(monmap));
+		return;
+	}
+	if (monc->have_fsid &&
+	    ceph_fsid_compare(&monmap->fsid, &monc->monmap->fsid)) {
+		print_hex_dump(KERN_ERR, "monmap->fsid: ", DUMP_PREFIX_NONE, 16, 1,
+			       (void *)&monmap->fsid, 16, 0);
+		print_hex_dump(KERN_ERR, "monc->monmap->fsid: ", DUMP_PREFIX_NONE, 16, 1,
+			       (void *)&monc->monmap->fsid, 16, 0);
+
+		pr_err("fsid mismatch, got a previous map with different fsid");
+		kfree(monmap);
+		return;
+	}
+
+	client->monc.monmap = monmap;
+	client->monc.have_fsid = true;
+	kfree(old);
+
+	mutex_unlock(&monc->mutex);
+	wake_up(&client->mount_wq);
+}
+
 
+/*
+ * init client info after authentication
+ */
+static void __init_authenticated_client(struct ceph_mon_client *monc)
+{
+	struct ceph_client *client = monc->client;
 
+	client->signed_ticket = NULL;
+	client->signed_ticket_len = 0;
+	client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+	client->msgr->inst.name.num = monc->auth->global_id;
 
+	ceph_debugfs_client_init(client);
+}
 
 /*
  * statfs
@@ -414,12 +494,8 @@ static int send_statfs(struct ceph_mon_client *monc,
 {
 	struct ceph_msg *msg;
 	struct ceph_mon_statfs *h;
-	int err;
 
 	dout("send_statfs tid %llu\n", req->tid);
-	err = __open_session(monc);
-	if (err)
-		return err;
 	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
 	if (IS_ERR(msg))
 		return PTR_ERR(msg);
@@ -514,17 +590,14 @@ static void delayed_work(struct work_struct *work)
 
 	dout("monc delayed_work\n");
 	mutex_lock(&monc->mutex);
-	if (monc->want_mount) {
-		__request_mount(monc);
+	if (monc->hunting) {
+		__close_session(monc);
+		__open_session(monc);  /* continue hunting */
 	} else {
-		if (monc->hunting) {
-			__close_session(monc);
-			__open_session(monc);  /* continue hunting */
-		} else {
-			ceph_con_keepalive(monc->con);
-		}
+		ceph_con_keepalive(monc->con);
+		if (monc->auth->ops->is_authenticated(monc->auth))
+			__send_subscribe(monc);
 	}
-	__send_subscribe(monc);
 	__schedule_delayed(monc);
 	mutex_unlock(&monc->mutex);
 }
@@ -555,6 +628,7 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
 	}
 	monc->monmap->num_mon = num_mon;
+	monc->have_fsid = false;
 
 	/* release addr memory */
 	kfree(args->mon_addr);
@@ -579,21 +653,37 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
 	monc->con = NULL;
 
+	/* authentication */
+	monc->auth = ceph_auth_init(cl->mount_args->name,
+				    cl->mount_args->secret);
+	if (IS_ERR(monc->auth))
+		return PTR_ERR(monc->auth);
+	monc->auth->want_keys =
+		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
 	/* msg pools */
-	err = ceph_msgpool_init(&monc->msgpool_mount_ack, 4096, 1, false);
-	if (err < 0)
-		goto out;
 	err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
 			       sizeof(struct ceph_mon_subscribe_ack), 1, false);
 	if (err < 0)
-		goto out;
+		goto out_monmap;
 	err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
 				sizeof(struct ceph_mon_statfs_reply), 0, false);
 	if (err < 0)
-		goto out;
+		goto out_pool1;
+	err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
+	if (err < 0)
+		goto out_pool2;
+
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+	if (IS_ERR(monc->m_auth)) {
+		err = PTR_ERR(monc->m_auth);
+		monc->m_auth = NULL;
+		goto out_pool3;
+	}
 
 	monc->cur_mon = -1;
-	monc->hunting = false;  /* not really */
+	monc->hunting = true;
 	monc->sub_renew_after = jiffies;
 	monc->sub_sent = 0;
 
@@ -605,7 +695,16 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->have_mdsmap = 0;
 	monc->have_osdmap = 0;
 	monc->want_next_osdmap = 1;
-	monc->want_mount = true;
+	return 0;
+
+out_pool3:
+	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+out_pool2:
+	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+out_pool1:
+	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+out_monmap:
+	kfree(monc->monmap);
 out:
 	return err;
 }
@@ -624,14 +723,44 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 	}
 	mutex_unlock(&monc->mutex);
 
-	ceph_msgpool_destroy(&monc->msgpool_mount_ack);
+	ceph_auth_destroy(monc->auth);
+
+	ceph_msg_put(monc->m_auth);
 	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
 	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
 
 	kfree(monc->monmap);
 }
 
 
+static void handle_auth_reply(struct ceph_mon_client *monc,
+			      struct ceph_msg *msg)
+{
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+				     msg->front.iov_len,
+				     monc->m_auth->front.iov_base,
+				     monc->m_auth->front_max);
+	if (ret < 0) {
+		monc->client->mount_err = ret;
+		wake_up(&monc->client->mount_wq);
+	} else if (ret > 0) {
+		monc->m_auth->front.iov_len = ret;
+		monc->m_auth->hdr.front_len = cpu_to_le32(ret);
+		ceph_msg_get(monc->m_auth);  /* keep our ref */
+		ceph_con_send(monc->con, monc->m_auth);
+	} else if (monc->auth->ops->is_authenticated(monc->auth)) {
+		dout("authenticated, starting session\n");
+		__init_authenticated_client(monc);
+		__send_subscribe(monc);
+		__resend_statfs(monc);
+	}
+	mutex_unlock(&monc->mutex);
+}
+
 /*
  * handle incoming message
  */
@@ -644,8 +773,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		return;
 
 	switch (type) {
-	case CEPH_MSG_CLIENT_MOUNT_ACK:
-		handle_mount_ack(monc, msg);
+	case CEPH_MSG_AUTH_REPLY:
+		handle_auth_reply(monc, msg);
 		break;
 
 	case CEPH_MSG_MON_SUBSCRIBE_ACK:
@@ -656,6 +785,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		handle_statfs_reply(monc, msg);
 		break;
 
+	case CEPH_MSG_MON_MAP:
+		ceph_monc_handle_map(monc, msg);
+		break;
+
 	case CEPH_MSG_MDS_MAP:
 		ceph_mdsc_handle_map(&monc->client->mdsc, msg);
 		break;
@@ -682,12 +815,12 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	int front = le32_to_cpu(hdr->front_len);
 
 	switch (type) {
-	case CEPH_MSG_CLIENT_MOUNT_ACK:
-		return ceph_msgpool_get(&monc->msgpool_mount_ack, front);
 	case CEPH_MSG_MON_SUBSCRIBE_ACK:
 		return ceph_msgpool_get(&monc->msgpool_subscribe_ack, front);
 	case CEPH_MSG_STATFS_REPLY:
 		return ceph_msgpool_get(&monc->msgpool_statfs_reply, front);
+	case CEPH_MSG_AUTH_REPLY:
+		return ceph_msgpool_get(&monc->msgpool_auth_reply, front);
 	}
 	return ceph_alloc_msg(con, hdr);
 }
@@ -717,10 +850,7 @@ static void mon_fault(struct ceph_connection *con)
 	if (!monc->hunting) {
 		/* start hunting */
 		monc->hunting = true;
-		if (__open_session(monc) == 0) {
-			__send_subscribe(monc);
-			__resend_statfs(monc);
-		}
+		__open_session(monc);
 	} else {
 		/* already hunting, let's wait a bit */
 		__schedule_delayed(monc);
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 9f6db45bf469..c75b53302ecc 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -9,6 +9,7 @@
 
 struct ceph_client;
 struct ceph_mount_args;
+struct ceph_auth_client;
 
 /*
  * The monitor map enumerates the set of all monitors.
@@ -58,23 +59,26 @@ struct ceph_mon_client {
 	struct mutex mutex;
 	struct delayed_work delayed_work;
 
+	struct ceph_auth_client *auth;
+	struct ceph_msg *m_auth;
+
 	bool hunting;
 	int cur_mon;                       /* last monitor i contacted */
 	unsigned long sub_sent, sub_renew_after;
 	struct ceph_connection *con;
+	bool have_fsid;
 
 	/* msg pools */
-	struct ceph_msgpool msgpool_mount_ack;
 	struct ceph_msgpool msgpool_subscribe_ack;
 	struct ceph_msgpool msgpool_statfs_reply;
+	struct ceph_msgpool msgpool_auth_reply;
 
 	/* pending statfs requests */
 	struct radix_tree_root statfs_request_tree;
 	int num_statfs_requests;
 	u64 last_tid;
 
-	/* mds/osd map or mount requests */
-	bool want_mount;
+	/* mds/osd map */
 	int want_next_osdmap; /* 1 = want, 2 = want+asked */
 	u32 have_osdmap, have_mdsmap;
 
@@ -101,11 +105,11 @@ extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
 
 extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
 
-extern int ceph_monc_request_mount(struct ceph_mon_client *monc);
-
 extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
 			       struct ceph_statfs *buf);
 
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
 
 
 #endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8e3ea2eb1bf5..c758e8f8f71b 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -21,7 +21,7 @@
  * whenever the wire protocol changes.  try to keep this string length
  * constant.
  */
-#define CEPH_BANNER "ceph v023"
+#define CEPH_BANNER "ceph v024"
 #define CEPH_BANNER_MAX_LEN 30
 
 
@@ -46,11 +46,16 @@ struct ceph_entity_name {
 	__le64 num;
 } __attribute__ ((packed));
 
-#define CEPH_ENTITY_TYPE_MON    1
-#define CEPH_ENTITY_TYPE_MDS    2
-#define CEPH_ENTITY_TYPE_OSD    3
-#define CEPH_ENTITY_TYPE_CLIENT 4
-#define CEPH_ENTITY_TYPE_ADMIN  5
+#define CEPH_ENTITY_TYPE_MON    0x01
+#define CEPH_ENTITY_TYPE_MDS    0x02
+#define CEPH_ENTITY_TYPE_OSD    0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_ADMIN  0x10
+#define CEPH_ENTITY_TYPE_AUTH   0x20
+
+#define CEPH_ENTITY_TYPE_ANY    0xFF
+
+extern const char *ceph_entity_type_name(int type);
 
 /*
  * entity_addr -- network address
@@ -94,6 +99,7 @@ struct ceph_entity_inst {
 #define CEPH_MSGR_TAG_ACK           8  /* message ack */
 #define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
 #define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
 
 
 /*
@@ -104,6 +110,8 @@ struct ceph_msg_connect {
 	__le32 global_seq;   /* count connections initiated by this host */
 	__le32 connect_seq;  /* count connections initiated in this session */
 	__le32 protocol_version;
+	__le32 authorizer_protocol;
+	__le32 authorizer_len;
 	__u8  flags;         /* CEPH_MSG_CONNECT_* */
 } __attribute__ ((packed));
 
@@ -112,6 +120,7 @@ struct ceph_msg_connect_reply {
 	__le32 global_seq;
 	__le32 connect_seq;
 	__le32 protocol_version;
+	__le32 authorizer_len;
 	__u8 flags;
 } __attribute__ ((packed));
 
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 0a16c4f951f9..ca0ee68c322a 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -11,6 +11,7 @@
 #include "osd_client.h"
 #include "messenger.h"
 #include "decode.h"
+#include "auth.h"
 
 const static struct ceph_connection_operations osd_con_ops;
 
@@ -331,6 +332,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
 	osd->o_con.private = osd;
 	osd->o_con.ops = &osd_con_ops;
 	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
 	return osd;
 }
 
@@ -880,9 +882,15 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	/* verify fsid */
 	ceph_decode_need(&p, end, sizeof(fsid), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	if (ceph_fsid_compare(&fsid, &osdc->client->monc.monmap->fsid)) {
-		pr_err("got osdmap with wrong fsid, ignoring\n");
-		return;
+        if (osdc->client->monc.have_fsid) {
+		if (ceph_fsid_compare(&fsid,
+			              &osdc->client->monc.monmap->fsid)) {
+			pr_err("got osdmap with wrong fsid, ignoring\n");
+			return;
+		}
+	} else {
+		ceph_fsid_set(&osdc->client->monc.monmap->fsid, &fsid);
+		osdc->client->monc.have_fsid = true;
 	}
 
 	down_write(&osdc->map_sem);
@@ -1302,10 +1310,59 @@ static void put_osd_con(struct ceph_connection *con)
 	put_osd(osd);
 }
 
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+                          void **buf, int *len, int *proto,
+                          void **reply_buf, int *reply_len, int force_new)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+	int ret = 0;
+
+	if (force_new && o->o_authorizer) {
+		ac->ops->destroy_authorizer(ac, o->o_authorizer);
+		o->o_authorizer = NULL;
+	}
+	if (o->o_authorizer == NULL) {
+		ret = ac->ops->create_authorizer(
+			ac, CEPH_ENTITY_TYPE_OSD,
+			&o->o_authorizer,
+			&o->o_authorizer_buf,
+			&o->o_authorizer_buf_len,
+			&o->o_authorizer_reply_buf,
+			&o->o_authorizer_reply_buf_len);
+		if (ret)
+		return ret;
+	}
+
+	*proto = ac->protocol;
+	*buf = o->o_authorizer_buf;
+	*len = o->o_authorizer_buf_len;
+	*reply_buf = o->o_authorizer_reply_buf;
+	*reply_len = o->o_authorizer_reply_buf_len;
+	return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+
 const static struct ceph_connection_operations osd_con_ops = {
 	.get = get_osd_con,
 	.put = put_osd_con,
 	.dispatch = dispatch,
+	.get_authorizer = get_authorizer,
+	.verify_authorizer_reply = verify_authorizer_reply,
 	.alloc_msg = alloc_msg,
 	.fault = osd_reset,
 	.alloc_middle = ceph_alloc_middle,
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 766c8dc80aff..3d4ae6595aaa 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -13,6 +13,7 @@ struct ceph_msg;
 struct ceph_snap_context;
 struct ceph_osd_request;
 struct ceph_osd_client;
+struct ceph_authorizer;
 
 /*
  * completion callback for async writepages
@@ -29,6 +30,9 @@ struct ceph_osd {
 	struct rb_node o_node;
 	struct ceph_connection o_con;
 	struct list_head o_requests;
+	struct ceph_authorizer *o_authorizer;
+	void *o_authorizer_buf, *o_authorizer_reply_buf;
+	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
 };
 
 /* an in-flight request */
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fb23ff9297c9..12bfb2f7c275 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -157,7 +157,6 @@ struct ceph_eversion {
 #define CEPH_OSD_OP_MODE_WR    0x2000
 #define CEPH_OSD_OP_MODE_RMW   0x3000
 #define CEPH_OSD_OP_MODE_SUB   0x4000
-#define CEPH_OSD_OP_MODE_EXEC  0x8000
 
 #define CEPH_OSD_OP_TYPE       0x0f00
 #define CEPH_OSD_OP_TYPE_LOCK  0x0100
@@ -285,6 +284,7 @@ enum {
 	CEPH_OSD_FLAG_BALANCE_READS = 256,
 	CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
 	CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
+	CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
 };
 
 enum {
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fe0a5962a082..c901395ae8a1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -128,6 +128,8 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 		seq_puts(m, ",noasyncreaddir");
 	if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
 		seq_printf(m, ",snapdirname=%s", args->snapdir_name);
+	if (args->name)
+		seq_printf(m, ",name=%s", args->name);
 	if (args->secret)
 		seq_puts(m, ",secret=<hidden>");
 	return 0;
@@ -224,12 +226,12 @@ const char *ceph_msg_type_name(int type)
 	switch (type) {
 	case CEPH_MSG_SHUTDOWN: return "shutdown";
 	case CEPH_MSG_PING: return "ping";
+	case CEPH_MSG_AUTH: return "auth";
+	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
 	case CEPH_MSG_MON_MAP: return "mon_map";
 	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
 	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
 	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
-	case CEPH_MSG_CLIENT_MOUNT: return "client_mount";
-	case CEPH_MSG_CLIENT_MOUNT_ACK: return "client_mount_ack";
 	case CEPH_MSG_STATFS: return "statfs";
 	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
 	case CEPH_MSG_MDS_MAP: return "mds_map";
@@ -267,6 +269,7 @@ enum {
 	Opt_last_int,
 	/* int args above */
 	Opt_snapdirname,
+	Opt_name,
 	Opt_secret,
 	Opt_last_string,
 	/* string args above */
@@ -293,6 +296,7 @@ static match_table_t arg_tokens = {
 	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
+	{Opt_name, "name=%s"},
 	{Opt_secret, "secret=%s"},
 	/* string args above */
 	{Opt_ip, "ip=%s"},
@@ -407,6 +411,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 					      argstr[0].to-argstr[0].from,
 					      GFP_KERNEL);
 			break;
+		case Opt_name:
+			args->name = kstrndup(argstr[0].from,
+					      argstr[0].to-argstr[0].from,
+					      GFP_KERNEL);
+			break;
 		case Opt_secret:
 			args->secret = kstrndup(argstr[0].from,
 						argstr[0].to-argstr[0].from,
@@ -476,6 +485,8 @@ static void destroy_mount_args(struct ceph_mount_args *args)
 	dout("destroy_mount_args %p\n", args);
 	kfree(args->snapdir_name);
 	args->snapdir_name = NULL;
+	kfree(args->name);
+	args->name = NULL;
 	kfree(args->secret);
 	args->secret = NULL;
 	kfree(args);
@@ -657,27 +668,23 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
 	}
 
-	/* send mount request, and wait for mon, mds, and osd maps */
-	err = ceph_monc_request_mount(&client->monc);
+	/* open session, and wait for mon, mds, and osd maps */
+	err = ceph_monc_open_session(&client->monc);
 	if (err < 0)
 		goto out;
 
-	while (!have_mon_map(client) && !client->mount_err) {
+	while (!have_mon_map(client)) {
 		err = -EIO;
 		if (timeout && time_after_eq(jiffies, started + timeout))
 			goto out;
 
 		/* wait */
-		dout("mount waiting for mount\n");
-		err = wait_event_interruptible_timeout(client->mount_wq,
-			       client->mount_err || have_mon_map(client),
+		dout("mount waiting for mon_map\n");
+		err = wait_event_interruptible_timeout(client->mount_wq, /* FIXME */
+			       have_mon_map(client),
 			       timeout);
 		if (err == -EINTR || err == -ERESTARTSYS)
 			goto out;
-		if (client->mount_err) {
-			err = client->mount_err;
-			goto out;
-		}
 	}
 
 	dout("mount opening root\n");
@@ -795,7 +802,6 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 		client->backing_dev_info.ra_pages =
 			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
-
 	err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
 	return err;
 }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 8aa1ffba6c0d..e0e8130959b6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -61,6 +61,7 @@ struct ceph_mount_args {
 	int max_readdir;      /* max readdir size */
 	int osd_timeout;
 	char *snapdir_name;   /* default ".snap" */
+	char *name;
 	char *secret;
 	int cap_release_safety;
 };
@@ -75,6 +76,7 @@ struct ceph_mount_args {
 #define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)
 
 #define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+#define CEPH_AUTH_NAME_DEFAULT   "guest"
 
 /*
  * Delay telling the MDS we no longer want caps, in case we reopen
-- 
cgit v1.2.2


From b9bfb93ce2b1ef668254f0b9e16fcc5246d65d8e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 18 Nov 2009 15:08:44 -0800
Subject: ceph: move mempool creation to ceph_create_client

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c901395ae8a1..df05617aca86 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -534,10 +534,18 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 	if (client->trunc_wq == NULL)
 		goto fail_pg_inv_wq;
 
+	/* set up mempools */
+	err = -ENOMEM;
+	client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+			      client->mount_args->wsize >> PAGE_CACHE_SHIFT);
+	if (!client->wb_pagevec_pool)
+		goto fail_trunc_wq;
+
+
 	/* subsystems */
 	err = ceph_monc_init(&client->monc, client);
 	if (err < 0)
-		goto fail_trunc_wq;
+		goto fail_mempool;
 	err = ceph_osdc_init(&client->osdc, client);
 	if (err < 0)
 		goto fail_monc;
@@ -550,6 +558,8 @@ fail_osdc:
 	ceph_osdc_stop(&client->osdc);
 fail_monc:
 	ceph_monc_stop(&client->monc);
+fail_mempool:
+	mempool_destroy(client->wb_pagevec_pool);
 fail_trunc_wq:
 	destroy_workqueue(client->trunc_wq);
 fail_pg_inv_wq:
@@ -581,8 +591,7 @@ static void ceph_destroy_client(struct ceph_client *client)
 
 	if (client->msgr)
 		ceph_messenger_destroy(client->msgr);
-	if (client->wb_pagevec_pool)
-		mempool_destroy(client->wb_pagevec_pool);
+	mempool_destroy(client->wb_pagevec_pool);
 
 	destroy_mount_args(client->mount_args);
 
@@ -845,14 +854,6 @@ static int ceph_get_sb(struct file_system_type *fs_type,
 		dout("get_sb got existing client %p\n", client);
 	} else {
 		dout("get_sb using new client %p\n", client);
-
-		/* set up mempools */
-		err = -ENOMEM;
-		client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
-			      client->mount_args->wsize >> PAGE_CACHE_SHIFT);
-		if (!client->wb_pagevec_pool)
-			goto out_splat;
-
 		err = ceph_register_bdi(sb, client);
 		if (err < 0)
 			goto out_splat;
-- 
cgit v1.2.2


From cfea1cf42b614583c02727d5bffd5a2384e92bda Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 18 Nov 2009 16:50:55 -0800
Subject: ceph: small cleanup in hash function

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_hash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
index ac8be54631fe..1c44e43fe89e 100644
--- a/fs/ceph/ceph_hash.c
+++ b/fs/ceph/ceph_hash.c
@@ -85,7 +85,7 @@ unsigned ceph_str_hash_linux(const char *str, unsigned length)
         unsigned long hash = 0;
 	unsigned char c;
 
-        while (length-- > 0) {
+        while (length--) {
 		c = *str++;
 		hash = (hash + (c << 4) + (c >> 4)) * 11;
 	}
-- 
cgit v1.2.2


From 0743304d871559cb4c7c066357de2caa60e94c2f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 18 Nov 2009 16:50:41 -0800
Subject: ceph: fix debugfs entry, simplify fsid checks

We may first learn our fsid from any of the mon, osd, or mds maps
(whichever the monitor sends first).  Consolidate checks in a single
helper.  Initialize the client debugfs entry then, since we need the
fsid (and global_id) for the directory name.

Also remove dead mount code.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c    |   4 +-
 fs/ceph/mds_client.c |  12 +-----
 fs/ceph/mon_client.c | 113 ++++-----------------------------------------------
 fs/ceph/osd_client.c |  12 +-----
 fs/ceph/super.c      |  32 ++++++++++++---
 fs/ceph/super.h      |  18 ++++----
 6 files changed, 51 insertions(+), 140 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 9b2020696680..b90fc3e1ff70 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -7,6 +7,8 @@
 
 #include "super.h"
 #include "mds_client.h"
+#include "mon_client.h"
+#include "auth.h"
 
 #ifdef CONFIG_DEBUG_FS
 
@@ -335,7 +337,7 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 	char name[80];
 
 	snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
-		 PR_FSID(&client->monc.monmap->fsid), client->whoami);
+		 PR_FSID(&client->fsid), client->monc.auth->global_id);
 
 	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
 	if (!client->debugfs_dir)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 8a285158aecc..8d95b0f051e4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2782,16 +2782,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-        if (mdsc->client->monc.have_fsid) {
-		if (ceph_fsid_compare(&fsid,
-				      &mdsc->client->monc.monmap->fsid)) {
-			pr_err("got mdsmap with wrong fsid\n");
-			return;
-		}
-	} else {
-		ceph_fsid_set(&mdsc->client->monc.monmap->fsid, &fsid);
-		mdsc->client->monc.have_fsid = true;
-	}
+	if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+		return;
 	epoch = ceph_decode_32(&p);
 	maplen = ceph_decode_32(&p);
 	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 017d5aef8834..b742b3b3e0f3 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -320,89 +320,12 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
 	return 0;
 }
 
-#if 0
-
-/*
- * The monitor responds with mount ack indicate mount success.  The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
-static void handle_mount_ack(struct ceph_mon_client *monc, struct ceph_msg *msg)
-{
-	struct ceph_client *client = monc->client;
-	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
-	void *p, *end;
-	s32 result;
-	u32 len;
-	s64 cnum;
-	int err = -EINVAL;
-
-	if (client->whoami >= 0) {
-		dout("handle_mount_ack - already mounted\n");
-		return;
-	}
-
-	mutex_lock(&monc->mutex);
-
-	dout("handle_mount_ack\n");
-	p = msg->front.iov_base;
-	end = p + msg->front.iov_len;
-
-	ceph_decode_64_safe(&p, end, cnum, bad);
-	ceph_decode_32_safe(&p, end, result, bad);
-	ceph_decode_32_safe(&p, end, len, bad);
-	if (result) {
-		pr_err("mount denied: %.*s (%d)\n", len, (char *)p,
-		       result);
-		err = result;
-		goto out;
-	}
-	p += len;
-
-	ceph_decode_32_safe(&p, end, len, bad);
-	ceph_decode_need(&p, end, len, bad);
-	monmap = ceph_monmap_decode(p, p + len);
-	if (IS_ERR(monmap)) {
-		pr_err("problem decoding monmap, %d\n",
-		       (int)PTR_ERR(monmap));
-		err = -EINVAL;
-		goto out;
-	}
-	p += len;
-
-	client->monc.monmap = monmap;
-	kfree(old);
-
-	client->signed_ticket = NULL;
-	client->signed_ticket_len = 0;
-
-	monc->want_mount = false;
-
-	client->whoami = cnum;
-	client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-	client->msgr->inst.name.num = cpu_to_le64(cnum);
-	pr_info("client%lld fsid " FSID_FORMAT "\n",
-		client->whoami, PR_FSID(&client->monc.monmap->fsid));
-
-	ceph_debugfs_client_init(client);
-	__send_subscribe(monc);
-
-	err = 0;
-	goto out;
-
-bad:
-	pr_err("error decoding mount_ack message\n");
-out:
-	client->mount_err = err;
-	mutex_unlock(&monc->mutex);
-	wake_up(&client->mount_wq);
-}
-#endif
-
 /*
  * The monitor responds with mount ack indicate mount success.  The
  * included client ticket allows the client to talk to MDSs and OSDs.
  */
-static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg)
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
 {
 	struct ceph_client *client = monc->client;
 	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
@@ -420,42 +343,19 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *
 		       (int)PTR_ERR(monmap));
 		return;
 	}
-	if (monc->have_fsid &&
-	    ceph_fsid_compare(&monmap->fsid, &monc->monmap->fsid)) {
-		print_hex_dump(KERN_ERR, "monmap->fsid: ", DUMP_PREFIX_NONE, 16, 1,
-			       (void *)&monmap->fsid, 16, 0);
-		print_hex_dump(KERN_ERR, "monc->monmap->fsid: ", DUMP_PREFIX_NONE, 16, 1,
-			       (void *)&monc->monmap->fsid, 16, 0);
-
-		pr_err("fsid mismatch, got a previous map with different fsid");
+
+	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
 		kfree(monmap);
 		return;
 	}
 
 	client->monc.monmap = monmap;
-	client->monc.have_fsid = true;
 	kfree(old);
 
 	mutex_unlock(&monc->mutex);
 	wake_up(&client->mount_wq);
 }
 
-
-/*
- * init client info after authentication
- */
-static void __init_authenticated_client(struct ceph_mon_client *monc)
-{
-	struct ceph_client *client = monc->client;
-
-	client->signed_ticket = NULL;
-	client->signed_ticket_len = 0;
-	client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-	client->msgr->inst.name.num = monc->auth->global_id;
-
-	ceph_debugfs_client_init(client);
-}
-
 /*
  * statfs
  */
@@ -754,7 +654,10 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 		ceph_con_send(monc->con, monc->m_auth);
 	} else if (monc->auth->ops->is_authenticated(monc->auth)) {
 		dout("authenticated, starting session\n");
-		__init_authenticated_client(monc);
+
+		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr->inst.name.num = monc->auth->global_id;
+
 		__send_subscribe(monc);
 		__resend_statfs(monc);
 	}
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index ca0ee68c322a..d63f192999ee 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -882,16 +882,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	/* verify fsid */
 	ceph_decode_need(&p, end, sizeof(fsid), bad);
 	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-        if (osdc->client->monc.have_fsid) {
-		if (ceph_fsid_compare(&fsid,
-			              &osdc->client->monc.monmap->fsid)) {
-			pr_err("got osdmap with wrong fsid, ignoring\n");
-			return;
-		}
-	} else {
-		ceph_fsid_set(&osdc->client->monc.monmap->fsid, &fsid);
-		osdc->client->monc.have_fsid = true;
-	}
+	if (ceph_check_fsid(osdc->client, &fsid) < 0)
+		return;
 
 	down_write(&osdc->map_sem);
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index df05617aca86..3df6d4ab236c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -19,6 +19,7 @@
 #include "decode.h"
 #include "super.h"
 #include "mon_client.h"
+#include "auth.h"
 
 /*
  * Ceph superblock operations
@@ -510,14 +511,11 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 
 	client->sb = NULL;
 	client->mount_state = CEPH_MOUNT_MOUNTING;
-	client->whoami = -1;
 	client->mount_args = args;
 
 	client->msgr = NULL;
 
 	client->mount_err = 0;
-	client->signed_ticket = NULL;
-	client->signed_ticket_len = 0;
 
 	err = bdi_init(&client->backing_dev_info);
 	if (err < 0)
@@ -582,8 +580,6 @@ static void ceph_destroy_client(struct ceph_client *client)
 	ceph_monc_stop(&client->monc);
 	ceph_osdc_stop(&client->osdc);
 
-	kfree(client->signed_ticket);
-
 	ceph_debugfs_client_cleanup(client);
 	destroy_workqueue(client->wb_wq);
 	destroy_workqueue(client->pg_inv_wq);
@@ -599,6 +595,32 @@ static void ceph_destroy_client(struct ceph_client *client)
 	dout("destroy_client %p done\n", client);
 }
 
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+	if (client->have_fsid) {
+		if (ceph_fsid_compare(&client->fsid, fsid)) {
+			print_hex_dump(KERN_ERR, "this fsid: ",
+				       DUMP_PREFIX_NONE, 16, 1,
+				       (void *)fsid, 16, 0);
+			print_hex_dump(KERN_ERR, " old fsid: ",
+				       DUMP_PREFIX_NONE, 16, 1,
+				       (void *)&client->fsid, 16, 0);
+			pr_err("fsid mismatch\n");
+			return -1;
+		}
+	} else {
+		pr_info("client%lld fsid " FSID_FORMAT "\n",
+			client->monc.auth->global_id, PR_FSID(fsid));
+		memcpy(&client->fsid, fsid, sizeof(*fsid));
+		ceph_debugfs_client_init(client);
+		client->have_fsid = true;
+	}
+	return 0;
+}
+
 /*
  * true if we have the mon map (and have thus joined the cluster)
  */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e0e8130959b6..de5e32414978 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -113,16 +113,11 @@ static inline unsigned long time_sub(unsigned long a, unsigned long b)
  * mounting the same ceph filesystem/cluster.
  */
 struct ceph_client {
-	__s64 whoami;                   /* my client number */
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *debugfs_monmap;
-	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
-	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
-#endif
+	struct ceph_fsid fsid;
+	bool have_fsid;
 
 	struct mutex mount_mutex;       /* serialize mount attempts */
 	struct ceph_mount_args *mount_args;
-	struct ceph_fsid fsid;
 
 	struct super_block *sb;
 
@@ -130,8 +125,6 @@ struct ceph_client {
 	wait_queue_head_t mount_wq;
 
 	int mount_err;
-	void *signed_ticket;           /* our keys to the kingdom */
-	int signed_ticket_len;
 
 	struct ceph_messenger *msgr;   /* messenger instance */
 	struct ceph_mon_client monc;
@@ -145,6 +138,12 @@ struct ceph_client {
 	struct workqueue_struct *trunc_wq;
 
 	struct backing_dev_info backing_dev_info;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_monmap;
+	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
+	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+#endif
 };
 
 static inline struct ceph_client *ceph_client(struct super_block *sb)
@@ -735,6 +734,7 @@ extern struct kmem_cache *ceph_dentry_cachep;
 extern struct kmem_cache *ceph_file_cachep;
 
 extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
 
 #define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
 	"%02x%02x%02x%02x%02x%02x"
-- 
cgit v1.2.2


From 94045e115ee72aee3b17295791da07078f2f778c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 19 Nov 2009 15:31:50 -0800
Subject: ceph: decode updated mdsmap format

The mds map now uses the global_id as the 'key' (instead of the addr,
which was a poor choice).

This is protocol change.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h |  2 +-
 fs/ceph/mdsmap.c  | 15 +++++++++------
 fs/ceph/mdsmap.h  |  1 +
 3 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 1e96a9a87d8d..4e5f49c738d8 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -39,7 +39,7 @@
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   22 /* server/client */
-#define CEPH_MDSC_PROTOCOL   29 /* server/client */
+#define CEPH_MDSC_PROTOCOL   30 /* server/client */
 #define CEPH_MONC_PROTOCOL   15 /* server/client */
 
 
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 4226c810ce22..cad8d25861e5 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -76,6 +76,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 	/* pick out active nodes from mds_info (state > 0) */
 	n = ceph_decode_32(p);
 	for (i = 0; i < n; i++) {
+		u64 global_id;
 		u32 namelen;
 		s32 mds, inc, state;
 		u64 state_seq;
@@ -84,10 +85,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		u32 num_export_targets;
 		void *pexport_targets = NULL;
 
-		ceph_decode_need(p, end, sizeof(addr) + 1 + sizeof(u32), bad);
-		ceph_decode_copy(p, &addr, sizeof(addr));
-		ceph_decode_addr(&addr);
+		ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+		global_id = ceph_decode_64(p);
 		infoversion = ceph_decode_8(p);
+		*p += sizeof(u64);
 		namelen = ceph_decode_32(p);  /* skip mds name */
 		*p += namelen;
 
@@ -99,7 +100,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		inc = ceph_decode_32(p);
 		state = ceph_decode_32(p);
 		state_seq = ceph_decode_64(p);
-		*p += sizeof(addr);
+		ceph_decode_copy(p, &addr, sizeof(addr));
+		ceph_decode_addr(&addr);
 		*p += sizeof(struct ceph_timespec);
 		*p += sizeof(u32);
 		ceph_decode_32_safe(p, end, namelen, bad);
@@ -112,10 +114,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 			num_export_targets = 0;
 		}
 
-		dout("mdsmap_decode %d/%d mds%d.%d %s %s\n",
-		     i+1, n, mds, inc, pr_addr(&addr.in_addr),
+		dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+		     i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
 		     ceph_mds_state_name(state));
 		if (mds >= 0 && mds < m->m_max_mds && state > 0) {
+			m->m_info[mds].global_id = global_id;
 			m->m_info[mds].state = state;
 			m->m_info[mds].addr = addr;
 			m->m_info[mds].num_export_targets = num_export_targets;
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index d317308648fb..eacc131aa5cb 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -9,6 +9,7 @@
  * we limit fields to those the client actually xcares about
  */
 struct ceph_mds_info {
+	u64 global_id;
 	struct ceph_entity_addr addr;
 	s32 state;
 	int num_export_targets;
-- 
cgit v1.2.2


From dc14657c9c946f25b84a98e9ffa41b812a70699e Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@newdream.net>
Date: Fri, 20 Nov 2009 13:59:13 -0800
Subject: ceph: mount fails immediately on error

Signed-off-by: Yehuda Sadeh <yehuda@newdream.net>
---
 fs/ceph/auth.c  | 5 +++++
 fs/ceph/super.c | 6 +++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index c4d1eee827a3..32f2e2a021ab 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -169,6 +169,11 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 	}
 
 	if (ac->negotiating) {
+		/* server does not support our protocols? */
+		if (!protocol && result < 0) {
+			ret = result;
+			goto out;
+		}
 		/* set up (new) protocol handler? */
 		if (ac->protocol && ac->protocol != protocol) {
 			ac->ops->destroy(ac);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 3df6d4ab236c..a828943296c5 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -712,10 +712,14 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 		/* wait */
 		dout("mount waiting for mon_map\n");
 		err = wait_event_interruptible_timeout(client->mount_wq, /* FIXME */
-			       have_mon_map(client),
+			       have_mon_map(client) || (client->mount_err < 0),
 			       timeout);
 		if (err == -EINTR || err == -ERESTARTSYS)
 			goto out;
+		if (client->mount_err < 0) {
+			err = client->mount_err;
+			goto out;
+		}
 	}
 
 	dout("mount opening root\n");
-- 
cgit v1.2.2


From 0dc2570fab222affe7739b88b5ed04c511d433dc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 20 Nov 2009 13:43:45 -0800
Subject: ceph: reset requested max_size after mds reconnect

The max_size increase request to the MDS can get lost during an MDS
restart and reconnect.  Reset our requested value after the MDS recovers,
so that any blocked writes will re-request a larger max_size upon waking.

Also, explicit wake session caps after the reconnect.  Normally the cap
renewal catches this, but not in the cases where the caps didn't go stale
in the first place, which would leave writers waiting on max_size asleep.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 8d95b0f051e4..7da836909abb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -746,14 +746,24 @@ static void remove_session_caps(struct ceph_mds_session *session)
 static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
 			      void *arg)
 {
-	wake_up(&ceph_inode(inode)->i_cap_wq);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	wake_up(&ci->i_cap_wq);
+	if (arg) {
+		spin_lock(&inode->i_lock);
+		ci->i_wanted_max_size = 0;
+		ci->i_requested_max_size = 0;
+		spin_unlock(&inode->i_lock);
+	}
 	return 0;
 }
 
-static void wake_up_session_caps(struct ceph_mds_session *session)
+static void wake_up_session_caps(struct ceph_mds_session *session,
+				 int reconnect)
 {
 	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
-	iterate_session_caps(session, wake_up_session_cb, NULL);
+	iterate_session_caps(session, wake_up_session_cb,
+			     (void *)(unsigned long)reconnect);
 }
 
 /*
@@ -794,6 +804,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 
 /*
  * Note new cap ttl, and any transition from stale -> not stale (fresh?).
+ *
+ * Called under session->s_mutex
  */
 static void renewed_caps(struct ceph_mds_client *mdsc,
 			 struct ceph_mds_session *session, int is_renew)
@@ -822,7 +834,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc,
 	spin_unlock(&session->s_cap_lock);
 
 	if (wake)
-		wake_up_session_caps(session);
+		wake_up_session_caps(session, 0);
 }
 
 /*
@@ -2248,6 +2260,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 			pr_info("mds%d reconnect completed\n", s->s_mds);
 			kick_requests(mdsc, i, 1);
 			ceph_kick_flushing_caps(mdsc, s);
+			wake_up_session_caps(s, 1);
 		}
 	}
 }
-- 
cgit v1.2.2


From 03c677e1d189ff62891d9f278c55bb798a418b81 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 20 Nov 2009 15:14:15 -0800
Subject: ceph: reset msgr backoff during open, not after successful handshake

Reset the backoff delay when we reopen the connection, so that the delays
for any initial connection problems are reasonable.  We were resetting only
after a successful handshake, which was of limited utility.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 0b1674854b25..45cec31fdf5e 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -348,6 +348,7 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
 	set_bit(OPENING, &con->state);
 	clear_bit(CLOSED, &con->state);
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
+	con->delay = 0;      /* reset backoff memory */
 	queue_con(con);
 }
 
@@ -1162,8 +1163,6 @@ static int process_connect(struct ceph_connection *con)
 		     con->connect_seq);
 		WARN_ON(con->connect_seq !=
 			le32_to_cpu(con->in_reply.connect_seq));
-
-		con->delay = 0;  /* reset backoff memory */
 		prepare_read_tag(con);
 		break;
 
-- 
cgit v1.2.2


From b19a29af74c09553b9fef95cdf6e9af3df65f544 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 20 Nov 2009 14:44:18 -0800
Subject: ceph: remove dead code

Left over from mount/auth protocol changes.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index b742b3b3e0f3..9ff2da69d33a 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -278,27 +278,6 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 	mutex_unlock(&monc->mutex);
 }
 
-#if 0
-/*
- * mount
- */
-static void __request_mount(struct ceph_mon_client *monc)
-{
-	struct ceph_msg *msg;
-	struct ceph_client_mount *h;
-
-	dout("__request_mount\n");
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_MOUNT, sizeof(*h), 0, 0, NULL);
-	if (IS_ERR(msg))
-		return;
-	h = msg->front.iov_base;
-	h->monhdr.have_version = 0;
-	h->monhdr.session_mon = cpu_to_le16(-1);
-	h->monhdr.session_mon_tid = 0;
-	ceph_con_send(monc->con, msg);
-}
-#endif
-
 /*
  * 
  */
-- 
cgit v1.2.2


From 60d877334f7d9f5f2417ea4a83c1def769286102 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Sat, 21 Nov 2009 12:53:08 +0100
Subject: fs/ceph: introduce missing kfree

Error handling code following a kmalloc should free the allocated data.

The semantic match that finds this problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@r exists@
local idexpression x;
statement S;
expression E;
identifier f,f1,l;
position p1,p2;
expression *ptr != NULL;
@@

x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...);
...
if (x == NULL) S
<... when != x
     when != if (...) { <+...x...+> }
(
x->f1 = E
|
 (x->f1 == NULL || ...)
|
 f(...,x->f1,...)
)
...>
(
 return \(0\|<+...x...+>\|ptr\);
|
 return@p2 ...;
)

@script:python@
p1 << r.p1;
p2 << r.p2;
@@

print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 1a48a55c5109..04769a3ab832 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -655,8 +655,10 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
 	/* do request */
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
 				       USE_AUTH_MDS);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
 	req->r_inode = igrab(inode);
 	req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
 	req->r_num_caps = 1;
-- 
cgit v1.2.2


From 32c895e776a0dd2cb701d60fbd6440280c09ce35 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Sat, 21 Nov 2009 16:53:16 +0100
Subject: fs/ceph: Move a dereference below a NULL test

If the NULL test is necessary, then the dereference should be moved below
the NULL test.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/).

// <smpl>
@@
type T;
expression E;
identifier i,fld;
statement S;
@@

- T i = E->fld;
+ T i;
  ... when != E
      when != i
  if (E == NULL) S
+ i = E->fld;
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index d63f192999ee..5d30d5959b97 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1249,11 +1249,12 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	struct ceph_osd *osd = con->private;
-	struct ceph_osd_client *osdc = osd->o_osdc;
+	struct ceph_osd_client *osdc;
 	int type = le16_to_cpu(msg->hdr.type);
 
 	if (!osd)
 		return;
+	osdc = osd->o_osdc;
 
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
-- 
cgit v1.2.2


From 75eb3592811028e5b01835126483d115532a3aa1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 21 Nov 2009 13:08:14 -0800
Subject: ceph: remove useless IS_ERR checks

ceph_lookup_snap_realm either returns a valid pointer or NULL; there is no
need to check IS_ERR(result).

Reported-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 2e3cb40b7e48..52f46a1208f5 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -226,8 +226,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
 		return 0;
 
 	parent = ceph_lookup_snap_realm(mdsc, parentino);
-	if (IS_ERR(parent))
-		return PTR_ERR(parent);
 	if (!parent) {
 		parent = ceph_create_snap_realm(mdsc, parentino);
 		if (IS_ERR(parent))
@@ -541,10 +539,6 @@ more:
 	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
 
 	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
-	if (IS_ERR(realm)) {
-		err = PTR_ERR(realm);
-		goto fail;
-	}
 	if (!realm) {
 		realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
 		if (IS_ERR(realm)) {
@@ -762,8 +756,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 		ri = p;
 
 		realm = ceph_lookup_snap_realm(mdsc, split);
-		if (IS_ERR(realm))
-			goto out;
 		if (!realm) {
 			realm = ceph_create_snap_realm(mdsc, split);
 			if (IS_ERR(realm))
@@ -829,8 +821,6 @@ skip_inode:
 			struct ceph_snap_realm *child =
 				ceph_lookup_snap_realm(mdsc,
 					   le64_to_cpu(split_realms[i]));
-			if (IS_ERR(child))
-				continue;
 			if (!child)
 				continue;
 			adjust_snap_realm_parent(mdsc, child, realm->ino);
-- 
cgit v1.2.2


From 34b43a56b9b103a7a820032177131532d9dbdbe8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 1 Dec 2009 12:23:54 -0800
Subject: ceph: plug leak of request_mutex

Fix leak of osd client request_mutex on receiving dup ack.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 5d30d5959b97..d600073f1d3f 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -739,6 +739,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 		req->r_got_reply = 1;
 	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
 		dout("handle_reply tid %llu dup ack\n", tid);
+		mutex_unlock(&osdc->request_mutex);
 		goto done;
 	}
 
-- 
cgit v1.2.2


From 50b885b96c903e420a1eac54dd27626244704a06 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 1 Dec 2009 14:12:07 -0800
Subject: ceph: whitespace cleanup

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c         | 2 +-
 fs/ceph/ceph_hash.c    | 8 ++++----
 fs/ceph/crush/mapper.c | 2 +-
 fs/ceph/mon_client.c   | 2 +-
 fs/ceph/osd_client.c   | 4 ++--
 5 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9dd110602cda..9b9ce143ac1f 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1316,7 +1316,7 @@ static int __mark_caps_flushing(struct inode *inode,
 	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int flushing;
-	
+
 	BUG_ON(ci->i_dirty_caps == 0);
 	BUG_ON(list_empty(&ci->i_dirty_item));
 
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
index 1c44e43fe89e..bd570015d147 100644
--- a/fs/ceph/ceph_hash.c
+++ b/fs/ceph/ceph_hash.c
@@ -82,14 +82,14 @@ unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
  */
 unsigned ceph_str_hash_linux(const char *str, unsigned length)
 {
-        unsigned long hash = 0;
+	unsigned long hash = 0;
 	unsigned char c;
 
-        while (length--) {
+	while (length--) {
 		c = *str++;
 		hash = (hash + (c << 4) + (c >> 4)) * 11;
 	}
-        return hash;
+	return hash;
 }
 
 
@@ -105,7 +105,7 @@ unsigned ceph_str_hash(int type, const char *s, unsigned len)
 	}
 }
 
-const char *ceph_str_hash_name(int type) 
+const char *ceph_str_hash_name(int type)
 {
 	switch (type) {
 	case CEPH_STR_HASH_LINUX:
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 2523d448445c..9ba54efb6543 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -254,7 +254,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 					   x, r);
 	default:
 		BUG_ON(1);
- 		return in->items[0];
+		return in->items[0];
 	}
 }
 
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 9ff2da69d33a..1dd0dc258c50 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -279,7 +279,7 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 }
 
 /*
- * 
+ *
  */
 int ceph_monc_open_session(struct ceph_mon_client *monc)
 {
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index d600073f1d3f..d639c74e749f 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1308,8 +1308,8 @@ static void put_osd_con(struct ceph_connection *con)
  * authentication
  */
 static int get_authorizer(struct ceph_connection *con,
-                          void **buf, int *len, int *proto,
-                          void **reply_buf, int *reply_len, int force_new)
+	                  void **buf, int *len, int *proto,
+	                  void **reply_buf, int *reply_len, int force_new)
 {
 	struct ceph_osd *o = con->private;
 	struct ceph_osd_client *osdc = o->o_osdc;
-- 
cgit v1.2.2


From 1d1de9160e0d8aff0d67a21137b62e63ffd6f184 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 2 Dec 2009 11:54:25 -0800
Subject: ceph: hide /.ceph from readdir results

We need to skip /.ceph in (cached) readdir results, and exclude "/.ceph"
from the cached ENOENT lookup check.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 1 +
 fs/ceph/dir.c     | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 4e5f49c738d8..699196a10c66 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -44,6 +44,7 @@
 
 
 #define CEPH_INO_ROOT  1
+#define CEPH_INO_CEPH  2        /* hidden .ceph dir */
 
 /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
 #define CEPH_MAX_MON   31
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 32ef54367224..89ce3ba4a614 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -132,6 +132,7 @@ more:
 		}
 		if (!d_unhashed(dentry) && dentry->d_inode &&
 		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
+		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
 		    filp->f_pos <= di->offset)
 			break;
 		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
@@ -512,6 +513,12 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 	return dentry;
 }
 
+static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+{
+	return ceph_ino(inode) == CEPH_INO_ROOT &&
+		strncmp(dentry->d_name.name, ".ceph", 5) == 0;
+}
+
 /*
  * Look up a single dir entry.  If there is a lookup intent, inform
  * the MDS so that it gets our 'caps wanted' value in a single op.
@@ -554,6 +561,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 		if (strncmp(dentry->d_name.name,
 			    client->mount_args->snapdir_name,
 			    dentry->d_name.len) &&
+		    !is_root_ceph_dentry(dir, dentry) &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
 		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
 			di->offset = ci->i_max_offset++;
-- 
cgit v1.2.2


From 33d4909ccc094b8262667bccdd52e01458bee0df Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 2 Dec 2009 14:42:39 -0800
Subject: ceph: allow preferred osd to be get/set via layout ioctl

There is certainly no reason not to report this.

The only real downside to allowing the user to set it is that you don't
get default values by zeroing the layout struct (the default is -1).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ioctl.c | 5 ++++-
 fs/ceph/ioctl.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 4c33e19fc241..8a5bcae62846 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -24,6 +24,8 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
 		l.object_size = ceph_file_layout_object_size(ci->i_layout);
 		l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+		l.preferred_osd =
+			(s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
 		if (copy_to_user(arg, &l, sizeof(l)))
 			return -EFAULT;
 	}
@@ -79,7 +81,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	req->r_args.setlayout.layout.fl_object_size =
 		cpu_to_le32(l.object_size);
 	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
-	req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32((s32)-1);
+	req->r_args.setlayout.layout.fl_pg_preferred =
+		cpu_to_le32(l.preferred_osd);
 
 	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
 	ceph_mdsc_put_request(req);
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 3c511dab3730..25e4f1a9d059 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -10,6 +10,7 @@
 struct ceph_ioctl_layout {
 	__u64 stripe_unit, stripe_count, object_size;
 	__u64 data_pool;
+	__s64 preferred_osd;
 };
 
 #define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1,		\
-- 
cgit v1.2.2


From 2f2ffd35822688a3650e503197b8724f47312748 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Dec 2009 10:27:17 -0800
Subject: ceph: mark v0.18 release

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 699196a10c66..e2fd0247827e 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
  * Ceph release version
  */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 17
+#define CEPH_VERSION_MINOR 18
 #define CEPH_VERSION_PATCH 0
 
 #define _CEPH_STRINGIFY(x) #x
-- 
cgit v1.2.2


From dd26d857a7bf1b5b734a23180c19eac3e46db944 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 5 Dec 2009 10:13:33 -0800
Subject: ceph: use kref for ceph_buffer

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/buffer.c | 14 +++++++++++++-
 fs/ceph/buffer.h | 18 +++++++-----------
 2 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index cf9aaccef22b..847c5da9a0db 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -9,13 +9,25 @@ struct ceph_buffer *ceph_buffer_new(gfp_t gfp)
 	b = kmalloc(sizeof(*b), gfp);
 	if (!b)
 		return NULL;
-	atomic_set(&b->nref, 1);
+	kref_init(&b->kref);
 	b->vec.iov_base = NULL;
 	b->vec.iov_len = 0;
 	b->alloc_len = 0;
 	return b;
 }
 
+void ceph_buffer_release(struct kref *kref)
+{
+	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+	if (b->vec.iov_base) {
+		if (b->is_vmalloc)
+			vfree(b->vec.iov_base);
+		else
+			kfree(b->vec.iov_base);
+	}
+	kfree(b);
+}
+
 int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
 {
 	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
index 16b1930acc45..3f541a13094f 100644
--- a/fs/ceph/buffer.h
+++ b/fs/ceph/buffer.h
@@ -1,6 +1,7 @@
 #ifndef __FS_CEPH_BUFFER_H
 #define __FS_CEPH_BUFFER_H
 
+#include <linux/kref.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/types.h>
@@ -13,7 +14,7 @@
  * sizes.
  */
 struct ceph_buffer {
-	atomic_t nref;
+	struct kref kref;
 	struct kvec vec;
 	size_t alloc_len;
 	bool is_vmalloc;
@@ -24,21 +25,16 @@ int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp);
 
 static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
 {
-	atomic_inc(&b->nref);
+	kref_get(&b->kref);
 	return b;
 }
 
+void ceph_buffer_release(struct kref *kref);
+
 static inline void ceph_buffer_put(struct ceph_buffer *b)
 {
-	if (b && atomic_dec_and_test(&b->nref)) {
-		if (b->vec.iov_base) {
-			if (b->is_vmalloc)
-				vfree(b->vec.iov_base);
-			else
-				kfree(b->vec.iov_base);
-		}
-		kfree(b);
-	}
+	if (b)
+		kref_put(&b->kref, ceph_buffer_release);
 }
 
 static inline struct ceph_buffer *ceph_buffer_new_alloc(int len, gfp_t gfp)
-- 
cgit v1.2.2


From b6c1d5b81ea0841ae9d3ce2cda319ab986b081cf Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 7 Dec 2009 12:17:17 -0800
Subject: ceph: simplify ceph_buffer interface

We never allocate the ceph_buffer and buffer separtely, so use a single
constructor.

Disallow put on NULL buffer; make the caller check.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/buffer.c    | 23 +++++++++++++++++++----
 fs/ceph/buffer.h    | 20 +++-----------------
 fs/ceph/inode.c     | 11 +++++++----
 fs/ceph/messenger.c |  2 +-
 fs/ceph/xattr.c     |  8 +++++---
 5 files changed, 35 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index 847c5da9a0db..2576bd452cb8 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -2,23 +2,38 @@
 #include "ceph_debug.h"
 #include "buffer.h"
 
-struct ceph_buffer *ceph_buffer_new(gfp_t gfp)
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
 {
 	struct ceph_buffer *b;
 
 	b = kmalloc(sizeof(*b), gfp);
 	if (!b)
 		return NULL;
+
+	b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+	if (b->vec.iov_base) {
+		b->is_vmalloc = false;
+	} else {
+		b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+		if (!b->vec.iov_base) {
+			kfree(b);
+			return NULL;
+		}
+		b->is_vmalloc = true;
+	}
+
 	kref_init(&b->kref);
-	b->vec.iov_base = NULL;
-	b->vec.iov_len = 0;
-	b->alloc_len = 0;
+	b->alloc_len = len;
+	b->vec.iov_len = len;
+	dout("buffer_new %p\n", b);
 	return b;
 }
 
 void ceph_buffer_release(struct kref *kref)
 {
 	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+	dout("buffer_release %p\n", b);
 	if (b->vec.iov_base) {
 		if (b->is_vmalloc)
 			vfree(b->vec.iov_base);
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
index 3f541a13094f..47b9514c5bbd 100644
--- a/fs/ceph/buffer.h
+++ b/fs/ceph/buffer.h
@@ -20,8 +20,8 @@ struct ceph_buffer {
 	bool is_vmalloc;
 };
 
-struct ceph_buffer *ceph_buffer_new(gfp_t gfp);
-int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp);
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
 
 static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
 {
@@ -29,23 +29,9 @@ static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
 	return b;
 }
 
-void ceph_buffer_release(struct kref *kref);
-
 static inline void ceph_buffer_put(struct ceph_buffer *b)
 {
-	if (b)
-		kref_put(&b->kref, ceph_buffer_release);
-}
-
-static inline struct ceph_buffer *ceph_buffer_new_alloc(int len, gfp_t gfp)
-{
-	struct ceph_buffer *b = ceph_buffer_new(gfp);
-
-	if (b && ceph_buffer_alloc(b, len, gfp) < 0) {
-		ceph_buffer_put(b);
-		b = NULL;
-	}
-	return b;
+	kref_put(&b->kref, ceph_buffer_release);
 }
 
 #endif
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 074ee42bd344..db684686f48a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -383,8 +383,10 @@ void ceph_destroy_inode(struct inode *inode)
 	}
 
 	__ceph_destroy_xattrs(ci);
-	ceph_buffer_put(ci->i_xattrs.blob);
-	ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+	if (ci->i_xattrs.blob)
+		ceph_buffer_put(ci->i_xattrs.blob);
+	if (ci->i_xattrs.prealloc_blob)
+		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
 
 	kmem_cache_free(ceph_inode_cachep, ci);
 }
@@ -526,7 +528,7 @@ static int fill_inode(struct inode *inode,
 	 * bytes are the xattr count).
 	 */
 	if (iinfo->xattr_len > 4) {
-		xattr_blob = ceph_buffer_new_alloc(iinfo->xattr_len, GFP_NOFS);
+		xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
 		if (!xattr_blob)
 			pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
 			       iinfo->xattr_len);
@@ -715,7 +717,8 @@ no_change:
 	err = 0;
 
 out:
-	ceph_buffer_put(xattr_blob);
+	if (xattr_blob)
+		ceph_buffer_put(xattr_blob);
 	return err;
 }
 
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 45cec31fdf5e..bf762107b3d5 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -2047,7 +2047,7 @@ int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
 	BUG_ON(!middle_len);
 	BUG_ON(msg->middle);
 
-	msg->middle = ceph_buffer_new_alloc(middle_len, GFP_NOFS);
+	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
 	if (!msg->middle)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 04769a3ab832..37d6ce645691 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -482,7 +482,8 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
 		ci->i_xattrs.prealloc_blob->vec.iov_len =
 			dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
 
-		ceph_buffer_put(ci->i_xattrs.blob);
+		if (ci->i_xattrs.blob)
+			ceph_buffer_put(ci->i_xattrs.blob);
 		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
 		ci->i_xattrs.prealloc_blob = NULL;
 		ci->i_xattrs.dirty = false;
@@ -745,11 +746,12 @@ retry:
 
 		spin_unlock(&inode->i_lock);
 		dout(" preaallocating new blob size=%d\n", required_blob_size);
-		blob = ceph_buffer_new_alloc(required_blob_size, GFP_NOFS);
+		blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
 		if (!blob)
 			goto out;
 		spin_lock(&inode->i_lock);
-		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+		if (ci->i_xattrs.prealloc_blob)
+			ceph_buffer_put(ci->i_xattrs.prealloc_blob);
 		ci->i_xattrs.prealloc_blob = blob;
 		goto retry;
 	}
-- 
cgit v1.2.2


From 153c8e6bf7ffee561e046e60b26ef6486c6fc9f2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 7 Dec 2009 12:31:09 -0800
Subject: ceph: use kref for struct ceph_mds_request

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 69 ++++++++++++++++++++++++++--------------------------
 fs/ceph/mds_client.h | 11 ++++++---
 2 files changed, 42 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7da836909abb..739093f281d0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -400,41 +400,40 @@ static void put_request_session(struct ceph_mds_request *req)
 	}
 }
 
-void ceph_mdsc_put_request(struct ceph_mds_request *req)
-{
-	dout("mdsc put_request %p %d -> %d\n", req,
-	     atomic_read(&req->r_ref), atomic_read(&req->r_ref)-1);
-	if (atomic_dec_and_test(&req->r_ref)) {
-		if (req->r_request)
-			ceph_msg_put(req->r_request);
-		if (req->r_reply) {
-			ceph_msg_put(req->r_reply);
-			destroy_reply_info(&req->r_reply_info);
-		}
-		if (req->r_inode) {
-			ceph_put_cap_refs(ceph_inode(req->r_inode),
-					  CEPH_CAP_PIN);
-			iput(req->r_inode);
-		}
-		if (req->r_locked_dir)
-			ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
-					  CEPH_CAP_PIN);
-		if (req->r_target_inode)
-			iput(req->r_target_inode);
-		if (req->r_dentry)
-			dput(req->r_dentry);
-		if (req->r_old_dentry) {
-			ceph_put_cap_refs(
-			     ceph_inode(req->r_old_dentry->d_parent->d_inode),
-			     CEPH_CAP_PIN);
-			dput(req->r_old_dentry);
-		}
-		kfree(req->r_path1);
-		kfree(req->r_path2);
-		put_request_session(req);
-		ceph_unreserve_caps(&req->r_caps_reservation);
-		kfree(req);
+void ceph_mdsc_release_request(struct kref *kref)
+{
+	struct ceph_mds_request *req = container_of(kref,
+						    struct ceph_mds_request,
+						    r_kref);
+	if (req->r_request)
+		ceph_msg_put(req->r_request);
+	if (req->r_reply) {
+		ceph_msg_put(req->r_reply);
+		destroy_reply_info(&req->r_reply_info);
+	}
+	if (req->r_inode) {
+		ceph_put_cap_refs(ceph_inode(req->r_inode),
+				  CEPH_CAP_PIN);
+		iput(req->r_inode);
+	}
+	if (req->r_locked_dir)
+		ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
+				  CEPH_CAP_PIN);
+	if (req->r_target_inode)
+		iput(req->r_target_inode);
+	if (req->r_dentry)
+		dput(req->r_dentry);
+	if (req->r_old_dentry) {
+		ceph_put_cap_refs(
+			ceph_inode(req->r_old_dentry->d_parent->d_inode),
+			CEPH_CAP_PIN);
+		dput(req->r_old_dentry);
 	}
+	kfree(req->r_path1);
+	kfree(req->r_path2);
+	put_request_session(req);
+	ceph_unreserve_caps(&req->r_caps_reservation);
+	kfree(req);
 }
 
 /*
@@ -1097,7 +1096,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 	req->r_resend_mds = -1;
 	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
 	req->r_fmode = -1;
-	atomic_set(&req->r_ref, 1);  /* one for request_tree, one for caller */
+	kref_init(&req->r_kref);
 	INIT_LIST_HEAD(&req->r_wait);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 9faa1b2f79a7..41af5ca316e6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -2,6 +2,7 @@
 #define _FS_CEPH_MDS_CLIENT_H
 
 #include <linux/completion.h>
+#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/radix-tree.h>
@@ -203,7 +204,7 @@ struct ceph_mds_request {
 	int               r_num_stale;
 	int               r_resend_mds; /* mds to resend to next, if any*/
 
-	atomic_t          r_ref;
+	struct kref       r_kref;
 	struct list_head  r_wait;
 	struct completion r_completion;
 	struct completion r_safe_completion;
@@ -306,9 +307,13 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 				struct ceph_mds_request *req);
 static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
 {
-	atomic_inc(&req->r_ref);
+	kref_get(&req->r_kref);
+}
+extern void ceph_mdsc_release_request(struct kref *kref);
+static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
+{
+	kref_put(&req->r_kref, ceph_mdsc_release_request);
 }
-extern void ceph_mdsc_put_request(struct ceph_mds_request *req);
 
 extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
 
-- 
cgit v1.2.2


From 415e49a9c4faf1a1480b1497da2037608e5aa2c5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 7 Dec 2009 13:37:03 -0800
Subject: ceph: use kref for ceph_osd_request

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 37 ++++++++++++++++++-------------------
 fs/ceph/osd_client.h | 11 ++++++++---
 2 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index d639c74e749f..67ef8ab06af4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -77,25 +77,24 @@ static void calc_layout(struct ceph_osd_client *osdc,
 /*
  * requests
  */
-void ceph_osdc_put_request(struct ceph_osd_request *req)
+void ceph_osdc_release_request(struct kref *kref)
 {
-	dout("osdc put_request %p %d -> %d\n", req, atomic_read(&req->r_ref),
-	     atomic_read(&req->r_ref)-1);
-	BUG_ON(atomic_read(&req->r_ref) <= 0);
-	if (atomic_dec_and_test(&req->r_ref)) {
-		if (req->r_request)
-			ceph_msg_put(req->r_request);
-		if (req->r_reply)
-			ceph_msg_put(req->r_reply);
-		if (req->r_own_pages)
-			ceph_release_page_vector(req->r_pages,
-						 req->r_num_pages);
-		ceph_put_snap_context(req->r_snapc);
-		if (req->r_mempool)
-			mempool_free(req, req->r_osdc->req_mempool);
-		else
-			kfree(req);
-	}
+	struct ceph_osd_request *req = container_of(kref,
+						    struct ceph_osd_request,
+						    r_kref);
+
+	if (req->r_request)
+		ceph_msg_put(req->r_request);
+	if (req->r_reply)
+		ceph_msg_put(req->r_reply);
+	if (req->r_own_pages)
+		ceph_release_page_vector(req->r_pages,
+					 req->r_num_pages);
+	ceph_put_snap_context(req->r_snapc);
+	if (req->r_mempool)
+		mempool_free(req, req->r_osdc->req_mempool);
+	else
+		kfree(req);
 }
 
 /*
@@ -149,7 +148,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
-	atomic_set(&req->r_ref, 1);
+	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 3d4ae6595aaa..20ee61847416 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -2,6 +2,7 @@
 #define _FS_CEPH_OSD_CLIENT_H
 
 #include <linux/completion.h>
+#include <linux/kref.h>
 #include <linux/mempool.h>
 #include <linux/rbtree.h>
 
@@ -49,7 +50,7 @@ struct ceph_osd_request {
 	int r_prepared_pages, r_got_reply;
 
 	struct ceph_osd_client *r_osdc;
-	atomic_t          r_ref;
+	struct kref       r_kref;
 	bool              r_mempool;
 	struct completion r_completion, r_safe_completion;
 	ceph_osdc_callback_t r_callback, r_safe_callback;
@@ -118,9 +119,13 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
 
 static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
 {
-	atomic_inc(&req->r_ref);
+	kref_get(&req->r_kref);
+}
+extern void ceph_osdc_release_request(struct kref *kref);
+static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+	kref_put(&req->r_kref, ceph_osdc_release_request);
 }
-extern void ceph_osdc_put_request(struct ceph_osd_request *req);
 
 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
 				   struct ceph_osd_request *req,
-- 
cgit v1.2.2


From c2e552e76e2c6907ca50cd9a4b747a2e2e8c615e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 7 Dec 2009 15:55:05 -0800
Subject: ceph: use kref for ceph_msg

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 47 ++++++++++++++++++-----------------------------
 fs/ceph/messenger.h | 13 ++++++++-----
 fs/ceph/msgpool.c   |  2 +-
 3 files changed, 27 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index bf762107b3d5..b0571b01b19f 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1958,7 +1958,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
 	m = kmalloc(sizeof(*m), GFP_NOFS);
 	if (m == NULL)
 		goto out;
-	atomic_set(&m->nref, 1);
+	kref_init(&m->kref);
 	INIT_LIST_HEAD(&m->list_head);
 
 	m->hdr.type = cpu_to_le16(type);
@@ -2070,34 +2070,23 @@ void ceph_msg_kfree(struct ceph_msg *m)
 /*
  * Drop a msg ref.  Destroy as needed.
  */
-void ceph_msg_put(struct ceph_msg *m)
-{
-	dout("ceph_msg_put %p %d -> %d\n", m, atomic_read(&m->nref),
-	     atomic_read(&m->nref)-1);
-	if (atomic_read(&m->nref) <= 0) {
-		pr_err("bad ceph_msg_put on %p %llu %d=%s %d+%d\n",
-		       m, le64_to_cpu(m->hdr.seq),
-		       le16_to_cpu(m->hdr.type),
-		       ceph_msg_type_name(le16_to_cpu(m->hdr.type)),
-		       le32_to_cpu(m->hdr.front_len),
-		       le32_to_cpu(m->hdr.data_len));
-		WARN_ON(1);
-	}
-	if (atomic_dec_and_test(&m->nref)) {
-		dout("ceph_msg_put last one on %p\n", m);
-		WARN_ON(!list_empty(&m->list_head));
-
-		/* drop middle, data, if any */
-		if (m->middle) {
-			ceph_buffer_put(m->middle);
-			m->middle = NULL;
-		}
-		m->nr_pages = 0;
-		m->pages = NULL;
+void ceph_msg_last_put(struct kref *kref)
+{
+	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
 
-		if (m->pool)
-			ceph_msgpool_put(m->pool, m);
-		else
-			ceph_msg_kfree(m);
+	dout("ceph_msg_put last one on %p\n", m);
+	WARN_ON(!list_empty(&m->list_head));
+
+	/* drop middle, data, if any */
+	if (m->middle) {
+		ceph_buffer_put(m->middle);
+		m->middle = NULL;
 	}
+	m->nr_pages = 0;
+	m->pages = NULL;
+
+	if (m->pool)
+		ceph_msgpool_put(m->pool, m);
+	else
+		ceph_msg_kfree(m);
 }
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index f9c9f6487302..981b7c08ad82 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -1,6 +1,7 @@
 #ifndef __FS_CEPH_MESSENGER_H
 #define __FS_CEPH_MESSENGER_H
 
+#include <linux/kref.h>
 #include <linux/mutex.h>
 #include <linux/net.h>
 #include <linux/radix-tree.h>
@@ -85,7 +86,7 @@ struct ceph_msg {
 	struct page **pages;            /* data payload.  NOT OWNER. */
 	unsigned nr_pages;              /* size of page array */
 	struct list_head list_head;
-	atomic_t nref;
+	struct kref kref;
 	bool front_is_vmalloc;
 	bool more_to_follow;
 	int front_max;
@@ -243,11 +244,13 @@ extern int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg);
 
 static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
 {
-	dout("ceph_msg_get %p %d -> %d\n", msg, atomic_read(&msg->nref),
-	     atomic_read(&msg->nref)+1);
-	atomic_inc(&msg->nref);
+	kref_get(&msg->kref);
 	return msg;
 }
-extern void ceph_msg_put(struct ceph_msg *msg);
+extern void ceph_msg_last_put(struct kref *kref);
+static inline void ceph_msg_put(struct ceph_msg *msg)
+{
+	kref_put(&msg->kref, ceph_msg_last_put);
+}
 
 #endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index 7599b3382076..ad5482c0267b 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -165,7 +165,7 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
 {
 	spin_lock(&pool->lock);
 	if (pool->num < pool->min) {
-		ceph_msg_get(msg);   /* retake a single ref */
+		kref_set(&msg->kref, 1);  /* retake a single ref */
 		list_add(&msg->list_head, &pool->msgs);
 		pool->num++;
 		dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
-- 
cgit v1.2.2


From 767ea5c33a360ce88da24e296e802dace5821799 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 9 Dec 2009 12:34:01 -0800
Subject: ceph: do not feed bad device ids to crush

Do not feed bad (large) device ids to CRUSH.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 8c994c714781..be5318aa7714 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -868,6 +868,11 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	ps = le16_to_cpu(pgid.ps);
 	preferred = (s16)le16_to_cpu(pgid.preferred);
 
+	/* don't forcefeed bad device ids to crush */
+	if (preferred >= osdmap->max_osd ||
+	    preferred >= osdmap->crush->max_devices)
+		preferred = -1;
+
 	if (poolid >= osdmap->num_pools)
 		return NULL;
 	pool = &osdmap->pg_pool[poolid];
-- 
cgit v1.2.2


From d4a780ce8821a37dd135f15b6150a5bfc5604f29 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 11 Dec 2009 08:55:23 -0800
Subject: ceph: fix leak of monc mutex

Fix leak of monc mutex on ENOMEM or bad fsid when receiving new mon map.
Audited all other users.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 1dd0dc258c50..a76da5e6dbdd 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -320,17 +320,18 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 	if (IS_ERR(monmap)) {
 		pr_err("problem decoding monmap, %d\n",
 		       (int)PTR_ERR(monmap));
-		return;
+		goto out;
 	}
 
 	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
 		kfree(monmap);
-		return;
+		goto out;
 	}
 
 	client->monc.monmap = monmap;
 	kfree(old);
 
+out:
 	mutex_unlock(&monc->mutex);
 	wake_up(&client->mount_wq);
 }
-- 
cgit v1.2.2


From c86a2930ccbd90d77c54d04b5c2bbec95b989e40 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 14 Dec 2009 14:04:30 -0800
Subject: ceph: carry explicit msg reference for currently sending message

Carry a ceph_msg reference for connection->out_msg.  This will allow us to
make out_sent optional.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 22 ++++++++++++++++++----
 fs/ceph/messenger.h |  1 +
 2 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index b0571b01b19f..96fd556a7804 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -322,7 +322,10 @@ static void reset_connection(struct ceph_connection *con)
 
 	con->connect_seq = 0;
 	con->out_seq = 0;
-	con->out_msg = NULL;
+	if (con->out_msg) {
+		ceph_msg_put(con->out_msg);
+		con->out_msg = NULL;
+	}
 	con->in_seq = 0;
 	mutex_unlock(&con->out_mutex);
 }
@@ -423,7 +426,7 @@ static void prepare_write_message_footer(struct ceph_connection *con, int v)
 	con->out_kvec_bytes += sizeof(m->footer);
 	con->out_kvec_left++;
 	con->out_more = m->more_to_follow;
-	con->out_msg = NULL;   /* we're done with this one */
+	con->out_msg_done = true;
 }
 
 /*
@@ -436,6 +439,7 @@ static void prepare_write_message(struct ceph_connection *con)
 
 	con->out_kvec_bytes = 0;
 	con->out_kvec_is_msg = true;
+	con->out_msg_done = false;
 
 	/* Sneak an ack in there first?  If we can get it into the same
 	 * TCP packet that's a good thing. */
@@ -452,8 +456,9 @@ static void prepare_write_message(struct ceph_connection *con)
 	/* move message to sending/sent list */
 	m = list_first_entry(&con->out_queue,
 		       struct ceph_msg, list_head);
+	con->out_msg = m;
+	ceph_msg_get(m);
 	list_move_tail(&m->list_head, &con->out_sent);
-	con->out_msg = m;   /* we don't bother taking a reference here. */
 
 	m->hdr.seq = cpu_to_le64(++con->out_seq);
 
@@ -1521,6 +1526,12 @@ more_kvec:
 
 	/* msg pages? */
 	if (con->out_msg) {
+		if (con->out_msg_done) {
+			ceph_msg_put(con->out_msg);
+			con->out_msg = NULL;   /* we're done with this one */
+			goto do_next;
+		}
+
 		ret = write_partial_msg_pages(con);
 		if (ret == 1)
 			goto more_kvec;  /* we need to send the footer, too! */
@@ -1533,6 +1544,7 @@ more_kvec:
 		}
 	}
 
+do_next:
 	if (!test_bit(CONNECTING, &con->state)) {
 		/* is anything else pending? */
 		if (!list_empty(&con->out_queue)) {
@@ -1923,8 +1935,10 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 		list_del_init(&msg->list_head);
 		ceph_msg_put(msg);
 		msg->hdr.seq = 0;
-		if (con->out_msg == msg)
+		if (con->out_msg == msg) {
+			ceph_msg_put(con->out_msg);
 			con->out_msg = NULL;
+		}
 		if (con->out_kvec_is_msg) {
 			con->out_skip = con->out_kvec_bytes;
 			con->out_kvec_is_msg = false;
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 981b7c08ad82..eff5cb5197fc 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -182,6 +182,7 @@ struct ceph_connection {
 	/* message out temps */
 	struct ceph_msg *out_msg;        /* sending message (== tail of
 					    out_sent) */
+	bool out_msg_done;
 	struct ceph_msg_pos out_msg_pos;
 
 	struct kvec out_kvec[8],         /* sending header/footer data */
-- 
cgit v1.2.2


From 5e095e8b40b0402ad3bcadc5b8d84c38b26c30b2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 14 Dec 2009 14:30:34 -0800
Subject: ceph: plug msg leak in con_fault

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 96fd556a7804..98519bd33f04 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1434,8 +1434,9 @@ no_data:
  */
 static void process_message(struct ceph_connection *con)
 {
-	struct ceph_msg *msg = con->in_msg;
+	struct ceph_msg *msg;
 
+	msg = con->in_msg;
 	con->in_msg = NULL;
 
 	/* if first message, set peer_name */
@@ -1810,7 +1811,11 @@ static void ceph_fault(struct ceph_connection *con)
 	clear_bit(BUSY, &con->state);  /* to avoid an improbable race */
 
 	con_close_socket(con);
-	con->in_msg = NULL;
+
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
 
 	/* If there are no messages in the queue, place the connection
 	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
-- 
cgit v1.2.2


From 92ac41d0a4ab26fb68d3f841332e5d1f15d79123 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 14 Dec 2009 14:56:56 -0800
Subject: ceph: detect lossy state of connection

The server indicates whether a connection is lossy; set our LOSSYTX bit
appropriately.  Do not set lossy bit on outgoing connections.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 98519bd33f04..986d8fb9c574 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -625,8 +625,6 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 	con->out_connect.global_seq = cpu_to_le32(global_seq);
 	con->out_connect.protocol_version = cpu_to_le32(proto);
 	con->out_connect.flags = 0;
-	if (test_bit(LOSSYTX, &con->state))
-		con->out_connect.flags = CEPH_MSG_CONNECT_LOSSY;
 
 	if (!after_banner) {
 		con->out_kvec_left = 0;
@@ -1168,6 +1166,10 @@ static int process_connect(struct ceph_connection *con)
 		     con->connect_seq);
 		WARN_ON(con->connect_seq !=
 			le32_to_cpu(con->in_reply.connect_seq));
+
+		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+			set_bit(LOSSYTX, &con->state);
+
 		prepare_read_tag(con);
 		break;
 
-- 
cgit v1.2.2


From b3d1dbbdd5670d8a9fb01f7dfb1cac522ff6795a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 14 Dec 2009 14:58:11 -0800
Subject: ceph: don't save sent messages on lossy connections

For lossy connections we drop all state on socket errors, so there is no
reason to keep sent ceph_msg's around.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 986d8fb9c574..d5eef76a253c 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -453,12 +453,16 @@ static void prepare_write_message(struct ceph_connection *con)
 		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
 	}
 
-	/* move message to sending/sent list */
 	m = list_first_entry(&con->out_queue,
 		       struct ceph_msg, list_head);
 	con->out_msg = m;
-	ceph_msg_get(m);
-	list_move_tail(&m->list_head, &con->out_sent);
+	if (test_bit(LOSSYTX, &con->state)) {
+		/* put message on sent list */
+		ceph_msg_get(m);
+		list_move_tail(&m->list_head, &con->out_sent);
+	} else {
+		list_del_init(&m->list_head);
+	}
 
 	m->hdr.seq = cpu_to_le64(++con->out_seq);
 
-- 
cgit v1.2.2


From 93c20d98c29ccefa039c3843ccc37122caaf3d31 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 15 Dec 2009 09:50:36 -0800
Subject: ceph: fix msgpool reservation leak

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/osd_client.c | 5 ++++-
 fs/ceph/osd_client.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 67ef8ab06af4..63482ef3de01 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -145,6 +145,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		ceph_osdc_put_request(req);
 		return ERR_PTR(-ENOMEM);
 	}
+	req->r_num_prealloc_reply = num_reply;
 
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
@@ -165,7 +166,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
 	if (IS_ERR(msg)) {
-		ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
+		ceph_msgpool_resv(&osdc->msgpool_op_reply, -num_reply);
 		ceph_osdc_put_request(req);
 		return ERR_PTR(PTR_ERR(msg));
 	}
@@ -465,6 +466,8 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 	rb_erase(&req->r_node, &osdc->requests);
 	osdc->num_requests--;
 
+	ceph_msgpool_resv(&osdc->msgpool_op_reply, -req->r_num_prealloc_reply);
+
 	if (req->r_osd) {
 		/* make sure the original request isn't in flight. */
 		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 20ee61847416..2e4cfd1e9f10 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -48,6 +48,7 @@ struct ceph_osd_request {
 	int               r_flags;     /* any additional flags for the osd */
 	u32               r_sent;      /* >0 if r_request is sending/sent */
 	int r_prepared_pages, r_got_reply;
+	int		  r_num_prealloc_reply;
 
 	struct ceph_osd_client *r_osdc;
 	struct kref       r_kref;
-- 
cgit v1.2.2


From 9ec7cab14e6de732d4e7c355fe67c5810c32c758 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 14 Dec 2009 15:13:47 -0800
Subject: ceph: hex dump corrupt server data to KERN_DEBUG

Also, print fsid using standard format, NOT hex dump.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       |  1 +
 fs/ceph/mds_client.c |  4 ++++
 fs/ceph/mdsmap.c     |  4 ++++
 fs/ceph/messenger.c  | 20 ++++++++++++++++++++
 fs/ceph/messenger.h  |  2 ++
 fs/ceph/mon_client.c |  2 ++
 fs/ceph/osd_client.c |  2 ++
 fs/ceph/osdmap.c     |  3 +++
 fs/ceph/snap.c       |  1 +
 fs/ceph/super.c      |  9 ++-------
 10 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9b9ce143ac1f..dfb509f53542 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2681,6 +2681,7 @@ done:
 
 bad:
 	pr_err("ceph_handle_caps: corrupt message\n");
+	ceph_msg_dump(msg);
 	return;
 }
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 739093f281d0..29a93fe35f85 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1650,6 +1650,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		return;
 	if (msg->front.iov_len < sizeof(*head)) {
 		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+		ceph_msg_dump(msg);
 		return;
 	}
 
@@ -1740,6 +1741,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	mutex_lock(&session->s_mutex);
 	if (err < 0) {
 		pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+		ceph_msg_dump(msg);
 		goto out_err;
 	}
 
@@ -1929,6 +1931,7 @@ static void handle_session(struct ceph_mds_session *session,
 bad:
 	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
 	       (int)msg->front.iov_len);
+	ceph_msg_dump(msg);
 	return;
 }
 
@@ -2394,6 +2397,7 @@ out:
 
 bad:
 	pr_err("corrupt lease message\n");
+	ceph_msg_dump(msg);
 }
 
 void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index cad8d25861e5..c4c498e6dfef 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -49,6 +49,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
 struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 {
 	struct ceph_mdsmap *m;
+	const void *start = *p;
 	int i, j, n;
 	int err = -EINVAL;
 	u16 version;
@@ -154,6 +155,9 @@ badmem:
 	err = -ENOMEM;
 bad:
 	pr_err("corrupt mdsmap\n");
+	print_hex_dump(KERN_DEBUG, "mdsmap: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       start, end - start, true);
 	ceph_mdsmap_destroy(m);
 	return ERR_PTR(-EINVAL);
 }
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index d5eef76a253c..b10f88c56706 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -2115,3 +2115,23 @@ void ceph_msg_last_put(struct kref *kref)
 	else
 		ceph_msg_kfree(m);
 }
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+		 msg->front_max, msg->nr_pages);
+	print_hex_dump(KERN_DEBUG, "header: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->hdr, sizeof(msg->hdr), true);
+	print_hex_dump(KERN_DEBUG, " front: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       msg->front.iov_base, msg->front.iov_len, true);
+	if (msg->middle)
+		print_hex_dump(KERN_DEBUG, "middle: ",
+			       DUMP_PREFIX_OFFSET, 16, 1,
+			       msg->middle->vec.iov_base,
+			       msg->middle->vec.iov_len, true);
+	print_hex_dump(KERN_DEBUG, "footer: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->footer, sizeof(msg->footer), true);
+}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index eff5cb5197fc..e04c214b4f6f 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -254,4 +254,6 @@ static inline void ceph_msg_put(struct ceph_msg *msg)
 	kref_put(&msg->kref, ceph_msg_last_put);
 }
 
+extern void ceph_msg_dump(struct ceph_msg *msg);
+
 #endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index a76da5e6dbdd..775a9c029c51 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -242,6 +242,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 	return;
 bad:
 	pr_err("got corrupt subscribe-ack msg\n");
+	ceph_msg_dump(msg);
 }
 
 /*
@@ -364,6 +365,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 
 bad:
 	pr_err("corrupt statfs reply, no tid\n");
+	ceph_msg_dump(msg);
 }
 
 /*
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 63482ef3de01..4bfe880d53c8 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -773,6 +773,7 @@ bad:
 	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
 	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
 	       (int)sizeof(*rhead));
+	ceph_msg_dump(msg);
 }
 
 
@@ -964,6 +965,7 @@ done:
 
 bad:
 	pr_err("osdc handle_map corrupt msg\n");
+	ceph_msg_dump(msg);
 	up_write(&osdc->map_sem);
 	return;
 }
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index be5318aa7714..8c8ffe5ef7d4 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -726,6 +726,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 bad:
 	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
 	       epoch, (int)(*p - start), *p, start, end);
+	print_hex_dump(KERN_DEBUG, "osdmap: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       start, end - start, true);
 	if (newcrush)
 		crush_destroy(newcrush);
 	return ERR_PTR(err);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 52f46a1208f5..dcf18d92130a 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -877,6 +877,7 @@ split_skip_inode:
 
 bad:
 	pr_err("corrupt snap message from mds%d\n", mds);
+	ceph_msg_dump(msg);
 out:
 	if (locked_rwsem)
 		up_write(&mdsc->snap_rwsem);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a828943296c5..6d02a166f8ff 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -602,13 +602,8 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 {
 	if (client->have_fsid) {
 		if (ceph_fsid_compare(&client->fsid, fsid)) {
-			print_hex_dump(KERN_ERR, "this fsid: ",
-				       DUMP_PREFIX_NONE, 16, 1,
-				       (void *)fsid, 16, 0);
-			print_hex_dump(KERN_ERR, " old fsid: ",
-				       DUMP_PREFIX_NONE, 16, 1,
-				       (void *)&client->fsid, 16, 0);
-			pr_err("fsid mismatch\n");
+			pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
+			       PR_FSID(&client->fsid), PR_FSID(fsid));
 			return -1;
 		}
 	} else {
-- 
cgit v1.2.2


From cf3e5c409b5d66ec66207092a3f7e3e2c42c0f3f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 11 Dec 2009 09:48:05 -0800
Subject: ceph: plug leak of incoming message during connection fault/close

If we explicitly close a connection, or there is a socket error, we need
to drop any partially received message.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index b10f88c56706..b12604ef1846 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -320,6 +320,11 @@ static void reset_connection(struct ceph_connection *con)
 	ceph_msg_remove_list(&con->out_queue);
 	ceph_msg_remove_list(&con->out_sent);
 
+	if (con->in_msg) {
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+
 	con->connect_seq = 0;
 	con->out_seq = 0;
 	if (con->out_msg) {
@@ -1288,7 +1293,7 @@ static int read_partial_message(struct ceph_connection *con)
 		con->in_msg = con->ops->alloc_msg(con, &con->in_hdr);
 		if (!con->in_msg) {
 			/* skip this message */
-			dout("alloc_msg returned NULL, skipping message\n");
+			pr_err("alloc_msg returned NULL, skipping message\n");
 			con->in_base_pos = -front_len - middle_len - data_len -
 				sizeof(m->footer);
 			con->in_tag = CEPH_MSGR_TAG_READY;
@@ -1327,7 +1332,7 @@ static int read_partial_message(struct ceph_connection *con)
 			if (con->ops->alloc_middle)
 				ret = con->ops->alloc_middle(con, m);
 			if (ret < 0) {
-				dout("alloc_middle failed, skipping payload\n");
+				pr_err("alloc_middle fail skipping payload\n");
 				con->in_base_pos = -middle_len - data_len
 					- sizeof(m->footer);
 				ceph_msg_put(con->in_msg);
@@ -1498,6 +1503,7 @@ more:
 		set_bit(CONNECTING, &con->state);
 		clear_bit(NEGOTIATING, &con->state);
 
+		BUG_ON(con->in_msg);
 		con->in_tag = CEPH_MSGR_TAG_READY;
 		dout("try_write initiating connect on %p new state %lu\n",
 		     con, con->state);
-- 
cgit v1.2.2


From e2885f06ce31d82b556be021acfa2eba160f29cc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 15 Dec 2009 10:27:48 -0800
Subject: ceph: make mds ops interruptible

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 29a93fe35f85..d7cecc3366da 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1597,14 +1597,17 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	if (!req->r_reply) {
 		mutex_unlock(&mdsc->mutex);
 		if (req->r_timeout) {
-			err = wait_for_completion_timeout(&req->r_completion,
-							  req->r_timeout);
-			if (err > 0)
-				err = 0;
-			else if (err == 0)
+			err = (long)wait_for_completion_interruptible_timeout(
+				&req->r_completion, req->r_timeout);
+			if (err == 0)
 				req->r_reply = ERR_PTR(-EIO);
+			else if (err < 0)
+				req->r_reply = ERR_PTR(err);
 		} else {
-			wait_for_completion(&req->r_completion);
+                        err = wait_for_completion_interruptible(
+                                &req->r_completion);
+                        if (err)
+                                req->r_reply = ERR_PTR(err);
 		}
 		mutex_lock(&mdsc->mutex);
 	}
-- 
cgit v1.2.2


From 06edf046dd68ccbc7cf5f70f957a31702d0e7596 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 15 Dec 2009 14:44:32 -0800
Subject: ceph: include link to bdi in debugfs

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c | 7 +++++++
 fs/ceph/super.h   | 1 +
 2 files changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index b90fc3e1ff70..441484ab7e94 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,5 +1,6 @@
 #include "ceph_debug.h"
 
+#include <linux/device.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
@@ -24,6 +25,7 @@
  *      .../monc        - mon client state
  *      .../dentry_lru  - dump contents of dentry lru
  *      .../caps        - expose cap (reservation) stats
+ *      .../bdi         - symlink to ../../bdi/something
  */
 
 static struct dentry *ceph_debugfs_dir;
@@ -407,6 +409,10 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 	if (!client->debugfs_caps)
 		goto out;
 
+	sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
+	client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
+						     name);
+
 	return 0;
 
 out:
@@ -416,6 +422,7 @@ out:
 
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
 {
+	debugfs_remove(client->debugfs_bdi);
 	debugfs_remove(client->debugfs_caps);
 	debugfs_remove(client->debugfs_dentry_lru);
 	debugfs_remove(client->debugfs_osdmap);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index de5e32414978..2304bd2844a4 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -143,6 +143,7 @@ struct ceph_client {
 	struct dentry *debugfs_monmap;
 	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
 	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+	struct dentry *debugfs_bdi;
 #endif
 };
 
-- 
cgit v1.2.2


From 169e16ce816ca417286daf1db25de424a9d65a0c Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 16 Dec 2009 14:22:17 -0800
Subject: ceph: remove unaccessible code

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/messenger.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index b12604ef1846..2e4e9773c46b 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1531,10 +1531,6 @@ more_kvec:
 		ret = write_partial_kvec(con);
 		if (ret <= 0)
 			goto done;
-		if (ret < 0) {
-			dout("try_write write_partial_kvec err %d\n", ret);
-			goto done;
-		}
 	}
 
 	/* msg pages? */
-- 
cgit v1.2.2


From dbd646a851713bec5bfff40ecf624b2e78518fe5 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 16 Dec 2009 14:51:06 -0800
Subject: ceph: writepage grabs and releases inode

Fixes a deadlock that is triggered due to kswapd,
while the page was locked and the iput couldn't tear
down the address space.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/addr.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index bf535815592d..d0cdceb0b90b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -448,8 +448,13 @@ out:
 
 static int ceph_writepage(struct page *page, struct writeback_control *wbc)
 {
-	int err = writepage_nounlock(page, wbc);
+	int err;
+	struct inode *inode = page->mapping->host;
+	BUG_ON(!inode);
+	igrab(inode);
+	err = writepage_nounlock(page, wbc);
 	unlock_page(page);
+	iput(inode);
 	return err;
 }
 
-- 
cgit v1.2.2


From 2baba25019ec564cd247af74013873d69a0b8190 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 18 Dec 2009 13:51:57 -0800
Subject: ceph: writeback congestion control

Set bdi congestion bit when amount of write data in flight exceeds adjustable
threshold.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c    | 35 +++++++++++++++++++++++++++++++++--
 fs/ceph/debugfs.c | 33 +++++++++++++++++++++++++++++++++
 fs/ceph/super.c   | 36 ++++++++++++++++++++++++++++++++++++
 fs/ceph/super.h   |  3 +++
 4 files changed, 105 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d0cdceb0b90b..a6850a14038e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -47,6 +47,12 @@
  * accounting is preserved.
  */
 
+#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb)				\
+	(CONGESTION_ON_THRESH(congestion_kb) -				\
+	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+
 
 /*
  * Dirty a page.  Optimistically adjust accounting, on the assumption
@@ -377,6 +383,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
+	struct ceph_client *client;
 	struct ceph_osd_client *osdc;
 	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
 	int len = PAGE_CACHE_SIZE;
@@ -384,6 +391,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	int err = 0;
 	struct ceph_snap_context *snapc;
 	u64 snap_size = 0;
+	long writeback_stat;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
@@ -393,7 +401,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	}
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
-	osdc = &ceph_inode_to_client(inode)->osdc;
+	client = ceph_inode_to_client(inode);
+	osdc = &client->osdc;
 
 	/* verify this is a writeable snap context */
 	snapc = (void *)page->private;
@@ -420,6 +429,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	dout("writepage %p page %p index %lu on %llu~%u\n",
 	     inode, page, page->index, page_off, len);
 
+	writeback_stat = atomic_long_inc_return(&client->writeback_count);
+	if (writeback_stat >
+	    CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+		set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+
 	set_page_writeback(page);
 	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
 				   &ci->i_layout, snapc,
@@ -499,6 +513,8 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct writeback_control *wbc = req->r_wbc;
 	__s32 rc = -EIO;
 	u64 bytes = 0;
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	long writeback_stat;
 
 	/* parse reply */
 	replyhead = msg->front.iov_base;
@@ -524,6 +540,13 @@ static void writepages_finish(struct ceph_osd_request *req,
 		BUG_ON(!page);
 		WARN_ON(!PageUptodate(page));
 
+		writeback_stat =
+			atomic_long_dec_return(&client->writeback_count);
+		if (writeback_stat <
+		    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+			clear_bdi_congested(&client->backing_dev_info,
+					    BLK_RW_ASYNC);
+
 		if (i >= wrote) {
 			dout("inode %p skipping page %p\n", inode, page);
 			wbc->pages_skipped++;
@@ -666,6 +689,7 @@ retry:
 		u64 offset, len;
 		struct ceph_osd_request_head *reqhead;
 		struct ceph_osd_op *op;
+		long writeback_stat;
 
 		next = 0;
 		locked_pages = 0;
@@ -773,6 +797,12 @@ get_more_pages:
 				first = i;
 			dout("%p will write page %p idx %lu\n",
 			     inode, page, page->index);
+
+			writeback_stat = atomic_long_inc_return(&client->writeback_count);
+			if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+				set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+			}
+
 			set_page_writeback(page);
 			req->r_pages[locked_pages] = page;
 			locked_pages++;
@@ -998,7 +1028,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = &client->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;
 
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 441484ab7e94..22d3b47fb1be 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -320,6 +320,30 @@ DEFINE_SHOW_FUNC(osdc_show)
 DEFINE_SHOW_FUNC(dentry_lru_show)
 DEFINE_SHOW_FUNC(caps_show)
 
+static int congestion_kb_set(void *data, u64 val)
+{
+	struct ceph_client *client = (struct ceph_client *)data;
+
+	if (client)
+		client->mount_args->congestion_kb = (int)val;
+
+	return 0;
+}
+
+static int congestion_kb_get(void *data, u64 *val)
+{
+	struct ceph_client *client = (struct ceph_client *)data;
+
+	if (client)
+		*val = (u64)client->mount_args->congestion_kb;
+
+	return 0;
+}
+
+
+DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
+			congestion_kb_set, "%llu\n");
+
 int __init ceph_debugfs_init(void)
 {
 	ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
@@ -409,6 +433,14 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 	if (!client->debugfs_caps)
 		goto out;
 
+	client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
+						   0600,
+						   client->debugfs_dir,
+						   client,
+						   &congestion_kb_fops);
+	if (!client->debugfs_congestion_kb)
+		goto out;
+
 	sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
 	client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
 						     name);
@@ -431,6 +463,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
 	debugfs_remove(client->osdc.debugfs_file);
 	debugfs_remove(client->mdsc.debugfs_file);
 	debugfs_remove(client->monc.debugfs_file);
+	debugfs_remove(client->debugfs_congestion_kb);
 	debugfs_remove(client->debugfs_dir);
 }
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6d02a166f8ff..b9cb8cebcdc1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -150,6 +150,35 @@ static void ceph_inode_init_once(void *foo)
 	inode_init_once(&ci->vfs_inode);
 }
 
+static int default_congestion_kb(void)
+{
+	int congestion_kb;
+
+	/*
+	 * Copied from NFS
+	 *
+	 * congestion size, scale with available memory.
+	 *
+	 *  64MB:    8192k
+	 * 128MB:   11585k
+	 * 256MB:   16384k
+	 * 512MB:   23170k
+	 *   1GB:   32768k
+	 *   2GB:   46340k
+	 *   4GB:   65536k
+	 *   8GB:   92681k
+	 *  16GB:  131072k
+	 *
+	 * This allows larger machines to have larger/more transfers.
+	 * Limit the default to 256M
+	 */
+	congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+	if (congestion_kb > 256*1024)
+		congestion_kb = 256*1024;
+
+	return congestion_kb;
+}
+
 static int __init init_caches(void)
 {
 	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -267,6 +296,7 @@ enum {
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_readdir_max_entries,
+	Opt_congestion_kb,
 	Opt_last_int,
 	/* int args above */
 	Opt_snapdirname,
@@ -295,6 +325,7 @@ static match_table_t arg_tokens = {
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
+	{Opt_congestion_kb, "write_congestion_kb=%d"},
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
 	{Opt_name, "name=%s"},
@@ -342,6 +373,7 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 	args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 	args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
 	args->max_readdir = 1024;
+	args->congestion_kb = default_congestion_kb();
 
 	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
 	err = -EINVAL;
@@ -445,6 +477,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 		case Opt_readdir_max_entries:
 			args->max_readdir = intval;
 			break;
+		case Opt_congestion_kb:
+			args->congestion_kb = intval;
+			break;
 
 		case Opt_noshare:
 			args->flags |= CEPH_OPT_NOSHARE;
@@ -516,6 +551,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 	client->msgr = NULL;
 
 	client->mount_err = 0;
+	atomic_long_set(&client->writeback_count, 0);
 
 	err = bdi_init(&client->backing_dev_info);
 	if (err < 0)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2304bd2844a4..62d9ae482d72 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -59,6 +59,7 @@ struct ceph_mount_args {
 	int wsize;
 	int rsize;            /* max readahead */
 	int max_readdir;      /* max readdir size */
+	int congestion_kb;      /* max readdir size */
 	int osd_timeout;
 	char *snapdir_name;   /* default ".snap" */
 	char *name;
@@ -136,6 +137,7 @@ struct ceph_client {
 	struct workqueue_struct *wb_wq;
 	struct workqueue_struct *pg_inv_wq;
 	struct workqueue_struct *trunc_wq;
+	atomic_long_t writeback_count;
 
 	struct backing_dev_info backing_dev_info;
 
@@ -143,6 +145,7 @@ struct ceph_client {
 	struct dentry *debugfs_monmap;
 	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
 	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+	struct dentry *debugfs_congestion_kb;
 	struct dentry *debugfs_bdi;
 #endif
 };
-- 
cgit v1.2.2


From c4a29f26d50bea65809ca670992108a33aa2efa6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Dec 2009 11:42:18 -0800
Subject: ceph: ensure rename target dentry fails revalidation

This works around a bug in vfs_rename_dir() that rehashes the target
dentry.  Ensure such dentries always fail revalidation by timing out the
dentry lease and kicking it out of the current directory lease gen.

This can be reverted when the vfs bug is fixed.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index db684686f48a..8774b2811597 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -939,6 +939,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			     req->r_old_dentry->d_name.len,
 			     req->r_old_dentry->d_name.name,
 			     dn, dn->d_name.len, dn->d_name.name);
+			/* ensure target dentry is invalidated, despite
+			   rehashing bug in vfs_rename_dir */
+			dn->d_time = jiffies;
+			ceph_dentry(dn)->lease_shared_gen = 0;
 			/* take overwritten dentry's readdir offset */
 			ceph_dentry(req->r_old_dentry)->offset =
 				ceph_dentry(dn)->offset;
-- 
cgit v1.2.2


From 5de7bf8afa87f75af5ef3d6f9fce3e171cac834c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Dec 2009 11:48:44 -0800
Subject: ceph: do not drop lease during revalidate

We need to hold session s_mutex for __ceph_mdsc_drop_dentry_lease(), which
we don't, so skip it.  It was purely an optimization.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 89ce3ba4a614..fde839c61236 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -920,8 +920,6 @@ static int dentry_lease_is_valid(struct dentry *dentry)
 				di->lease_renew_after = 0;
 				di->lease_renew_from = jiffies;
 			}
-		} else {
-			__ceph_mdsc_drop_dentry_lease(dentry);
 		}
 	}
 	spin_unlock(&dentry->d_lock);
-- 
cgit v1.2.2


From 30dc6381bbac213987be6fe0b0fb89868ff1f2c0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Dec 2009 14:49:37 -0800
Subject: ceph: fix error paths for corrupt osdmap messages

Both osdmap_decode() and osdmap_apply_incremental() should never return
NULL.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c |  2 ++
 fs/ceph/osdmap.c     | 11 ++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 4bfe880d53c8..b474b3ad61f0 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -910,6 +910,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 				err = PTR_ERR(newmap);
 				goto bad;
 			}
+			BUG_ON(!newmap);
 			if (newmap != osdc->osdmap) {
 				ceph_osdmap_destroy(osdc->osdmap);
 				osdc->osdmap = newmap;
@@ -946,6 +947,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 				err = PTR_ERR(newmap);
 				goto bad;
 			}
+			BUG_ON(!newmap);
 			oldmap = osdc->osdmap;
 			osdc->osdmap = newmap;
 			if (oldmap)
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 8c8ffe5ef7d4..a9416308de6f 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -200,6 +200,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 			size = sizeof(struct crush_bucket_straw);
 			break;
 		default:
+			err = -EINVAL;
 			goto bad;
 		}
 		BUG_ON(size == 0);
@@ -278,6 +279,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		/* len */
 		ceph_decode_32_safe(p, end, yes, bad);
 #if BITS_PER_LONG == 32
+		err = -EINVAL;
 		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
 			goto bad;
 #endif
@@ -489,11 +491,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		ceph_decode_copy(p, &pgid, sizeof(pgid));
 		n = ceph_decode_32(p);
 		ceph_decode_need(p, end, n * sizeof(u32), bad);
+		err = -ENOMEM;
 		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
-		if (!pg) {
-			err = -ENOMEM;
+		if (!pg)
 			goto bad;
-		}
 		pg->pgid = pgid;
 		pg->len = n;
 		for (j = 0; j < n; j++)
@@ -564,8 +565,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	if (len > 0) {
 		dout("apply_incremental full map len %d, %p to %p\n",
 		     len, *p, end);
-		newmap = osdmap_decode(p, min(*p+len, end));
-		return newmap;  /* error or not */
+		return osdmap_decode(p, min(*p+len, end));
 	}
 
 	/* new crush? */
@@ -809,6 +809,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 	struct ceph_pg_pool_info *pool;
 	unsigned ps;
 
+	BUG_ON(!osdmap);
 	if (poolid >= osdmap->num_pools)
 		return -EIO;
 
-- 
cgit v1.2.2


From 7067f797b8409f1e10ec95ac2c1e17a200173d13 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Dec 2009 16:02:37 -0800
Subject: ceph: fix incremental osdmap pg_temp decoding bug

An incremental pg_temp wasn't being decoded properly (wrong bound on
for loop).

Also remove unused local variable, while we're at it.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index a9416308de6f..0dbd606e21c4 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -538,7 +538,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 					     struct ceph_osdmap *map,
 					     struct ceph_messenger *msgr)
 {
-	struct ceph_osdmap *newmap = map;
 	struct crush_map *newcrush = NULL;
 	struct ceph_fsid fsid;
 	u32 epoch = 0;
@@ -701,7 +700,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			}
 			pg->pgid = pgid;
 			pg->len = pglen;
-			for (j = 0; j < len; j++)
+			for (j = 0; j < pglen; j++)
 				pg->osds[j] = ceph_decode_32(p);
 			err = __insert_pg_mapping(pg, &map->pg_temp);
 			if (err)
-- 
cgit v1.2.2


From 5dacf09121ffb2e5fc7d15b78cae0b77042a1935 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Dec 2009 20:40:34 -0800
Subject: ceph: do not touch_caps while iterating over caps list

Avoid confusing iterate_session_caps(), flag the session while we are
iterating so that __touch_cap does not rearrange items on the list.

All other modifiers of session->s_caps do so under the protection of
s_mutex.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 11 ++++++++---
 fs/ceph/mds_client.c | 14 ++++++++++----
 fs/ceph/mds_client.h |  1 +
 3 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index dfb509f53542..93c1afe3f0b3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -697,10 +697,15 @@ static void __touch_cap(struct ceph_cap *cap)
 {
 	struct ceph_mds_session *s = cap->session;
 
-	dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
-	     s->s_mds);
 	spin_lock(&s->s_cap_lock);
-	list_move_tail(&cap->session_caps, &s->s_caps);
+	if (!s->s_iterating_caps) {
+		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+		     s->s_mds);
+		list_move_tail(&cap->session_caps, &s->s_caps);
+	} else {
+		dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
+		     &cap->ci->vfs_inode, cap, s->s_mds);
+	}
 	spin_unlock(&s->s_cap_lock);
 }
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d7cecc3366da..63ca3b1ad45f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -337,10 +337,12 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_renew_seq = 0;
 	INIT_LIST_HEAD(&s->s_caps);
 	s->s_nr_caps = 0;
+	s->s_trim_caps = 0;
 	atomic_set(&s->s_ref, 1);
 	INIT_LIST_HEAD(&s->s_waiting);
 	INIT_LIST_HEAD(&s->s_unsafe);
 	s->s_num_cap_releases = 0;
+	s->s_iterating_caps = false;
 	INIT_LIST_HEAD(&s->s_cap_releases);
 	INIT_LIST_HEAD(&s->s_cap_releases_done);
 	INIT_LIST_HEAD(&s->s_cap_flushing);
@@ -699,6 +701,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
 
 	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
 	spin_lock(&session->s_cap_lock);
+	session->s_iterating_caps = true;
 	list_for_each_entry_safe(cap, ncap, &session->s_caps, session_caps) {
 		inode = igrab(&cap->ci->vfs_inode);
 		if (!inode)
@@ -706,13 +709,15 @@ static int iterate_session_caps(struct ceph_mds_session *session,
 		spin_unlock(&session->s_cap_lock);
 		ret = cb(inode, cap, arg);
 		iput(inode);
-		if (ret < 0)
-			return ret;
 		spin_lock(&session->s_cap_lock);
+		if (ret < 0)
+			goto out;
 	}
+	ret = 0;
+out:
+	session->s_iterating_caps = false;
 	spin_unlock(&session->s_cap_lock);
-
-	return 0;
+	return ret;
 }
 
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
@@ -935,6 +940,7 @@ static int trim_caps(struct ceph_mds_client *mdsc,
 		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
 		     session->s_mds, session->s_nr_caps, max_caps,
 			trim_caps - session->s_trim_caps);
+		session->s_trim_caps = 0;
 	}
 	return 0;
 }
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 41af5ca316e6..b1c2025227c5 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -114,6 +114,7 @@ struct ceph_mds_session {
 	int               s_num_cap_releases;
 	struct list_head  s_cap_releases; /* waiting cap_release messages */
 	struct list_head  s_cap_releases_done; /* ready to send */
+	bool              s_iterating_caps;
 
 	/* protected by mutex */
 	struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
-- 
cgit v1.2.2


From e0e3271074e1ebd0b80a912a457ce03c971bcd66 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Dec 2009 21:04:26 -0800
Subject: ceph: only unregister registered bdi

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b9cb8cebcdc1..cd81c84e96fc 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -942,7 +942,8 @@ static void ceph_kill_sb(struct super_block *s)
 	dout("kill_sb %p\n", s);
 	ceph_mdsc_pre_umount(&client->mdsc);
 	kill_anon_super(s);    /* will call put_super after sb is r/o */
-	bdi_unregister(&client->backing_dev_info);
+	if (s->s_bdi == &client->backing_dev_info)
+		bdi_unregister(&client->backing_dev_info);
 	bdi_destroy(&client->backing_dev_info);
 	ceph_destroy_client(client);
 }
-- 
cgit v1.2.2


From 529cfcc46ffa2cbe4d07641c11e65f67fe7b66e4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 22 Dec 2009 10:29:39 -0800
Subject: ceph: unregister canceled/timed out osd requests

Canceled or timed out osd requests were getting left in the request list
and never deallocated (until umount).  Unregister if they are canceled
(control-c) or time out.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index b474b3ad61f0..a1800fb63237 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1071,8 +1071,9 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
 	if (rc < 0) {
 		mutex_lock(&osdc->request_mutex);
 		__cancel_request(req);
+		__unregister_request(osdc, req);
 		mutex_unlock(&osdc->request_mutex);
-		dout("wait_request tid %llu timed out\n", req->r_tid);
+		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
 		return rc;
 	}
 
-- 
cgit v1.2.2


From ec302645f4a9bd9ec757c30d185557e1c0972c1a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 22 Dec 2009 10:43:42 -0800
Subject: ceph: use connection mutex to protect read and write stages

Use a single mutex (previously out_mutex) to protect both read and write
activity from concurrent ceph_con_* calls.  Drop the mutex when doing
callbacks to avoid nested locking (the callback may need to call something
like ceph_con_close).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 49 +++++++++++++++++++++++++++++++------------------
 fs/ceph/messenger.h |  3 ++-
 2 files changed, 33 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 2e4e9773c46b..c03b4185c143 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -316,7 +316,6 @@ static void reset_connection(struct ceph_connection *con)
 {
 	/* reset connection, out_queue, msg_ and connect_seq */
 	/* discard existing out_queue and msg_seq */
-	mutex_lock(&con->out_mutex);
 	ceph_msg_remove_list(&con->out_queue);
 	ceph_msg_remove_list(&con->out_sent);
 
@@ -332,7 +331,6 @@ static void reset_connection(struct ceph_connection *con)
 		con->out_msg = NULL;
 	}
 	con->in_seq = 0;
-	mutex_unlock(&con->out_mutex);
 }
 
 /*
@@ -343,7 +341,9 @@ void ceph_con_close(struct ceph_connection *con)
 	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
 	set_bit(CLOSED, &con->state);  /* in case there's queued work */
 	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+	mutex_lock(&con->mutex);
 	reset_connection(con);
+	mutex_unlock(&con->mutex);
 	queue_con(con);
 }
 
@@ -392,7 +392,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
 	memset(con, 0, sizeof(*con));
 	atomic_set(&con->nref, 1);
 	con->msgr = msgr;
-	mutex_init(&con->out_mutex);
+	mutex_init(&con->mutex);
 	INIT_LIST_HEAD(&con->out_queue);
 	INIT_LIST_HEAD(&con->out_sent);
 	INIT_DELAYED_WORK(&con->work, con_work);
@@ -571,11 +571,13 @@ static void prepare_connect_authorizer(struct ceph_connection *con)
 	int auth_len = 0;
 	int auth_protocol = 0;
 
+	mutex_unlock(&con->mutex);
 	if (con->ops->get_authorizer)
 		con->ops->get_authorizer(con, &auth_buf, &auth_len,
 					 &auth_protocol, &con->auth_reply_buf,
 					 &con->auth_reply_buf_len,
 					 con->auth_retry);
+	mutex_lock(&con->mutex);
 
 	con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
 	con->out_connect.authorizer_len = cpu_to_le32(auth_len);
@@ -1094,10 +1096,13 @@ static int process_connect(struct ceph_connection *con)
 		       le32_to_cpu(con->out_connect.protocol_version),
 		       le32_to_cpu(con->in_reply.protocol_version));
 		con->error_msg = "protocol version mismatch";
-		if (con->ops->bad_proto)
-			con->ops->bad_proto(con);
 		reset_connection(con);
 		set_bit(CLOSED, &con->state);  /* in case there's queued work */
+
+		mutex_unlock(&con->mutex);
+		if (con->ops->bad_proto)
+			con->ops->bad_proto(con);
+		mutex_lock(&con->mutex);
 		return -1;
 
 	case CEPH_MSGR_TAG_BADAUTHORIZER:
@@ -1133,9 +1138,11 @@ static int process_connect(struct ceph_connection *con)
 		prepare_read_connect(con);
 
 		/* Tell ceph about it. */
+		mutex_unlock(&con->mutex);
 		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
 		if (con->ops->peer_reset)
 			con->ops->peer_reset(con);
+		mutex_lock(&con->mutex);
 		break;
 
 	case CEPH_MSGR_TAG_RETRY_SESSION:
@@ -1221,7 +1228,6 @@ static void process_ack(struct ceph_connection *con)
 	u64 ack = le64_to_cpu(con->in_temp_ack);
 	u64 seq;
 
-	mutex_lock(&con->out_mutex);
 	while (!list_empty(&con->out_sent)) {
 		m = list_first_entry(&con->out_sent, struct ceph_msg,
 				     list_head);
@@ -1232,7 +1238,6 @@ static void process_ack(struct ceph_connection *con)
 		     le16_to_cpu(m->hdr.type), m);
 		ceph_msg_remove(m);
 	}
-	mutex_unlock(&con->out_mutex);
 	prepare_read_tag(con);
 }
 
@@ -1366,8 +1371,10 @@ static int read_partial_message(struct ceph_connection *con)
 		/* find pages for data payload */
 		want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
 		ret = -1;
+		mutex_unlock(&con->mutex);
 		if (con->ops->prepare_pages)
 			ret = con->ops->prepare_pages(con, m, want);
+		mutex_lock(&con->mutex);
 		if (ret < 0) {
 			dout("%p prepare_pages failed, skipping payload\n", m);
 			con->in_base_pos = -data_len - sizeof(m->footer);
@@ -1454,9 +1461,8 @@ static void process_message(struct ceph_connection *con)
 	if (con->peer_name.type == 0)
 		con->peer_name = msg->hdr.src.name;
 
-	mutex_lock(&con->out_mutex);
 	con->in_seq++;
-	mutex_unlock(&con->out_mutex);
+	mutex_unlock(&con->mutex);
 
 	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
 	     msg, le64_to_cpu(msg->hdr.seq),
@@ -1467,6 +1473,8 @@ static void process_message(struct ceph_connection *con)
 	     le32_to_cpu(msg->hdr.data_len),
 	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
 	con->ops->dispatch(con, msg);
+
+	mutex_lock(&con->mutex);
 	prepare_read_tag(con);
 }
 
@@ -1483,7 +1491,7 @@ static int try_write(struct ceph_connection *con)
 	dout("try_write start %p state %lu nref %d\n", con, con->state,
 	     atomic_read(&con->nref));
 
-	mutex_lock(&con->out_mutex);
+	mutex_lock(&con->mutex);
 more:
 	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
 
@@ -1576,7 +1584,7 @@ do_next:
 done:
 	ret = 0;
 out:
-	mutex_unlock(&con->out_mutex);
+	mutex_unlock(&con->mutex);
 	dout("try_write done on %p\n", con);
 	return ret;
 }
@@ -1600,6 +1608,8 @@ static int try_read(struct ceph_connection *con)
 	dout("try_read start on %p\n", con);
 	msgr = con->msgr;
 
+	mutex_lock(&con->mutex);
+
 more:
 	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
 	     con->in_base_pos);
@@ -1693,6 +1703,7 @@ more:
 done:
 	ret = 0;
 out:
+	mutex_unlock(&con->mutex);
 	dout("try_read done on %p\n", con);
 	return ret;
 
@@ -1818,6 +1829,8 @@ static void ceph_fault(struct ceph_connection *con)
 
 	clear_bit(BUSY, &con->state);  /* to avoid an improbable race */
 
+	mutex_lock(&con->mutex);
+
 	con_close_socket(con);
 
 	if (con->in_msg) {
@@ -1827,24 +1840,24 @@ static void ceph_fault(struct ceph_connection *con)
 
 	/* If there are no messages in the queue, place the connection
 	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
-	mutex_lock(&con->out_mutex);
 	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
 		dout("fault setting STANDBY\n");
 		set_bit(STANDBY, &con->state);
-		mutex_unlock(&con->out_mutex);
+		mutex_unlock(&con->mutex);
 		goto out;
 	}
 
 	/* Requeue anything that hasn't been acked, and retry after a
 	 * delay. */
 	list_splice_init(&con->out_sent, &con->out_queue);
-	mutex_unlock(&con->out_mutex);
 
 	if (con->delay == 0)
 		con->delay = BASE_DELAY_INTERVAL;
 	else if (con->delay < MAX_DELAY_INTERVAL)
 		con->delay *= 2;
 
+	mutex_unlock(&con->mutex);
+
 	/* explicitly schedule work to try to reconnect again later. */
 	dout("fault queueing %p delay %lu\n", con, con->delay);
 	con->ops->get(con);
@@ -1920,7 +1933,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	msg->hdr.dst_erank = con->peer_addr.erank;
 
 	/* queue */
-	mutex_lock(&con->out_mutex);
+	mutex_lock(&con->mutex);
 	BUG_ON(!list_empty(&msg->list_head));
 	list_add_tail(&msg->list_head, &con->out_queue);
 	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
@@ -1929,7 +1942,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	     le32_to_cpu(msg->hdr.front_len),
 	     le32_to_cpu(msg->hdr.middle_len),
 	     le32_to_cpu(msg->hdr.data_len));
-	mutex_unlock(&con->out_mutex);
+	mutex_unlock(&con->mutex);
 
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
@@ -1942,7 +1955,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
  */
 void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 {
-	mutex_lock(&con->out_mutex);
+	mutex_lock(&con->mutex);
 	if (!list_empty(&msg->list_head)) {
 		dout("con_revoke %p msg %p\n", con, msg);
 		list_del_init(&msg->list_head);
@@ -1959,7 +1972,7 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 	} else {
 		dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
 	}
-	mutex_unlock(&con->out_mutex);
+	mutex_unlock(&con->mutex);
 }
 
 /*
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index e04c214b4f6f..94b55de90331 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -155,8 +155,9 @@ struct ceph_connection {
 	void *auth_reply_buf;   /* where to put the authorizer reply */
 	int auth_reply_buf_len;
 
+	struct mutex mutex;
+
 	/* out queue */
-	struct mutex out_mutex;
 	struct list_head out_queue;
 	struct list_head out_sent;   /* sending or sent but unacked */
 	u64 out_seq;		     /* last message queued for send */
-- 
cgit v1.2.2


From 350b1c32ea58d29e25d63fc25e92dd48f9339546 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 22 Dec 2009 10:45:45 -0800
Subject: ceph: control access to page vector for incoming data

When we issue an OSD read, we specify a vector of pages that the data is to
be read into.  The request may be sent multiple times, to multiple OSDs, if
the osdmap changes, which means we can get more than one reply.

Only read data into the page vector if the reply is coming from the
OSD we last sent the request to.  Keep track of which connection is using
the vector by taking a reference.  If another connection was already
using the vector before and a new reply comes in on the right connection,
revoke the pages from the other connection.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c  | 29 +++++++++++++++++++++++++++++
 fs/ceph/messenger.h  |  2 ++
 fs/ceph/osd_client.c | 42 +++++++++++++++++++++++++++++++++---------
 fs/ceph/osd_client.h |  4 +++-
 4 files changed, 67 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index c03b4185c143..506b638a023b 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1975,6 +1975,35 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 	mutex_unlock(&con->mutex);
 }
 
+/*
+ * Revoke a page vector that we may be reading data into
+ */
+void ceph_con_revoke_pages(struct ceph_connection *con, struct page **pages)
+{
+	mutex_lock(&con->mutex);
+	if (con->in_msg && con->in_msg->pages == pages) {
+		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
+
+		/* skip rest of message */
+		dout("con_revoke_pages %p msg %p pages %p revoked\n", con,
+		     con->in_msg, pages);
+		if (con->in_msg_pos.data_pos < data_len)
+			con->in_base_pos = con->in_msg_pos.data_pos - data_len;
+		else
+			con->in_base_pos = con->in_base_pos -
+				sizeof(struct ceph_msg_header) -
+				sizeof(struct ceph_msg_footer);
+		con->in_msg->pages = NULL;
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+		con->in_tag = CEPH_MSGR_TAG_READY;
+	} else {
+		dout("con_revoke_pages %p msg %p pages %p no-op\n",
+		     con, con->in_msg, pages);
+	}
+	mutex_unlock(&con->mutex);
+}
+
 /*
  * Queue a keepalive byte to ensure the tcp connection is alive.
  */
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 94b55de90331..7e2aab1d3ce2 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -230,6 +230,8 @@ extern void ceph_con_open(struct ceph_connection *con,
 extern void ceph_con_close(struct ceph_connection *con);
 extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
 extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke_pages(struct ceph_connection *con,
+				  struct page **pages);
 extern void ceph_con_keepalive(struct ceph_connection *con);
 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
 extern void ceph_con_put(struct ceph_connection *con);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index a1800fb63237..374f0013956c 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -87,6 +87,13 @@ void ceph_osdc_release_request(struct kref *kref)
 		ceph_msg_put(req->r_request);
 	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
+	if (req->r_con_filling_pages) {
+		dout("release_request revoking pages %p from con %p\n",
+		     req->r_pages, req->r_con_filling_pages);
+		ceph_con_revoke_pages(req->r_con_filling_pages,
+				      req->r_pages);
+		ceph_con_put(req->r_con_filling_pages);
+	}
 	if (req->r_own_pages)
 		ceph_release_page_vector(req->r_pages,
 					 req->r_num_pages);
@@ -687,7 +694,8 @@ static void handle_timeout(struct work_struct *work)
  * handle osd op reply.  either call the callback if it is specified,
  * or do the completion to wake up the waiting thread.
  */
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+			 struct ceph_connection *con)
 {
 	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
 	struct ceph_osd_request *req;
@@ -715,6 +723,16 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	ceph_osdc_get_request(req);
 	flags = le32_to_cpu(rhead->flags);
 
+	/*
+	 * if this connection filled our pages, drop our reference now, to
+	 * avoid a (safe but slower) revoke later.
+	 */
+	if (req->r_con_filling_pages == con && req->r_pages == msg->pages) {
+		dout(" got pages, dropping con_filling_pages ref %p\n", con);
+		req->r_con_filling_pages = NULL;
+		ceph_con_put(con);
+	}
+
 	if (req->r_reply) {
 		/*
 		 * once we see the message has been received, we don't
@@ -1007,14 +1025,20 @@ static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m,
 	}
 	dout("prepare_pages tid %llu has %d pages, want %d\n",
 	     tid, req->r_num_pages, want);
-	if (likely(req->r_num_pages >= want && !req->r_prepared_pages)) {
-		m->pages = req->r_pages;
-		m->nr_pages = req->r_num_pages;
-		req->r_reply = m;  /* only for duration of read over socket */
-		ceph_msg_get(m);
-		req->r_prepared_pages = 1;
-		ret = 0; /* success */
+	if (unlikely(req->r_num_pages < want))
+		goto out;
+
+	if (req->r_con_filling_pages) {
+		dout("revoking pages %p from old con %p\n", req->r_pages,
+		     req->r_con_filling_pages);
+		ceph_con_revoke_pages(req->r_con_filling_pages, req->r_pages);
+		ceph_con_put(req->r_con_filling_pages);
 	}
+	req->r_con_filling_pages = ceph_con_get(con);
+	req->r_reply = ceph_msg_get(m); /* for duration of read over socket */
+	m->pages = req->r_pages;
+	m->nr_pages = req->r_num_pages;
+	ret = 0; /* success */
 out:
 	mutex_unlock(&osdc->request_mutex);
 	return ret;
@@ -1269,7 +1293,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		ceph_osdc_handle_map(osdc, msg);
 		break;
 	case CEPH_MSG_OSD_OPREPLY:
-		handle_reply(osdc, msg);
+		handle_reply(osdc, msg, con);
 		break;
 
 	default:
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 2e4cfd1e9f10..8fef71cc4457 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -43,11 +43,13 @@ struct ceph_osd_request {
 	struct list_head r_osd_item;
 	struct ceph_osd *r_osd;
 
+	struct ceph_connection *r_con_filling_pages;
+
 	struct ceph_msg  *r_request, *r_reply;
 	int               r_result;
 	int               r_flags;     /* any additional flags for the osd */
 	u32               r_sent;      /* >0 if r_request is sending/sent */
-	int r_prepared_pages, r_got_reply;
+	int               r_got_reply;
 	int		  r_num_prealloc_reply;
 
 	struct ceph_osd_client *r_osdc;
-- 
cgit v1.2.2


From 0cf90ab5b075821940873e73cdbfeb8edc3dabe8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 22 Dec 2009 10:45:18 -0800
Subject: ceph: more informative msgpool errors

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/msgpool.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ad5482c0267b..2f04e0fc4666 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -140,7 +140,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
 			return msg;
 		}
 		pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
-		       pool->min, pool->blocking ? "waiting" : "failing");
+		       pool->min, pool->blocking ? "waiting" : "may fail");
 		spin_unlock(&pool->lock);
 
 		if (!pool->blocking) {
@@ -151,6 +151,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
 			if (!IS_ERR(msg))
 				return msg;
 
+			pr_err("msgpool_get %p empty + alloc failed\n", pool);
 			return ERR_PTR(-ENOMEM);
 		}
 
-- 
cgit v1.2.2


From 6df058c025ce343052c5516b1d8a9a7e73cddd64 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 22 Dec 2009 11:24:33 -0800
Subject: ceph: include transaction id in ceph_msg_header (protocol change)

Many (most?) message types include a transaction id.  By including it in
the fixed size header, we always have it available even when we are unable
to allocate memory for the (larger, variable sized) message body.  This
will allow us to error out the appropriate request instead of (silently)
dropping the reply.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 16 ++++++++--------
 fs/ceph/ceph_fs.h    |  8 ++------
 fs/ceph/mds_client.c |  5 +++--
 fs/ceph/mon_client.c |  4 ++--
 fs/ceph/msgr.h       |  3 ++-
 fs/ceph/osd_client.c |  9 +++------
 fs/ceph/rados.h      |  2 --
 7 files changed, 20 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 93c1afe3f0b3..847ae64346fe 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -922,14 +922,14 @@ static int send_cap_msg(struct ceph_mds_session *session,
 	if (IS_ERR(msg))
 		return PTR_ERR(msg);
 
-	fc = msg->front.iov_base;
+	msg->hdr.tid = cpu_to_le64(flush_tid);
 
+	fc = msg->front.iov_base;
 	memset(fc, 0, sizeof(*fc));
 
 	fc->cap_id = cpu_to_le64(cid);
 	fc->op = cpu_to_le32(op);
 	fc->seq = cpu_to_le32(seq);
-	fc->client_tid = cpu_to_le64(flush_tid);
 	fc->issue_seq = cpu_to_le32(issue_seq);
 	fc->migrate_seq = cpu_to_le32(mseq);
 	fc->caps = cpu_to_le32(caps);
@@ -2329,7 +2329,7 @@ restart:
  * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
  * MDS has been safely committed.
  */
-static void handle_cap_flush_ack(struct inode *inode,
+static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 				 struct ceph_mds_caps *m,
 				 struct ceph_mds_session *session,
 				 struct ceph_cap *cap)
@@ -2340,7 +2340,6 @@ static void handle_cap_flush_ack(struct inode *inode,
 	unsigned seq = le32_to_cpu(m->seq);
 	int dirty = le32_to_cpu(m->dirty);
 	int cleaned = 0;
-	u64 flush_tid = le64_to_cpu(m->client_tid);
 	int drop = 0;
 	int i;
 
@@ -2396,13 +2395,12 @@ out:
  *
  * Caller hold s_mutex.
  */
-static void handle_cap_flushsnap_ack(struct inode *inode,
+static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 				     struct ceph_mds_caps *m,
 				     struct ceph_mds_session *session)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 follows = le64_to_cpu(m->snap_follows);
-	u64 flush_tid = le64_to_cpu(m->client_tid);
 	struct ceph_cap_snap *capsnap;
 	int drop = 0;
 
@@ -2587,12 +2585,14 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	struct ceph_vino vino;
 	u64 cap_id;
 	u64 size, max_size;
+	u64 tid;
 	int check_caps = 0;
 	int r;
 
 	dout("handle_caps from mds%d\n", mds);
 
 	/* decode */
+	tid = le64_to_cpu(msg->hdr.tid);
 	if (msg->front.iov_len < sizeof(*h))
 		goto bad;
 	h = msg->front.iov_base;
@@ -2621,7 +2621,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	/* these will work even if we don't have a cap yet */
 	switch (op) {
 	case CEPH_CAP_OP_FLUSHSNAP_ACK:
-		handle_cap_flushsnap_ack(inode, h, session);
+		handle_cap_flushsnap_ack(inode, tid, h, session);
 		goto done;
 
 	case CEPH_CAP_OP_EXPORT:
@@ -2662,7 +2662,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		break;
 
 	case CEPH_CAP_OP_FLUSH_ACK:
-		handle_cap_flush_ack(inode, h, session, cap);
+		handle_cap_flush_ack(inode, tid, h, session, cap);
 		break;
 
 	case CEPH_CAP_OP_TRUNC:
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index e2fd0247827e..e87dfa6ec8e5 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -35,7 +35,7 @@
  * internal cluster protocols separately from the public,
  * client-facing protocol.
  */
-#define CEPH_OSD_PROTOCOL     7 /* cluster internal */
+#define CEPH_OSD_PROTOCOL     8 /* cluster internal */
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   22 /* server/client */
@@ -136,7 +136,6 @@ struct ceph_mon_request_header {
 struct ceph_mon_statfs {
 	struct ceph_mon_request_header monhdr;
 	struct ceph_fsid fsid;
-	__le64 tid;
 } __attribute__ ((packed));
 
 struct ceph_statfs {
@@ -146,7 +145,6 @@ struct ceph_statfs {
 
 struct ceph_mon_statfs_reply {
 	struct ceph_fsid fsid;
-	__le64 tid;
 	__le64 version;
 	struct ceph_statfs st;
 } __attribute__ ((packed));
@@ -333,7 +331,7 @@ union ceph_mds_request_args {
 #define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
 
 struct ceph_mds_request_head {
-	__le64 tid, oldest_client_tid;
+	__le64 oldest_client_tid;
 	__le32 mdsmap_epoch;           /* on client */
 	__le32 flags;                  /* CEPH_MDS_FLAG_* */
 	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
@@ -356,7 +354,6 @@ struct ceph_mds_request_release {
 
 /* client reply */
 struct ceph_mds_reply_head {
-	__le64 tid;
 	__le32 op;
 	__le32 result;
 	__le32 mdsmap_epoch;
@@ -542,7 +539,6 @@ struct ceph_mds_caps {
 	__le32 migrate_seq;
 	__le64 snap_follows;
 	__le32 snap_trace_len;
-	__le64 client_tid;          /* for FLUSH(SNAP) -> FLUSH(SNAP)_ACK */
 
 	/* authlock */
 	__le32 uid, gid, mode;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 63ca3b1ad45f..ec884e2845db 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1339,6 +1339,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	if (IS_ERR(msg))
 		goto out_free2;
 
+	msg->hdr.tid = cpu_to_le64(req->r_tid);
+
 	head = msg->front.iov_base;
 	p = msg->front.iov_base + sizeof(*head);
 	end = msg->front.iov_base + msg->front.iov_len;
@@ -1431,7 +1433,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	req->r_request = msg;
 
 	rhead = msg->front.iov_base;
-	rhead->tid = cpu_to_le64(req->r_tid);
 	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
 	if (req->r_got_unsafe)
 		flags |= CEPH_MDS_FLAG_REPLAY;
@@ -1664,7 +1665,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	}
 
 	/* get request, session */
-	tid = le64_to_cpu(head->tid);
+	tid = le64_to_cpu(msg->hdr.tid);
 	mutex_lock(&mdsc->mutex);
 	req = __lookup_request(mdsc, tid);
 	if (!req) {
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 775a9c029c51..bb94006fc686 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -349,7 +349,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 
 	if (msg->front.iov_len != sizeof(*reply))
 		goto bad;
-	tid = le64_to_cpu(reply->tid);
+	tid = le64_to_cpu(msg->hdr.tid);
 	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
 	mutex_lock(&monc->mutex);
@@ -382,12 +382,12 @@ static int send_statfs(struct ceph_mon_client *monc,
 	if (IS_ERR(msg))
 		return PTR_ERR(msg);
 	req->request = msg;
+	msg->hdr.tid = cpu_to_le64(req->tid);
 	h = msg->front.iov_base;
 	h->monhdr.have_version = 0;
 	h->monhdr.session_mon = cpu_to_le16(-1);
 	h->monhdr.session_mon_tid = 0;
 	h->fsid = monc->monmap->fsid;
-	h->tid = cpu_to_le64(req->tid);
 	ceph_con_send(monc->con, msg);
 	return 0;
 }
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index c758e8f8f71b..e46d8b806dea 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -21,7 +21,7 @@
  * whenever the wire protocol changes.  try to keep this string length
  * constant.
  */
-#define CEPH_BANNER "ceph v024"
+#define CEPH_BANNER "ceph v025"
 #define CEPH_BANNER_MAX_LEN 30
 
 
@@ -132,6 +132,7 @@ struct ceph_msg_connect_reply {
  */
 struct ceph_msg_header {
 	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
 	__le16 type;      /* message type */
 	__le16 priority;  /* priority.  higher value == higher priority */
 	__le16 version;   /* version of message encoding */
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 374f0013956c..a0aac436d5d4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -439,11 +439,9 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
 static void register_request(struct ceph_osd_client *osdc,
 			     struct ceph_osd_request *req)
 {
-	struct ceph_osd_request_head *head = req->r_request->front.iov_base;
-
 	mutex_lock(&osdc->request_mutex);
 	req->r_tid = ++osdc->last_tid;
-	head->tid = cpu_to_le64(req->r_tid);
+	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
 
 	dout("register_request %p tid %lld\n", req, req->r_tid);
 	__insert_request(osdc, req);
@@ -702,9 +700,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	u64 tid;
 	int numops, object_len, flags;
 
+	tid = le64_to_cpu(msg->hdr.tid);
 	if (msg->front.iov_len < sizeof(*rhead))
 		goto bad;
-	tid = le64_to_cpu(rhead->tid);
 	numops = le32_to_cpu(rhead->num_ops);
 	object_len = le32_to_cpu(rhead->object_len);
 	if (msg->front.iov_len != sizeof(*rhead) + object_len +
@@ -1002,7 +1000,6 @@ static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m,
 {
 	struct ceph_osd *osd = con->private;
 	struct ceph_osd_client *osdc;
-	struct ceph_osd_reply_head *rhead = m->front.iov_base;
 	struct ceph_osd_request *req;
 	u64 tid;
 	int ret = -1;
@@ -1016,7 +1013,7 @@ static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m,
 	if (unlikely(type != CEPH_MSG_OSD_OPREPLY))
 		return -1;  /* hmm! */
 
-	tid = le64_to_cpu(rhead->tid);
+	tid = le64_to_cpu(m->hdr.tid);
 	mutex_lock(&osdc->request_mutex);
 	req = __lookup_request(osdc, tid);
 	if (!req) {
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 12bfb2f7c275..c5614d4ae34a 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -331,7 +331,6 @@ struct ceph_osd_op {
  * ceph_osd_op object operations.
  */
 struct ceph_osd_request_head {
-	__le64 tid;                        /* transaction id */
 	__le32 client_inc;                 /* client incarnation */
 	struct ceph_object_layout layout;  /* pgid */
 	__le32 osdmap_epoch;               /* client's osdmap epoch */
@@ -352,7 +351,6 @@ struct ceph_osd_request_head {
 } __attribute__ ((packed));
 
 struct ceph_osd_reply_head {
-	__le64 tid;                       /* transaction id */
 	__le32 client_inc;                /* client incarnation */
 	__le32 flags;
 	struct ceph_object_layout layout;
-- 
cgit v1.2.2


From 04a419f908b5291ff7e8ffd7aa351fa0ac0c08af Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 23 Dec 2009 09:30:21 -0800
Subject: ceph: add feature bits to connection handshake (protocol change)

Define supported and required feature set.  Fail connection if the server
requires features we do not support (TAG_FEATURES), or if the server does
not support features we require.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h   |  6 ++++++
 fs/ceph/dir.c       | 12 ++++++------
 fs/ceph/messenger.c | 47 +++++++++++++++++++++++++++++++++++++----------
 fs/ceph/msgr.h      |  5 ++++-
 4 files changed, 53 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index e87dfa6ec8e5..db3fed33c4aa 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -50,6 +50,12 @@
 #define CEPH_MAX_MON   31
 
 
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_SUPPORTED  0
+#define CEPH_FEATURE_REQUIRED   0
+
 
 /*
  * ceph_file_layout - describe data layout for a file/inode
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fde839c61236..5107384ee029 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1135,9 +1135,9 @@ void ceph_dentry_lru_add(struct dentry *dn)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dn);
 	struct ceph_mds_client *mdsc;
-	dout("dentry_lru_add %p %p\t%.*s\n",
-			di, dn, dn->d_name.len, dn->d_name.name);
 
+	dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
+	     dn->d_name.len, dn->d_name.name);
 	if (di) {
 		mdsc = &ceph_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
@@ -1151,9 +1151,9 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dn);
 	struct ceph_mds_client *mdsc;
-	dout("dentry_lru_touch %p %p\t%.*s\n",
-			di, dn, dn->d_name.len, dn->d_name.name);
 
+	dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
+	     dn->d_name.len, dn->d_name.name);
 	if (di) {
 		mdsc = &ceph_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
@@ -1167,8 +1167,8 @@ void ceph_dentry_lru_del(struct dentry *dn)
 	struct ceph_dentry_info *di = ceph_dentry(dn);
 	struct ceph_mds_client *mdsc;
 
-	dout("dentry_lru_del %p %p\t%.*s\n",
-			di, dn, dn->d_name.len, dn->d_name.name);
+	dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
+	     dn->d_name.len, dn->d_name.name);
 	if (di) {
 		mdsc = &ceph_client(dn->d_sb)->mdsc;
 		spin_lock(&mdsc->dentry_lru_lock);
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 506b638a023b..68052f664280 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -631,6 +631,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
 	     con->connect_seq, global_seq, proto);
 
+	con->out_connect.features = CEPH_FEATURE_SUPPORTED;
 	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
 	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
 	con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1080,15 +1081,37 @@ static int process_banner(struct ceph_connection *con)
 	return 0;
 }
 
+static void fail_protocol(struct ceph_connection *con)
+{
+	reset_connection(con);
+	set_bit(CLOSED, &con->state);  /* in case there's queued work */
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->bad_proto)
+		con->ops->bad_proto(con);
+	mutex_lock(&con->mutex);
+}
+
 static int process_connect(struct ceph_connection *con)
 {
+	u64 sup_feat = CEPH_FEATURE_SUPPORTED;
+	u64 req_feat = CEPH_FEATURE_REQUIRED;
+	u64 server_feat = le64_to_cpu(con->in_reply.features);
+
 	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
 
 	switch (con->in_reply.tag) {
+	case CEPH_MSGR_TAG_FEATURES:
+		pr_err("%s%lld %s feature set mismatch,"
+		       " my %llx < server's %llx, missing %llx\n",
+		       ENTITY_NAME(con->peer_name),
+		       pr_addr(&con->peer_addr.in_addr),
+		       sup_feat, server_feat, server_feat & ~sup_feat);
+		con->error_msg = "missing required protocol features";
+		fail_protocol(con);
+		return -1;
+
 	case CEPH_MSGR_TAG_BADPROTOVER:
-		dout("process_connect got BADPROTOVER my %d != their %d\n",
-		     le32_to_cpu(con->out_connect.protocol_version),
-		     le32_to_cpu(con->in_reply.protocol_version));
 		pr_err("%s%lld %s protocol version mismatch,"
 		       " my %d != server's %d\n",
 		       ENTITY_NAME(con->peer_name),
@@ -1096,13 +1119,7 @@ static int process_connect(struct ceph_connection *con)
 		       le32_to_cpu(con->out_connect.protocol_version),
 		       le32_to_cpu(con->in_reply.protocol_version));
 		con->error_msg = "protocol version mismatch";
-		reset_connection(con);
-		set_bit(CLOSED, &con->state);  /* in case there's queued work */
-
-		mutex_unlock(&con->mutex);
-		if (con->ops->bad_proto)
-			con->ops->bad_proto(con);
-		mutex_lock(&con->mutex);
+		fail_protocol(con);
 		return -1;
 
 	case CEPH_MSGR_TAG_BADAUTHORIZER:
@@ -1173,6 +1190,16 @@ static int process_connect(struct ceph_connection *con)
 		break;
 
 	case CEPH_MSGR_TAG_READY:
+		if (req_feat & ~server_feat) {
+			pr_err("%s%lld %s protocol feature mismatch,"
+			       " my required %llx > server's %llx, need %llx\n",
+			       ENTITY_NAME(con->peer_name),
+			       pr_addr(&con->peer_addr.in_addr),
+			       req_feat, server_feat, req_feat & ~server_feat);
+			con->error_msg = "missing required protocol features";
+			fail_protocol(con);
+			return -1;
+		}
 		clear_bit(CONNECTING, &con->state);
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index e46d8b806dea..be83f93182ee 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -21,7 +21,7 @@
  * whenever the wire protocol changes.  try to keep this string length
  * constant.
  */
-#define CEPH_BANNER "ceph v025"
+#define CEPH_BANNER "ceph v026"
 #define CEPH_BANNER_MAX_LEN 30
 
 
@@ -100,12 +100,14 @@ struct ceph_entity_inst {
 #define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
 #define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
 #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
 
 
 /*
  * connection negotiation
  */
 struct ceph_msg_connect {
+	__le64 features;     /* supported feature bits */
 	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
 	__le32 global_seq;   /* count connections initiated by this host */
 	__le32 connect_seq;  /* count connections initiated in this session */
@@ -117,6 +119,7 @@ struct ceph_msg_connect {
 
 struct ceph_msg_connect_reply {
 	__u8 tag;
+	__le64 features;     /* feature bits for this session */
 	__le32 global_seq;
 	__le32 connect_seq;
 	__le32 protocol_version;
-- 
cgit v1.2.2


From 58bb3b374b07a2a43315213f00a48a5ffd6d0915 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 23 Dec 2009 12:12:31 -0800
Subject: ceph: support ceph_pagelist for message payload

The ceph_pagelist is a simple list of whole pages, strung together via
their lru list_head.  It facilitates encoding to a "buffer" of unknown
size.  Allow its use in place of the ceph_msg page vector.

This will be used to fix the huge buffer preallocation woes of MDS
reconnection.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/Makefile    |  2 +-
 fs/ceph/messenger.c | 24 ++++++++++++++++++++----
 fs/ceph/messenger.h |  1 +
 fs/ceph/pagelist.c  | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/pagelist.h  | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 130 insertions(+), 5 deletions(-)
 create mode 100644 fs/ceph/pagelist.c
 create mode 100644 fs/ceph/pagelist.h

(limited to 'fs')

diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 827629c85768..47caf2f1b75a 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
 
 ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
 	export.o caps.o snap.o xattr.o \
-	messenger.o msgpool.o buffer.o \
+	messenger.o msgpool.o buffer.o pagelist.o \
 	mds_client.o mdsmap.o \
 	mon_client.o \
 	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 68052f664280..c1106e8360f0 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -13,6 +13,7 @@
 #include "super.h"
 #include "messenger.h"
 #include "decode.h"
+#include "pagelist.h"
 
 /*
  * Ceph uses the messenger to exchange ceph_msg messages with other
@@ -728,6 +729,11 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 			page = msg->pages[con->out_msg_pos.page];
 			if (crc)
 				kaddr = kmap(page);
+		} else if (msg->pagelist) {
+			page = list_first_entry(&msg->pagelist->head,
+						struct page, lru);
+			if (crc)
+				kaddr = kmap(page);
 		} else {
 			page = con->msgr->zero_page;
 			if (crc)
@@ -750,7 +756,7 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 				      MSG_DONTWAIT | MSG_NOSIGNAL |
 				      MSG_MORE);
 
-		if (crc && msg->pages)
+		if (crc && (msg->pages || msg->pagelist))
 			kunmap(page);
 
 		if (ret <= 0)
@@ -762,6 +768,9 @@ static int write_partial_msg_pages(struct ceph_connection *con)
 			con->out_msg_pos.page_pos = 0;
 			con->out_msg_pos.page++;
 			con->out_msg_pos.did_page_crc = 0;
+			if (msg->pagelist)
+				list_move_tail(&page->lru,
+					       &msg->pagelist->head);
 		}
 	}
 
@@ -1051,13 +1060,13 @@ static int process_banner(struct ceph_connection *con)
 				       &con->actual_peer_addr) &&
 	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
 	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-		pr_err("wrong peer, want %s/%d, "
-		       "got %s/%d, wtf\n",
+		pr_warning("wrong peer, want %s/%d, "
+		       "got %s/%d\n",
 		       pr_addr(&con->peer_addr.in_addr),
 		       con->peer_addr.nonce,
 		       pr_addr(&con->actual_peer_addr.in_addr),
 		       con->actual_peer_addr.nonce);
-		con->error_msg = "protocol error, wrong peer";
+		con->error_msg = "wrong peer at address";
 		return -1;
 	}
 
@@ -2096,6 +2105,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
 	/* data */
 	m->nr_pages = calc_pages_for(page_off, page_len);
 	m->pages = pages;
+	m->pagelist = NULL;
 
 	dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
 	     m->nr_pages);
@@ -2181,6 +2191,12 @@ void ceph_msg_last_put(struct kref *kref)
 	m->nr_pages = 0;
 	m->pages = NULL;
 
+	if (m->pagelist) {
+		ceph_pagelist_release(m->pagelist);
+		kfree(m->pagelist);
+		m->pagelist = NULL;
+	}
+
 	if (m->pool)
 		ceph_msgpool_put(m->pool, m);
 	else
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 7e2aab1d3ce2..a7b684145092 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -85,6 +85,7 @@ struct ceph_msg {
 	struct ceph_buffer *middle;
 	struct page **pages;            /* data payload.  NOT OWNER. */
 	unsigned nr_pages;              /* size of page array */
+	struct ceph_pagelist *pagelist; /* instead of pages */
 	struct list_head list_head;
 	struct kref kref;
 	bool front_is_vmalloc;
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..370e93695474
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,54 @@
+
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+
+#include "pagelist.h"
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+	if (pl->mapped_tail)
+		kunmap(pl->mapped_tail);
+	while (!list_empty(&pl->head)) {
+		struct page *page = list_first_entry(&pl->head, struct page,
+						     lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	return 0;
+}
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+	struct page *page = alloc_page(GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+	pl->room += PAGE_SIZE;
+	list_add_tail(&page->lru, &pl->head);
+	if (pl->mapped_tail)
+		kunmap(pl->mapped_tail);
+	pl->mapped_tail = kmap(page);
+	return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
+{
+	while (pl->room < len) {
+		size_t bit = pl->room;
+		int ret;
+
+		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+		       buf, bit);
+		pl->length += bit;
+		pl->room -= bit;
+		buf += bit;
+		len -= bit;
+		ret = ceph_pagelist_addpage(pl);
+		if (ret)
+			return ret;
+	}
+
+	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+	pl->length += len;
+	pl->room -= len;
+	return 0;
+}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
+#ifndef __FS_CEPH_PAGELIST_H
+#define __FS_CEPH_PAGELIST_H
+
+#include <linux/list.h>
+
+struct ceph_pagelist {
+	struct list_head head;
+	void *mapped_tail;
+	size_t length;
+	size_t room;
+};
+
+static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
+{
+	INIT_LIST_HEAD(&pl->head);
+	pl->mapped_tail = NULL;
+	pl->length = 0;
+	pl->room = 0;
+}
+extern int ceph_pagelist_release(struct ceph_pagelist *pl);
+
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
+
+static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
+{
+	__le64 ev = cpu_to_le64(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
+{
+	__le32 ev = cpu_to_le32(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
+{
+	__le16 ev = cpu_to_le16(v);
+	return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
+{
+	return ceph_pagelist_append(pl, &v, 1);
+}
+static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
+					      char *s, size_t len)
+{
+	int ret = ceph_pagelist_encode_32(pl, len);
+	if (ret)
+		return ret;
+	if (len)
+		return ceph_pagelist_append(pl, s, len);
+	return 0;
+}
+
+#endif
-- 
cgit v1.2.2


From 93cea5bebf91319095db866163a7e35c3e77d8f2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 23 Dec 2009 12:21:51 -0800
Subject: ceph: use ceph_pagelist for mds reconnect message; change encoding
 (protocol change)

Use the ceph_pagelist to encode the MDS reconnect message.  We change the
message encoding (protocol change!) at the same time to make our life
easier (we don't know how many snaprealms we have when we start encoding).

An empty message implies the session is closed/does not exist.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h    |   2 +-
 fs/ceph/mds_client.c | 156 ++++++++++++++++++---------------------------------
 2 files changed, 57 insertions(+), 101 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index db3fed33c4aa..d0f2557bb41b 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -39,7 +39,7 @@
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   22 /* server/client */
-#define CEPH_MDSC_PROTOCOL   30 /* server/client */
+#define CEPH_MDSC_PROTOCOL   31 /* server/client */
 #define CEPH_MONC_PROTOCOL   15 /* server/client */
 
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ec884e2845db..6e08f488a30f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -9,6 +9,7 @@
 #include "messenger.h"
 #include "decode.h"
 #include "auth.h"
+#include "pagelist.h"
 
 /*
  * A cluster of MDS (metadata server) daemons is responsible for
@@ -1971,20 +1972,12 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
 /*
  * Encode information about a cap for a reconnect with the MDS.
  */
-struct encode_caps_data {
-	void **pp;
-	void *end;
-	int *num_caps;
-};
-
 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			  void *arg)
 {
-	struct ceph_mds_cap_reconnect *rec;
+	struct ceph_mds_cap_reconnect rec;
 	struct ceph_inode_info *ci;
-	struct encode_caps_data *data = (struct encode_caps_data *)arg;
-	void *p = *(data->pp);
-	void *end = data->end;
+	struct ceph_pagelist *pagelist = arg;
 	char *path;
 	int pathlen, err;
 	u64 pathbase;
@@ -1995,8 +1988,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
 	     inode, ceph_vinop(inode), cap, cap->cap_id,
 	     ceph_cap_string(cap->issued));
-	ceph_decode_need(&p, end, sizeof(u64), needmore);
-	ceph_encode_64(&p, ceph_ino(inode));
+	err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+	if (err)
+		return err;
 
 	dentry = d_find_alias(inode);
 	if (dentry) {
@@ -2009,33 +2003,29 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		path = NULL;
 		pathlen = 0;
 	}
-	ceph_decode_need(&p, end, pathlen+4, needmore);
-	ceph_encode_string(&p, end, path, pathlen);
+	err = ceph_pagelist_encode_string(pagelist, path, pathlen);
+	if (err)
+		goto out;
 
-	ceph_decode_need(&p, end, sizeof(*rec), needmore);
-	rec = p;
-	p += sizeof(*rec);
-	BUG_ON(p > end);
 	spin_lock(&inode->i_lock);
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
-	rec->cap_id = cpu_to_le64(cap->cap_id);
-	rec->pathbase = cpu_to_le64(pathbase);
-	rec->wanted = cpu_to_le32(__ceph_caps_wanted(ci));
-	rec->issued = cpu_to_le32(cap->issued);
-	rec->size = cpu_to_le64(inode->i_size);
-	ceph_encode_timespec(&rec->mtime, &inode->i_mtime);
-	ceph_encode_timespec(&rec->atime, &inode->i_atime);
-	rec->snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+	rec.cap_id = cpu_to_le64(cap->cap_id);
+	rec.pathbase = cpu_to_le64(pathbase);
+	rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+	rec.issued = cpu_to_le32(cap->issued);
+	rec.size = cpu_to_le64(inode->i_size);
+	ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
+	ceph_encode_timespec(&rec.atime, &inode->i_atime);
+	rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
 	spin_unlock(&inode->i_lock);
 
+	err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
+
+out:
 	kfree(path);
 	dput(dentry);
-	(*data->num_caps)++;
-	*(data->pp) = p;
-	return 0;
-needmore:
-	return -ENOSPC;
+	return err;
 }
 
 
@@ -2053,19 +2043,26 @@ needmore:
  */
 static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 {
-	struct ceph_mds_session *session;
+	struct ceph_mds_session *session = NULL;
 	struct ceph_msg *reply;
-	int newlen, len = 4 + 1;
-	void *p, *end;
 	int err;
-	int num_caps, num_realms = 0;
 	int got;
 	u64 next_snap_ino = 0;
-	__le32 *pnum_caps, *pnum_realms;
-	struct encode_caps_data iter_args;
+	struct ceph_pagelist *pagelist;
 
 	pr_info("reconnect to recovering mds%d\n", mds);
 
+	pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+	if (!pagelist)
+		goto fail_nopagelist;
+	ceph_pagelist_init(pagelist);
+
+	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
+	if (IS_ERR(reply)) {
+		err = PTR_ERR(reply);
+		goto fail_nomsg;
+	}
+
 	/* find session */
 	session = __ceph_lookup_mds_session(mdsc, mds);
 	mutex_unlock(&mdsc->mutex);    /* drop lock for duration */
@@ -2081,12 +2078,6 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 
 		/* replay unsafe requests */
 		replay_unsafe_requests(mdsc, session);
-
-		/* estimate needed space */
-		len += session->s_nr_caps *
-			(100+sizeof(struct ceph_mds_cap_reconnect));
-		pr_info("estimating i need %d bytes for %d caps\n",
-		     len, session->s_nr_caps);
 	} else {
 		dout("no session for mds%d, will send short reconnect\n",
 		     mds);
@@ -2094,41 +2085,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 
 	down_read(&mdsc->snap_rwsem);
 
-retry:
-	/* build reply */
-	reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, len, 0, 0, NULL);
-	if (IS_ERR(reply)) {
-		err = PTR_ERR(reply);
-		pr_err("send_mds_reconnect ENOMEM on %d for mds%d\n",
-		       len, mds);
-		goto out;
-	}
-	p = reply->front.iov_base;
-	end = p + len;
-
-	if (!session) {
-		ceph_encode_8(&p, 1); /* session was closed */
-		ceph_encode_32(&p, 0);
+	if (!session)
 		goto send;
-	}
 	dout("session %p state %s\n", session,
 	     session_state_name(session->s_state));
 
 	/* traverse this session's caps */
-	ceph_encode_8(&p, 0);
-	pnum_caps = p;
-	ceph_encode_32(&p, session->s_nr_caps);
-	num_caps = 0;
-
-	iter_args.pp = &p;
-	iter_args.end = end;
-	iter_args.num_caps = &num_caps;
-	err = iterate_session_caps(session, encode_caps_cb, &iter_args);
-	if (err == -ENOSPC)
-		goto needmore;
+	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+	if (err)
+		goto fail;
+	err = iterate_session_caps(session, encode_caps_cb, pagelist);
 	if (err < 0)
 		goto out;
-	*pnum_caps = cpu_to_le32(num_caps);
 
 	/*
 	 * snaprealms.  we provide mds with the ino, seq (version), and
@@ -2136,14 +2104,9 @@ retry:
 	 * it will tell us.
 	 */
 	next_snap_ino = 0;
-	/* save some space for the snaprealm count */
-	pnum_realms = p;
-	ceph_decode_need(&p, end, sizeof(*pnum_realms), needmore);
-	p += sizeof(*pnum_realms);
-	num_realms = 0;
 	while (1) {
 		struct ceph_snap_realm *realm;
-		struct ceph_mds_snaprealm_reconnect *sr_rec;
+		struct ceph_mds_snaprealm_reconnect sr_rec;
 		got = radix_tree_gang_lookup(&mdsc->snap_realms,
 					     (void **)&realm, next_snap_ino, 1);
 		if (!got)
@@ -2151,22 +2114,19 @@ retry:
 
 		dout(" adding snap realm %llx seq %lld parent %llx\n",
 		     realm->ino, realm->seq, realm->parent_ino);
-		ceph_decode_need(&p, end, sizeof(*sr_rec), needmore);
-		sr_rec = p;
-		sr_rec->ino = cpu_to_le64(realm->ino);
-		sr_rec->seq = cpu_to_le64(realm->seq);
-		sr_rec->parent = cpu_to_le64(realm->parent_ino);
-		p += sizeof(*sr_rec);
-		num_realms++;
+		sr_rec.ino = cpu_to_le64(realm->ino);
+		sr_rec.seq = cpu_to_le64(realm->seq);
+		sr_rec.parent = cpu_to_le64(realm->parent_ino);
+		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+		if (err)
+			goto fail;
 		next_snap_ino = realm->ino + 1;
 	}
-	*pnum_realms = cpu_to_le32(num_realms);
 
 send:
-	reply->front.iov_len = p - reply->front.iov_base;
-	reply->hdr.front_len = cpu_to_le32(reply->front.iov_len);
-	dout("final len was %u (guessed %d)\n",
-	     (unsigned)reply->front.iov_len, len);
+	reply->pagelist = pagelist;
+	reply->hdr.data_len = cpu_to_le32(pagelist->length);
+	reply->nr_pages = calc_pages_for(0, pagelist->length);
 	ceph_con_send(&session->s_con, reply);
 
 	if (session) {
@@ -2183,18 +2143,14 @@ out:
 	mutex_lock(&mdsc->mutex);
 	return;
 
-needmore:
-	/*
-	 * we need a larger buffer.  this doesn't very accurately
-	 * factor in snap realms, but it's safe.
-	 */
-	num_caps += num_realms;
-	newlen = len * ((100 * (session->s_nr_caps+3)) / (num_caps + 1)) / 100;
-	pr_info("i guessed %d, and did %d of %d caps, retrying with %d\n",
-	     len, num_caps, session->s_nr_caps, newlen);
-	len = newlen;
+fail:
 	ceph_msg_put(reply);
-	goto retry;
+fail_nomsg:
+	ceph_pagelist_release(pagelist);
+	kfree(pagelist);
+fail_nopagelist:
+	pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
+	goto out;
 }
 
 
-- 
cgit v1.2.2


From 6a4ef48103a78a46b80e07fcd8ac4edda0c7128f Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Thu, 31 Dec 2009 12:04:58 -0800
Subject: ceph: fix copy_user_to_page_vector()

The function was broken in the case where there was more than one page
involved, broke the ceph sync_write case.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index fc8aff4767d3..2d88c805a56c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -350,10 +350,10 @@ static int copy_user_to_page_vector(struct page **pages,
 			return -EFAULT;
 		data += l - bad;
 		left -= l - bad;
-		if (po) {
-			po += l - bad;
-			if (po == PAGE_CACHE_SIZE)
-				po = 0;
+		po += l - bad;
+		if (po == PAGE_CACHE_SIZE) {
+			po = 0;
+			i++;
 		}
 	}
 	return len;
-- 
cgit v1.2.2


From 4baa75ef0ed29adae03fcbbaa9aca1511a5a8cc9 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Thu, 7 Jan 2010 15:36:32 -0800
Subject: ceph: change dentry offset and position after splice_dentry

This fixes a bug, where we had the parent list have dentries with
offsets that are not monotonically increasing, which caused the ceph
dcache_readdir to skip entries.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8774b2811597..518beb628f09 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -816,6 +816,33 @@ out:
 	return dn;
 }
 
+/*
+ * Set dentry's directory position based on the current dir's max, and
+ * order it in d_subdirs, so that dcache_readdir behaves.
+ */
+static void ceph_set_dentry_offset(struct dentry *dn)
+{
+	struct dentry *dir = dn->d_parent;
+	struct inode *inode = dn->d_parent->d_inode;
+	struct ceph_dentry_info *di;
+
+	BUG_ON(!inode);
+
+	di = ceph_dentry(dn);
+
+	spin_lock(&inode->i_lock);
+	di->offset = ceph_inode(inode)->i_max_offset++;
+	spin_unlock(&inode->i_lock);
+
+	spin_lock(&dcache_lock);
+	spin_lock(&dn->d_lock);
+	list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
+	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
+	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
+	spin_unlock(&dn->d_lock);
+	spin_unlock(&dcache_lock);
+}
+
 /*
  * Incorporate results into the local cache.  This is either just
  * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
@@ -987,6 +1014,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 				goto done;
 			}
 			req->r_dentry = dn;  /* may have spliced */
+			ceph_set_dentry_offset(dn);
 			igrab(in);
 		} else if (ceph_ino(in) == vino.ino &&
 			   ceph_snap(in) == vino.snap) {
@@ -1029,6 +1057,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			err = PTR_ERR(dn);
 			goto done;
 		}
+		ceph_set_dentry_offset(dn);
 		req->r_dentry = dn;  /* may have spliced */
 		igrab(in);
 		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
-- 
cgit v1.2.2


From 103e2d3ae57d38d18aaac1b327266c1407499ac1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 7 Jan 2010 16:12:36 -0800
Subject: ceph: remove unused erank field

The ceph_entity_addr erank field is obsolete; remove it.  Get rid of
trivial addr comparison helpers while we're at it.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c  | 19 ++++++++-----------
 fs/ceph/mon_client.c |  3 +--
 fs/ceph/msgr.h       | 18 ++----------------
 fs/ceph/osd_client.c |  7 ++++---
 4 files changed, 15 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index c1106e8360f0..1360708d7505 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1056,16 +1056,15 @@ static int process_banner(struct ceph_connection *con)
 	 * end may not yet know their ip address, so if it's 0.0.0.0, give
 	 * them the benefit of the doubt.
 	 */
-	if (!ceph_entity_addr_is_local(&con->peer_addr,
-				       &con->actual_peer_addr) &&
+	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+		   sizeof(con->peer_addr)) != 0 &&
 	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
 	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-		pr_warning("wrong peer, want %s/%d, "
-		       "got %s/%d\n",
-		       pr_addr(&con->peer_addr.in_addr),
-		       con->peer_addr.nonce,
-		       pr_addr(&con->actual_peer_addr.in_addr),
-		       con->actual_peer_addr.nonce);
+		pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
+			   pr_addr(&con->peer_addr.in_addr),
+			   le64_to_cpu(con->peer_addr.nonce),
+			   pr_addr(&con->actual_peer_addr.in_addr),
+			   le64_to_cpu(con->actual_peer_addr.nonce));
 		con->error_msg = "wrong peer at address";
 		return -1;
 	}
@@ -1934,8 +1933,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
 		msgr->inst.addr = *myaddr;
 
 	/* select a random nonce */
-	get_random_bytes(&msgr->inst.addr.nonce,
-			 sizeof(msgr->inst.addr.nonce));
+	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
 	encode_my_addr(msgr);
 
 	dout("messenger_create %p\n", msgr);
@@ -1966,7 +1964,6 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	msg->hdr.src.name = con->msgr->inst.name;
 	msg->hdr.src.addr = con->msgr->my_enc_addr;
 	msg->hdr.orig_src = msg->hdr.src;
-	msg->hdr.dst_erank = con->peer_addr.erank;
 
 	/* queue */
 	mutex_lock(&con->mutex);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index bb94006fc686..223e8bc207e3 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -88,7 +88,7 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
 	int i;
 
 	for (i = 0; i < m->num_mon; i++)
-		if (ceph_entity_addr_equal(addr, &m->mon_inst[i].addr))
+		if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
 			return 1;
 	return 0;
 }
@@ -503,7 +503,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 		return -ENOMEM;
 	for (i = 0; i < num_mon; i++) {
 		monc->monmap->mon_inst[i].addr = mon_addr[i];
-		monc->monmap->mon_inst[i].addr.erank = 0;
 		monc->monmap->mon_inst[i].addr.nonce = 0;
 		monc->monmap->mon_inst[i].name.type =
 			CEPH_ENTITY_TYPE_MON;
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index be83f93182ee..40b6189aa9e3 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -61,24 +61,10 @@ extern const char *ceph_entity_type_name(int type);
  * entity_addr -- network address
  */
 struct ceph_entity_addr {
-	__le32 erank;  /* entity's rank in process */
-	__le32 nonce;  /* unique id for process (e.g. pid) */
+	__le64 nonce;  /* unique id for process (e.g. pid) */
 	struct sockaddr_storage in_addr;
 } __attribute__ ((packed));
 
-static inline bool ceph_entity_addr_is_local(const struct ceph_entity_addr *a,
-					     const struct ceph_entity_addr *b)
-{
-	return a->nonce == b->nonce &&
-		memcmp(&a->in_addr, &b->in_addr, sizeof(a->in_addr)) == 0;
-}
-
-static inline bool ceph_entity_addr_equal(const struct ceph_entity_addr *a,
-					  const struct ceph_entity_addr *b)
-{
-	return memcmp(a, b, sizeof(*a)) == 0;
-}
-
 struct ceph_entity_inst {
 	struct ceph_entity_name name;
 	struct ceph_entity_addr addr;
@@ -147,7 +133,7 @@ struct ceph_msg_header {
 			     receiver: mask against ~PAGE_MASK */
 
 	struct ceph_entity_inst src, orig_src;
-	__le32 dst_erank;
+	__le32 reserved;
 	__le32 crc;       /* header crc32c */
 } __attribute__ ((packed));
 
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index a0aac436d5d4..80b868f7a0fc 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -821,9 +821,10 @@ static void kick_requests(struct ceph_osd_client *osdc,
 
 			n = rb_next(p);
 			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
-			    !ceph_entity_addr_equal(&osd->o_con.peer_addr,
-					    ceph_osd_addr(osdc->osdmap,
-							  osd->o_osd)))
+			    memcmp(&osd->o_con.peer_addr,
+				   ceph_osd_addr(osdc->osdmap,
+						 osd->o_osd),
+				   sizeof(struct ceph_entity_addr)) != 0)
 				reset_osd(osdc, osd);
 		}
 	}
-- 
cgit v1.2.2


From 7740a42f816790583bd8a9079337772d511af3a3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 8 Jan 2010 15:58:25 -0800
Subject: ceph: display pgid in debugfs osd request dump

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c    | 6 ++++--
 fs/ceph/osd_client.c | 2 ++
 fs/ceph/osd_client.h | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 22d3b47fb1be..fba44b2a6086 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -231,8 +231,10 @@ static int osdc_show(struct seq_file *s, void *pp)
 
 		req = rb_entry(p, struct ceph_osd_request, r_node);
 
-		seq_printf(s, "%lld\tosd%d\t", req->r_tid,
-			   req->r_osd ? req->r_osd->o_osd : -1);
+		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+			   req->r_osd ? req->r_osd->o_osd : -1,
+			   le32_to_cpu(req->r_pgid.pool),
+			   le16_to_cpu(req->r_pgid.ps));
 
 		head = req->r_request->front.iov_base;
 		op = (void *)(head + 1);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 80b868f7a0fc..8417e21a3cb2 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -538,6 +538,8 @@ static int __map_osds(struct ceph_osd_client *osdc,
 	if (err)
 		return err;
 	pgid = reqhead->layout.ol_pgid;
+	req->r_pgid = pgid;
+
 	o = ceph_calc_pg_primary(osdc->osdmap, pgid);
 
 	if ((req->r_osd && req->r_osd->o_osd == o &&
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 8fef71cc4457..4162c6810a8f 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -42,6 +42,7 @@ struct ceph_osd_request {
 	struct rb_node  r_node;
 	struct list_head r_osd_item;
 	struct ceph_osd *r_osd;
+	struct ceph_pg   r_pgid;
 
 	struct ceph_connection *r_con_filling_pages;
 
-- 
cgit v1.2.2


From ec7384ec23dc5a9ea8733e90438e16b6066bfe1b Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 20 Jan 2010 15:16:41 -0800
Subject: ceph: remove duplicate variable initialization

The variable client is initialized twice to the same (side effect-free)
expression.  Drop one initialization.

A simplified version of the semantic match that finds this problem is:
(http://coccinelle.lip6.fr/)

// <smpl>
@forall@
idexpression *x;
identifier f!=ERR_PTR;
@@

x = f(...)
... when != x
(
x = f(...,<+...x...+>,...)
|
* x = f(...)
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a6850a14038e..a3bd9deb555c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -596,7 +596,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_client *client;
 	pgoff_t index, start, end;
 	int range_whole = 0;
 	int should_loop = 1;
-- 
cgit v1.2.2


From 3ea25f9441fc0951ada649105f2c57a59536b539 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 25 Jan 2010 11:18:17 -0800
Subject: ceph: mark MDS CREATE as a write op

CEPH_MDS_OP_CREATE was not correctly marked as a write operation.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index d0f2557bb41b..d8923fed8c2e 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -39,7 +39,7 @@
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   22 /* server/client */
-#define CEPH_MDSC_PROTOCOL   31 /* server/client */
+#define CEPH_MDSC_PROTOCOL   32 /* server/client */
 #define CEPH_MONC_PROTOCOL   15 /* server/client */
 
 
@@ -271,7 +271,7 @@ enum {
 	CEPH_MDS_OP_RMDIR      = 0x01221,
 	CEPH_MDS_OP_SYMLINK    = 0x01222,
 
-	CEPH_MDS_OP_CREATE     = 0x00301,
+	CEPH_MDS_OP_CREATE     = 0x01301,
 	CEPH_MDS_OP_OPEN       = 0x00302,
 	CEPH_MDS_OP_READDIR    = 0x00305,
 
-- 
cgit v1.2.2


From 5b1daecd59f95eb24dc629407ed80369c9929520 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 25 Jan 2010 11:33:08 -0800
Subject: ceph: properly handle aborted mds requests

Previously, if the MDS request was interrupted, we would unregister the
request and ignore any reply.  This could cause the caps or other cache
state to become out of sync.  (For instance, aborting dbench and doing
rm -r on clients would complain about a non-empty directory because the
client didn't realize it's aborted file create request completed.)

Even we don't unregister, we still can't process the reply normally because
we are no longer holding the caller's locks (like the dir i_mutex).

So, mark aborted operations with r_aborted, and in the reply handler, be
sure to process all the caps.  Do not process the namespace changes,
though, since we no longer will hold the dir i_mutex.  The dentry lease
state can also be ignored as it's more forgiving.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c      | 16 ++++++++++------
 fs/ceph/mds_client.c | 28 +++++++++++++++++++++++-----
 fs/ceph/mds_client.h |  1 +
 3 files changed, 34 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 518beb628f09..71e107fb4dbc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -915,6 +915,16 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	}
 
 	if (rinfo->head->is_dentry) {
+		struct inode *dir = req->r_locked_dir;
+
+		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+				 session, req->r_request_started, -1,
+				 &req->r_caps_reservation);
+		if (err < 0)
+			return err;
+	}
+
+	if (rinfo->head->is_dentry && !req->r_aborted) {
 		/*
 		 * lookup link rename   : null -> possibly existing inode
 		 * mknod symlink mkdir  : null -> new inode
@@ -932,12 +942,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		BUG_ON(ceph_snap(dir) !=
 		       le64_to_cpu(rinfo->diri.in->snapid));
 
-		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
-				 session, req->r_request_started, -1,
-				 &req->r_caps_reservation);
-		if (err < 0)
-			return err;
-
 		/* do we have a lease on the whole dir? */
 		have_dir_cap =
 			(le32_to_cpu(rinfo->diri.in->cap.caps) &
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6e08f488a30f..623c67cd484b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1624,11 +1624,29 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 		err = PTR_ERR(req->r_reply);
 		req->r_reply = NULL;
 
-		/* clean up */
-		__unregister_request(mdsc, req);
-		if (!list_empty(&req->r_unsafe_item))
-			list_del_init(&req->r_unsafe_item);
-		complete(&req->r_safe_completion);
+		if (err == -ERESTARTSYS) {
+			/* aborted */
+			req->r_aborted = true;
+
+			if (req->r_locked_dir &&
+			    (req->r_op & CEPH_MDS_OP_WRITE)) {
+				struct ceph_inode_info *ci =
+					ceph_inode(req->r_locked_dir);
+
+				dout("aborted, clearing I_COMPLETE on %p\n", 
+				     req->r_locked_dir);
+				spin_lock(&req->r_locked_dir->i_lock);
+				ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+				ci->i_release_count++;
+				spin_unlock(&req->r_locked_dir->i_lock);
+			}
+		} else {
+			/* clean up this request */
+			__unregister_request(mdsc, req);
+			if (!list_empty(&req->r_unsafe_item))
+				list_del_init(&req->r_unsafe_item);
+			complete(&req->r_safe_completion);
+		}
 	} else if (req->r_err) {
 		err = req->r_err;
 	} else {
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index b1c2025227c5..ee71495e27c4 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -188,6 +188,7 @@ struct ceph_mds_request {
 	struct ceph_msg  *r_reply;
 	struct ceph_mds_reply_info_parsed r_reply_info;
 	int r_err;
+	bool r_aborted;
 
 	unsigned long r_timeout;  /* optional.  jiffies */
 	unsigned long r_started;  /* start time to measure timeout against */
-- 
cgit v1.2.2


From 2450418c47b7998ad55a73f23707b1e21c371eef Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 8 Jan 2010 13:58:34 -0800
Subject: ceph: allocate middle of message before stating to read

Both front and middle parts of the message are now being
allocated at the ceph_alloc_msg().

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/mds_client.c |   2 -
 fs/ceph/messenger.c  | 142 +++++++++++++++++++++++++++++----------------------
 fs/ceph/messenger.h  |   9 +---
 fs/ceph/mon_client.c |  25 ++++++---
 fs/ceph/osd_client.c |  17 ++++--
 5 files changed, 115 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 623c67cd484b..93998a0678c4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2953,8 +2953,6 @@ const static struct ceph_connection_operations mds_con_ops = {
 	.get_authorizer = get_authorizer,
 	.verify_authorizer_reply = verify_authorizer_reply,
 	.peer_reset = peer_reset,
-	.alloc_msg = ceph_alloc_msg,
-	.alloc_middle = ceph_alloc_middle,
 };
 
 
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 1360708d7505..25de15c006b1 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1279,8 +1279,34 @@ static void process_ack(struct ceph_connection *con)
 
 
+static int read_partial_message_section(struct ceph_connection *con,
+					struct kvec *section, unsigned int sec_len,
+					u32 *crc)
+{
+	int left;
+	int ret;
+
+	BUG_ON(!section);
+
+	while (section->iov_len < sec_len) {
+		BUG_ON(section->iov_base == NULL);
+		left = sec_len - section->iov_len;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+				       section->iov_len, left);
+		if (ret <= 0)
+			return ret;
+		section->iov_len += ret;
+		if (section->iov_len == sec_len)
+			*crc = crc32c(0, section->iov_base,
+				      section->iov_len);
+	}
 
+	return 1;
+}
 
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip);
 /*
  * read (part of) a message.
  */
@@ -1292,6 +1318,7 @@ static int read_partial_message(struct ceph_connection *con)
 	int to, want, left;
 	unsigned front_len, middle_len, data_len, data_off;
 	int datacrc = con->msgr->nocrc;
+	int skip;
 
 	dout("read_partial_message con %p msg %p\n", con, m);
 
@@ -1315,7 +1342,6 @@ static int read_partial_message(struct ceph_connection *con)
 			}
 		}
 	}
-
 	front_len = le32_to_cpu(con->in_hdr.front_len);
 	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
 		return -EIO;
@@ -1330,8 +1356,8 @@ static int read_partial_message(struct ceph_connection *con)
 	if (!con->in_msg) {
 		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
 		     con->in_hdr.front_len, con->in_hdr.data_len);
-		con->in_msg = con->ops->alloc_msg(con, &con->in_hdr);
-		if (!con->in_msg) {
+		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+		if (skip) {
 			/* skip this message */
 			pr_err("alloc_msg returned NULL, skipping message\n");
 			con->in_base_pos = -front_len - middle_len - data_len -
@@ -1342,56 +1368,28 @@ static int read_partial_message(struct ceph_connection *con)
 		if (IS_ERR(con->in_msg)) {
 			ret = PTR_ERR(con->in_msg);
 			con->in_msg = NULL;
-			con->error_msg = "out of memory for incoming message";
+			con->error_msg = "error allocating memory for incoming message";
 			return ret;
 		}
 		m = con->in_msg;
 		m->front.iov_len = 0;    /* haven't read it yet */
+		if (m->middle)
+			m->middle->vec.iov_len = 0;
 		memcpy(&m->hdr, &con->in_hdr, sizeof(con->in_hdr));
 	}
 
 	/* front */
-	while (m->front.iov_len < front_len) {
-		BUG_ON(m->front.iov_base == NULL);
-		left = front_len - m->front.iov_len;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)m->front.iov_base +
-				       m->front.iov_len, left);
-		if (ret <= 0)
-			return ret;
-		m->front.iov_len += ret;
-		if (m->front.iov_len == front_len)
-			con->in_front_crc = crc32c(0, m->front.iov_base,
-						      m->front.iov_len);
-	}
+	ret = read_partial_message_section(con, &m->front, front_len,
+					   &con->in_front_crc);
+	if (ret <= 0)
+		return ret;
 
 	/* middle */
-	while (middle_len > 0 && (!m->middle ||
-				  m->middle->vec.iov_len < middle_len)) {
-		if (m->middle == NULL) {
-			ret = -EOPNOTSUPP;
-			if (con->ops->alloc_middle)
-				ret = con->ops->alloc_middle(con, m);
-			if (ret < 0) {
-				pr_err("alloc_middle fail skipping payload\n");
-				con->in_base_pos = -middle_len - data_len
-					- sizeof(m->footer);
-				ceph_msg_put(con->in_msg);
-				con->in_msg = NULL;
-				con->in_tag = CEPH_MSGR_TAG_READY;
-				return 0;
-			}
-			m->middle->vec.iov_len = 0;
-		}
-		left = middle_len - m->middle->vec.iov_len;
-		ret = ceph_tcp_recvmsg(con->sock,
-				       (char *)m->middle->vec.iov_base +
-				       m->middle->vec.iov_len, left);
+	if (m->middle) {
+		ret = read_partial_message_section(con, &m->middle->vec, middle_len,
+						   &con->in_middle_crc);
 		if (ret <= 0)
 			return ret;
-		m->middle->vec.iov_len += ret;
-		if (m->middle->vec.iov_len == middle_len)
-			con->in_middle_crc = crc32c(0, m->middle->vec.iov_base,
-						      m->middle->vec.iov_len);
 	}
 
 	/* (page) data */
@@ -2115,24 +2113,6 @@ out:
 	return ERR_PTR(-ENOMEM);
 }
 
-/*
- * Generic message allocator, for incoming messages.
- */
-struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				struct ceph_msg_header *hdr)
-{
-	int type = le16_to_cpu(hdr->type);
-	int front_len = le32_to_cpu(hdr->front_len);
-	struct ceph_msg *msg = ceph_msg_new(type, front_len, 0, 0, NULL);
-
-	if (!msg) {
-		pr_err("unable to allocate msg type %d len %d\n",
-		       type, front_len);
-		return ERR_PTR(-ENOMEM);
-	}
-	return msg;
-}
-
 /*
  * Allocate "middle" portion of a message, if it is needed and wasn't
  * allocated by alloc_msg.  This allows us to read a small fixed-size
@@ -2140,7 +2120,7 @@ struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
  * propagate the error to the caller based on info in the front) when
  * the middle is too large.
  */
-int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	int type = le16_to_cpu(msg->hdr.type);
 	int middle_len = le32_to_cpu(msg->hdr.middle_len);
@@ -2156,6 +2136,48 @@ int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
 	return 0;
 }
 
+/*
+ * Generic message allocator, for incoming messages.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+				struct ceph_msg_header *hdr,
+				int *skip)
+{
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	int middle_len = le32_to_cpu(hdr->middle_len);
+	struct ceph_msg *msg = NULL;
+	int ret;
+
+	if (con->ops->alloc_msg) {
+		msg = con->ops->alloc_msg(con, hdr, skip);
+		if (IS_ERR(msg))
+			return msg;
+
+		if (*skip)
+			return NULL;
+	}
+	if (!msg) {
+		*skip = 0;
+		msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+		if (!msg) {
+			pr_err("unable to allocate msg type %d len %d\n",
+			       type, front_len);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	if (middle_len) {
+		ret = ceph_alloc_middle(con, msg);
+
+		if (ret < 0) {
+			ceph_msg_put(msg);
+			return msg;
+		}
+	}
+	return msg;
+}
+
 
 /*
  * Free a generically kmalloc'd message.
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a7b684145092..b6bec59056d7 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -44,9 +44,8 @@ struct ceph_connection_operations {
 	void (*peer_reset) (struct ceph_connection *con);
 
 	struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
-					struct ceph_msg_header *hdr);
-	int (*alloc_middle) (struct ceph_connection *con,
-			     struct ceph_msg *msg);
+					struct ceph_msg_header *hdr,
+					int *skip);
 	/* an incoming message has a data payload; tell me what pages I
 	 * should read the data into. */
 	int (*prepare_pages) (struct ceph_connection *con, struct ceph_msg *m,
@@ -242,10 +241,6 @@ extern struct ceph_msg *ceph_msg_new(int type, int front_len,
 				     struct page **pages);
 extern void ceph_msg_kfree(struct ceph_msg *m);
 
-extern struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
-				       struct ceph_msg_header *hdr);
-extern int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg);
-
 
 static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
 {
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 223e8bc207e3..6c00b37cc554 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -692,21 +692,33 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
  * Allocate memory for incoming message
  */
 static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
-				      struct ceph_msg_header *hdr)
+				      struct ceph_msg_header *hdr,
+				      int *skip)
 {
 	struct ceph_mon_client *monc = con->private;
 	int type = le16_to_cpu(hdr->type);
-	int front = le32_to_cpu(hdr->front_len);
+	int front_len = le32_to_cpu(hdr->front_len);
+	struct ceph_msg *m;
 
+	*skip = 0;
 	switch (type) {
 	case CEPH_MSG_MON_SUBSCRIBE_ACK:
-		return ceph_msgpool_get(&monc->msgpool_subscribe_ack, front);
+		m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
+		break;
 	case CEPH_MSG_STATFS_REPLY:
-		return ceph_msgpool_get(&monc->msgpool_statfs_reply, front);
+		m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
+		break;
 	case CEPH_MSG_AUTH_REPLY:
-		return ceph_msgpool_get(&monc->msgpool_auth_reply, front);
+		m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
+		break;
+	default:
+		return NULL;
 	}
-	return ceph_alloc_msg(con, hdr);
+
+	if (!m)
+		*skip = 1;
+
+	return m;
 }
 
 /*
@@ -749,5 +761,4 @@ const static struct ceph_connection_operations mon_con_ops = {
 	.dispatch = dispatch,
 	.fault = mon_fault,
 	.alloc_msg = mon_alloc_msg,
-	.alloc_middle = ceph_alloc_middle,
 };
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 8417e21a3cb2..545e93617993 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1304,18 +1304,28 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 }
 
 static struct ceph_msg *alloc_msg(struct ceph_connection *con,
-				  struct ceph_msg_header *hdr)
+				  struct ceph_msg_header *hdr,
+				  int *skip)
 {
 	struct ceph_osd *osd = con->private;
 	struct ceph_osd_client *osdc = osd->o_osdc;
 	int type = le16_to_cpu(hdr->type);
 	int front = le32_to_cpu(hdr->front_len);
+	struct ceph_msg *m;
 
+	*skip = 0;
 	switch (type) {
 	case CEPH_MSG_OSD_OPREPLY:
-		return ceph_msgpool_get(&osdc->msgpool_op_reply, front);
+		m = ceph_msgpool_get(&osdc->msgpool_op_reply, front);
+		break;
+	default:
+		return NULL;
 	}
-	return ceph_alloc_msg(con, hdr);
+
+	if (!m)
+		*skip = 1;
+
+	return m;
 }
 
 /*
@@ -1390,6 +1400,5 @@ const static struct ceph_connection_operations osd_con_ops = {
 	.verify_authorizer_reply = verify_authorizer_reply,
 	.alloc_msg = alloc_msg,
 	.fault = osd_reset,
-	.alloc_middle = ceph_alloc_middle,
 	.prepare_pages = prepare_pages,
 };
-- 
cgit v1.2.2


From 9d7f0f139edfdce1a1539b100c617fd9182b0829 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Mon, 11 Jan 2010 10:32:02 -0800
Subject: ceph: refactor messages data section allocation

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/messenger.c | 67 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 39 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 25de15c006b1..e8742cc9ecdf 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1315,7 +1315,7 @@ static int read_partial_message(struct ceph_connection *con)
 	struct ceph_msg *m = con->in_msg;
 	void *p;
 	int ret;
-	int to, want, left;
+	int to, left;
 	unsigned front_len, middle_len, data_len, data_off;
 	int datacrc = con->msgr->nocrc;
 	int skip;
@@ -1351,6 +1351,7 @@ static int read_partial_message(struct ceph_connection *con)
 	data_len = le32_to_cpu(con->in_hdr.data_len);
 	if (data_len > CEPH_MSG_MAX_DATA_LEN)
 		return -EIO;
+	data_off = le16_to_cpu(con->in_hdr.data_off);
 
 	/* allocate message? */
 	if (!con->in_msg) {
@@ -1375,7 +1376,10 @@ static int read_partial_message(struct ceph_connection *con)
 		m->front.iov_len = 0;    /* haven't read it yet */
 		if (m->middle)
 			m->middle->vec.iov_len = 0;
-		memcpy(&m->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+		con->in_msg_pos.page = 0;
+		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+		con->in_msg_pos.data_pos = 0;
 	}
 
 	/* front */
@@ -1393,31 +1397,6 @@ static int read_partial_message(struct ceph_connection *con)
 	}
 
 	/* (page) data */
-	data_off = le16_to_cpu(m->hdr.data_off);
-	if (data_len == 0)
-		goto no_data;
-
-	if (m->nr_pages == 0) {
-		con->in_msg_pos.page = 0;
-		con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
-		con->in_msg_pos.data_pos = 0;
-		/* find pages for data payload */
-		want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-		ret = -1;
-		mutex_unlock(&con->mutex);
-		if (con->ops->prepare_pages)
-			ret = con->ops->prepare_pages(con, m, want);
-		mutex_lock(&con->mutex);
-		if (ret < 0) {
-			dout("%p prepare_pages failed, skipping payload\n", m);
-			con->in_base_pos = -data_len - sizeof(m->footer);
-			ceph_msg_put(con->in_msg);
-			con->in_msg = NULL;
-			con->in_tag = CEPH_MSGR_TAG_READY;
-			return 0;
-		}
-		BUG_ON(m->nr_pages < want);
-	}
 	while (con->in_msg_pos.data_pos < data_len) {
 		left = min((int)(data_len - con->in_msg_pos.data_pos),
 			   (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
@@ -1440,7 +1419,6 @@ static int read_partial_message(struct ceph_connection *con)
 		}
 	}
 
-no_data:
 	/* footer */
 	to = sizeof(m->hdr) + sizeof(m->footer);
 	while (con->in_base_pos < to) {
@@ -2136,6 +2114,25 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
 	return 0;
 }
 
+static int ceph_alloc_data_section(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	int ret;
+	int want;
+	int data_len = le32_to_cpu(msg->hdr.data_len);
+	unsigned data_off = le16_to_cpu(msg->hdr.data_off);
+
+	want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+	ret = -1;
+	mutex_unlock(&con->mutex);
+	if (con->ops->prepare_pages)
+		ret = con->ops->prepare_pages(con, msg, want);
+	mutex_lock(&con->mutex);
+
+	BUG_ON(msg->nr_pages < want);
+
+	return ret;
+}
+
 /*
  * Generic message allocator, for incoming messages.
  */
@@ -2146,6 +2143,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 	int type = le16_to_cpu(hdr->type);
 	int front_len = le32_to_cpu(hdr->front_len);
 	int middle_len = le32_to_cpu(hdr->middle_len);
+	int data_len = le32_to_cpu(hdr->data_len);
 	struct ceph_msg *msg = NULL;
 	int ret;
 
@@ -2166,6 +2164,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 			return ERR_PTR(-ENOMEM);
 		}
 	}
+	memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
 
 	if (middle_len) {
 		ret = ceph_alloc_middle(con, msg);
@@ -2175,6 +2174,18 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 			return msg;
 		}
 	}
+
+	if (data_len) {
+		ret = ceph_alloc_data_section(con, msg);
+
+		if (ret < 0) {
+			*skip = 1;
+			ceph_msg_put(msg);
+			return NULL;
+		}
+	}
+
+
 	return msg;
 }
 
-- 
cgit v1.2.2


From 0547a9b30a5ac8680325752b61d3ffa9d4971b6e Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Mon, 11 Jan 2010 14:47:13 -0800
Subject: ceph: alloc message data pages and check if tid exists

Now doing it in the same callback that is also responsible for
allocating the 'front' part of the message. If we get a message
that we haven't got a corresponding tid for, mark it for skipping.

Moving the mutex unlock/lock from the osd alloc_msg callback
to the calling function in the messenger.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/messenger.c  | 33 ++------------------------
 fs/ceph/messenger.h  |  4 ----
 fs/ceph/mon_client.c |  1 +
 fs/ceph/osd_client.c | 66 +++++++++++++++++++++++++++++++++-------------------
 4 files changed, 45 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index e8742cc9ecdf..f708803e6857 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -2114,25 +2114,6 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
 	return 0;
 }
 
-static int ceph_alloc_data_section(struct ceph_connection *con, struct ceph_msg *msg)
-{
-	int ret;
-	int want;
-	int data_len = le32_to_cpu(msg->hdr.data_len);
-	unsigned data_off = le16_to_cpu(msg->hdr.data_off);
-
-	want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-	ret = -1;
-	mutex_unlock(&con->mutex);
-	if (con->ops->prepare_pages)
-		ret = con->ops->prepare_pages(con, msg, want);
-	mutex_lock(&con->mutex);
-
-	BUG_ON(msg->nr_pages < want);
-
-	return ret;
-}
-
 /*
  * Generic message allocator, for incoming messages.
  */
@@ -2143,12 +2124,13 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 	int type = le16_to_cpu(hdr->type);
 	int front_len = le32_to_cpu(hdr->front_len);
 	int middle_len = le32_to_cpu(hdr->middle_len);
-	int data_len = le32_to_cpu(hdr->data_len);
 	struct ceph_msg *msg = NULL;
 	int ret;
 
 	if (con->ops->alloc_msg) {
+		mutex_unlock(&con->mutex);
 		msg = con->ops->alloc_msg(con, hdr, skip);
+		mutex_lock(&con->mutex);
 		if (IS_ERR(msg))
 			return msg;
 
@@ -2175,17 +2157,6 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
 		}
 	}
 
-	if (data_len) {
-		ret = ceph_alloc_data_section(con, msg);
-
-		if (ret < 0) {
-			*skip = 1;
-			ceph_msg_put(msg);
-			return NULL;
-		}
-	}
-
-
 	return msg;
 }
 
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index b6bec59056d7..dca2d32b40de 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -46,10 +46,6 @@ struct ceph_connection_operations {
 	struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
 					struct ceph_msg_header *hdr,
 					int *skip);
-	/* an incoming message has a data payload; tell me what pages I
-	 * should read the data into. */
-	int (*prepare_pages) (struct ceph_connection *con, struct ceph_msg *m,
-			      int want);
 };
 
 extern const char *ceph_name_type_str(int t);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 6c00b37cc554..3f7ae7f73c50 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -701,6 +701,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	struct ceph_msg *m;
 
 	*skip = 0;
+
 	switch (type) {
 	case CEPH_MSG_MON_SUBSCRIBE_ACK:
 		m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 545e93617993..44abe299c69f 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -998,31 +998,26 @@ bad:
  * find those pages.
  *  0 = success, -1 failure.
  */
-static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m,
-			 int want)
+static int prepare_pages(struct ceph_connection *con,
+			 struct ceph_msg_header *hdr,
+			 struct ceph_osd_request *req,
+			 u64 tid,
+			 struct ceph_msg *m)
 {
 	struct ceph_osd *osd = con->private;
 	struct ceph_osd_client *osdc;
-	struct ceph_osd_request *req;
-	u64 tid;
 	int ret = -1;
-	int type = le16_to_cpu(m->hdr.type);
+	int data_len = le32_to_cpu(hdr->data_len);
+	unsigned data_off = le16_to_cpu(hdr->data_off);
+
+	int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
 
 	if (!osd)
 		return -1;
+
 	osdc = osd->o_osdc;
 
 	dout("prepare_pages on msg %p want %d\n", m, want);
-	if (unlikely(type != CEPH_MSG_OSD_OPREPLY))
-		return -1;  /* hmm! */
-
-	tid = le64_to_cpu(m->hdr.tid);
-	mutex_lock(&osdc->request_mutex);
-	req = __lookup_request(osdc, tid);
-	if (!req) {
-		dout("prepare_pages unknown tid %llu\n", tid);
-		goto out;
-	}
 	dout("prepare_pages tid %llu has %d pages, want %d\n",
 	     tid, req->r_num_pages, want);
 	if (unlikely(req->r_num_pages < want))
@@ -1040,7 +1035,8 @@ static int prepare_pages(struct ceph_connection *con, struct ceph_msg *m,
 	m->nr_pages = req->r_num_pages;
 	ret = 0; /* success */
 out:
-	mutex_unlock(&osdc->request_mutex);
+	BUG_ON(ret < 0 || m->nr_pages < want);
+
 	return ret;
 }
 
@@ -1311,19 +1307,42 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 	struct ceph_osd_client *osdc = osd->o_osdc;
 	int type = le16_to_cpu(hdr->type);
 	int front = le32_to_cpu(hdr->front_len);
+	int data_len = le32_to_cpu(hdr->data_len);
 	struct ceph_msg *m;
+	struct ceph_osd_request *req;
+	u64 tid;
+	int err;
 
 	*skip = 0;
-	switch (type) {
-	case CEPH_MSG_OSD_OPREPLY:
-		m = ceph_msgpool_get(&osdc->msgpool_op_reply, front);
-		break;
-	default:
+	if (type != CEPH_MSG_OSD_OPREPLY)
 		return NULL;
-	}
 
-	if (!m)
+	tid = le64_to_cpu(hdr->tid);
+	mutex_lock(&osdc->request_mutex);
+	req = __lookup_request(osdc, tid);
+	if (!req) {
+		*skip = 1;
+		m = NULL;
+		dout("prepare_pages unknown tid %llu\n", tid);
+		goto out;
+	}
+	m = ceph_msgpool_get(&osdc->msgpool_op_reply, front);
+	if (!m) {
 		*skip = 1;
+		goto out;
+	}
+
+	if (data_len > 0) {
+		err = prepare_pages(con, hdr, req, tid, m);
+		if (err < 0) {
+			*skip = 1;
+			ceph_msg_put(m);
+			m = ERR_PTR(err);
+		}
+	}
+
+out:
+	mutex_unlock(&osdc->request_mutex);
 
 	return m;
 }
@@ -1400,5 +1419,4 @@ const static struct ceph_connection_operations osd_con_ops = {
 	.verify_authorizer_reply = verify_authorizer_reply,
 	.alloc_msg = alloc_msg,
 	.fault = osd_reset,
-	.prepare_pages = prepare_pages,
 };
-- 
cgit v1.2.2


From 0d59ab81c3d3adf466c3fd37d7fb6d46b05d1fd4 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 13 Jan 2010 17:03:23 -0800
Subject: ceph: keep reserved replies on the request structure

This includes treating all the data preallocation and revokation
at the same place, not having to have a special case for
the reserved pages.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/messenger.c  |  20 ++++-----
 fs/ceph/messenger.h  |   4 +-
 fs/ceph/osd_client.c | 118 ++++++++++++++++++++++++++++++++++++---------------
 fs/ceph/osd_client.h |   8 ++--
 4 files changed, 100 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index f708803e6857..81bc779adb90 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1985,30 +1985,30 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 }
 
 /*
- * Revoke a page vector that we may be reading data into
+ * Revoke a message that we may be reading data into
  */
-void ceph_con_revoke_pages(struct ceph_connection *con, struct page **pages)
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	mutex_lock(&con->mutex);
-	if (con->in_msg && con->in_msg->pages == pages) {
+	if (con->in_msg && con->in_msg == msg) {
+		unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
+		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
 		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
 
 		/* skip rest of message */
-		dout("con_revoke_pages %p msg %p pages %p revoked\n", con,
-		     con->in_msg, pages);
-		if (con->in_msg_pos.data_pos < data_len)
-			con->in_base_pos = con->in_msg_pos.data_pos - data_len;
-		else
+		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
 			con->in_base_pos = con->in_base_pos -
 				sizeof(struct ceph_msg_header) -
+				front_len -
+				middle_len -
+				data_len -
 				sizeof(struct ceph_msg_footer);
-		con->in_msg->pages = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
 		con->in_tag = CEPH_MSGR_TAG_READY;
 	} else {
 		dout("con_revoke_pages %p msg %p pages %p no-op\n",
-		     con, con->in_msg, pages);
+		     con, con->in_msg, msg);
 	}
 	mutex_unlock(&con->mutex);
 }
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index dca2d32b40de..c26a3d8aa78c 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -226,8 +226,8 @@ extern void ceph_con_open(struct ceph_connection *con,
 extern void ceph_con_close(struct ceph_connection *con);
 extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
 extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
-extern void ceph_con_revoke_pages(struct ceph_connection *con,
-				  struct page **pages);
+extern void ceph_con_revoke_message(struct ceph_connection *con,
+				  struct ceph_msg *msg);
 extern void ceph_con_keepalive(struct ceph_connection *con);
 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
 extern void ceph_con_put(struct ceph_connection *con);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 44abe299c69f..df2106839713 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -13,6 +13,8 @@
 #include "decode.h"
 #include "auth.h"
 
+#define OSD_REPLY_RESERVE_FRONT_LEN	512
+
 const static struct ceph_connection_operations osd_con_ops;
 
 static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
@@ -73,6 +75,16 @@ static void calc_layout(struct ceph_osd_client *osdc,
 	     req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
 }
 
+static void remove_replies(struct ceph_osd_request *req)
+{
+	int i;
+	int max = ARRAY_SIZE(req->replies);
+
+	for (i=0; i<max; i++) {
+		if (req->replies[i])
+			ceph_msg_put(req->replies[i]);
+	}
+}
 
 /*
  * requests
@@ -87,12 +99,13 @@ void ceph_osdc_release_request(struct kref *kref)
 		ceph_msg_put(req->r_request);
 	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
-	if (req->r_con_filling_pages) {
+	remove_replies(req);
+	if (req->r_con_filling_msg) {
 		dout("release_request revoking pages %p from con %p\n",
-		     req->r_pages, req->r_con_filling_pages);
-		ceph_con_revoke_pages(req->r_con_filling_pages,
-				      req->r_pages);
-		ceph_con_put(req->r_con_filling_pages);
+		     req->r_pages, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg,
+				      req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
 	}
 	if (req->r_own_pages)
 		ceph_release_page_vector(req->r_pages,
@@ -104,6 +117,60 @@ void ceph_osdc_release_request(struct kref *kref)
 		kfree(req);
 }
 
+static int alloc_replies(struct ceph_osd_request *req, int num_reply)
+{
+	int i;
+	int max = ARRAY_SIZE(req->replies);
+
+	BUG_ON(num_reply > max);
+
+	for (i=0; i<num_reply; i++) {
+		req->replies[i] = ceph_msg_new(0, OSD_REPLY_RESERVE_FRONT_LEN, 0, 0, NULL);
+		if (IS_ERR(req->replies[i])) {
+			int j;
+			int err = PTR_ERR(req->replies[i]);
+			for (j = 0; j<=i; j++) {
+				ceph_msg_put(req->replies[j]);
+			}
+			return err;
+		}
+	}
+
+	for (; i<max; i++) {
+		req->replies[i] = NULL;
+	}
+
+	req->cur_reply = 0;
+
+	return 0;
+}
+
+static struct ceph_msg *__get_next_reply(struct ceph_connection *con,
+				       struct ceph_osd_request *req,
+				       int front_len)
+{
+	struct ceph_msg *reply;
+	if (req->r_con_filling_msg) {
+		dout("revoking reply msg %p from old con %p\n", req->r_reply,
+		     req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
+		req->cur_reply = 0;
+	}
+	reply = req->replies[req->cur_reply];
+	if (!reply || front_len > OSD_REPLY_RESERVE_FRONT_LEN) {
+		/* maybe we can allocate it now? */
+		reply = ceph_msg_new(0, front_len, 0, 0, NULL);
+		if (!reply || IS_ERR(reply)) {
+			pr_err(" reply alloc failed, front_len=%d\n", front_len);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+	req->r_con_filling_msg = ceph_con_get(con);
+	req->r_reply = ceph_msg_get(reply); /* for duration of read over socket */
+	return ceph_msg_get(reply);
+}
+
 /*
  * build new request AND message, calculate layout, and adjust file
  * extent as needed.
@@ -147,7 +214,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (req == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	err = ceph_msgpool_resv(&osdc->msgpool_op_reply, num_reply);
+	err = alloc_replies(req, num_reply);
 	if (err) {
 		ceph_osdc_put_request(req);
 		return ERR_PTR(-ENOMEM);
@@ -173,7 +240,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	else
 		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
 	if (IS_ERR(msg)) {
-		ceph_msgpool_resv(&osdc->msgpool_op_reply, -num_reply);
 		ceph_osdc_put_request(req);
 		return ERR_PTR(PTR_ERR(msg));
 	}
@@ -471,8 +537,6 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 	rb_erase(&req->r_node, &osdc->requests);
 	osdc->num_requests--;
 
-	ceph_msgpool_resv(&osdc->msgpool_op_reply, -req->r_num_prealloc_reply);
-
 	if (req->r_osd) {
 		/* make sure the original request isn't in flight. */
 		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
@@ -724,12 +788,12 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	flags = le32_to_cpu(rhead->flags);
 
 	/*
-	 * if this connection filled our pages, drop our reference now, to
+	 * if this connection filled our message, drop our reference now, to
 	 * avoid a (safe but slower) revoke later.
 	 */
-	if (req->r_con_filling_pages == con && req->r_pages == msg->pages) {
-		dout(" got pages, dropping con_filling_pages ref %p\n", con);
-		req->r_con_filling_pages = NULL;
+	if (req->r_con_filling_msg == con && req->r_reply == msg) {
+		dout(" got pages, dropping con_filling_msg ref %p\n", con);
+		req->r_con_filling_msg = NULL;
 		ceph_con_put(con);
 	}
 
@@ -998,7 +1062,7 @@ bad:
  * find those pages.
  *  0 = success, -1 failure.
  */
-static int prepare_pages(struct ceph_connection *con,
+static int __prepare_pages(struct ceph_connection *con,
 			 struct ceph_msg_header *hdr,
 			 struct ceph_osd_request *req,
 			 u64 tid,
@@ -1017,20 +1081,10 @@ static int prepare_pages(struct ceph_connection *con,
 
 	osdc = osd->o_osdc;
 
-	dout("prepare_pages on msg %p want %d\n", m, want);
-	dout("prepare_pages tid %llu has %d pages, want %d\n",
+	dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
 	     tid, req->r_num_pages, want);
 	if (unlikely(req->r_num_pages < want))
 		goto out;
-
-	if (req->r_con_filling_pages) {
-		dout("revoking pages %p from old con %p\n", req->r_pages,
-		     req->r_con_filling_pages);
-		ceph_con_revoke_pages(req->r_con_filling_pages, req->r_pages);
-		ceph_con_put(req->r_con_filling_pages);
-	}
-	req->r_con_filling_pages = ceph_con_get(con);
-	req->r_reply = ceph_msg_get(m); /* for duration of read over socket */
 	m->pages = req->r_pages;
 	m->nr_pages = req->r_num_pages;
 	ret = 0; /* success */
@@ -1164,13 +1218,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	err = ceph_msgpool_init(&osdc->msgpool_op, 4096, 10, true);
 	if (err < 0)
 		goto out_mempool;
-	err = ceph_msgpool_init(&osdc->msgpool_op_reply, 512, 0, false);
-	if (err < 0)
-		goto out_msgpool;
 	return 0;
 
-out_msgpool:
-	ceph_msgpool_destroy(&osdc->msgpool_op);
 out_mempool:
 	mempool_destroy(osdc->req_mempool);
 out:
@@ -1186,7 +1235,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	}
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
-	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
 }
 
 /*
@@ -1323,17 +1371,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 	if (!req) {
 		*skip = 1;
 		m = NULL;
-		dout("prepare_pages unknown tid %llu\n", tid);
+		dout("alloc_msg unknown tid %llu\n", tid);
 		goto out;
 	}
-	m = ceph_msgpool_get(&osdc->msgpool_op_reply, front);
-	if (!m) {
+	m = __get_next_reply(con, req, front);
+	if (!m || IS_ERR(m)) {
 		*skip = 1;
 		goto out;
 	}
 
 	if (data_len > 0) {
-		err = prepare_pages(con, hdr, req, tid, m);
+		err = __prepare_pages(con, hdr, req, tid, m);
 		if (err < 0) {
 			*skip = 1;
 			ceph_msg_put(m);
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 4162c6810a8f..8d533d9406ff 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -44,7 +44,7 @@ struct ceph_osd_request {
 	struct ceph_osd *r_osd;
 	struct ceph_pg   r_pgid;
 
-	struct ceph_connection *r_con_filling_pages;
+	struct ceph_connection *r_con_filling_msg;
 
 	struct ceph_msg  *r_request, *r_reply;
 	int               r_result;
@@ -75,6 +75,9 @@ struct ceph_osd_request {
 	struct page     **r_pages;            /* pages for data payload */
 	int               r_pages_from_pool;
 	int               r_own_pages;        /* if true, i own page list */
+
+	struct ceph_msg   *replies[2];
+	int		  cur_reply;
 };
 
 struct ceph_osd_client {
@@ -98,8 +101,7 @@ struct ceph_osd_client {
 
 	mempool_t              *req_mempool;
 
-	struct ceph_msgpool   msgpool_op;
-	struct ceph_msgpool   msgpool_op_reply;
+	struct ceph_msgpool	msgpool_op;
 };
 
 extern int ceph_osdc_init(struct ceph_osd_client *osdc,
-- 
cgit v1.2.2


From 361be8601d78e488b5249032cc4e779b81d7928e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 25 Jan 2010 16:03:02 -0800
Subject: ceph: precede encoded ceph_pg_pool struct with version

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 2 +-
 fs/ceph/osdmap.c  | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index d8923fed8c2e..f3bfc3c4f6e6 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -38,7 +38,7 @@
 #define CEPH_OSD_PROTOCOL     8 /* cluster internal */
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL   22 /* server/client */
+#define CEPH_OSDC_PROTOCOL   23 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
 #define CEPH_MONC_PROTOCOL   15 /* server/client */
 
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 0dbd606e21c4..a143c51c2cfb 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -414,6 +414,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	struct ceph_osdmap *map;
 	u16 version;
 	u32 len, max, i;
+	u8 ev;
 	int err = -EINVAL;
 	void *start = *p;
 
@@ -441,10 +442,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	}
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
-		ceph_decode_need(p, end, 4+sizeof(map->pg_pool->v), bad);
+		ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad);
 		i = ceph_decode_32(p);
 		if (i >= map->num_pools)
 			goto bad;
+		ev = ceph_decode_8(p); /* encoding version */
 		ceph_decode_copy(p, &map->pg_pool[i].v,
 				 sizeof(map->pg_pool->v));
 		calc_pg_masks(&map->pg_pool[i]);
@@ -603,6 +605,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	/* new_pool */
 	ceph_decode_32_safe(p, end, len, bad);
 	while (len--) {
+		__u8 ev;
+
 		ceph_decode_32_safe(p, end, pool, bad);
 		if (pool >= map->num_pools) {
 			void *pg_pool = kcalloc(pool + 1,
@@ -618,6 +622,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			map->pg_pool = pg_pool;
 			map->num_pools = pool+1;
 		}
+		ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad);
+		ev = ceph_decode_8(p);  /* encoding version */
 		ceph_decode_copy(p, &map->pg_pool[pool].v,
 				 sizeof(map->pg_pool->v));
 		calc_pg_masks(&map->pg_pool[pool]);
-- 
cgit v1.2.2


From ac8839d7b264d0fa478fca7c4f9b6bb833540a80 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 27 Jan 2010 14:28:10 -0800
Subject: ceph: include type in ceph_entity_addr, filepath

Include a type/version in ceph_entity_addr and filepath.  Include extra
byte in filepath encoding as necessary.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/decode.h     | 1 +
 fs/ceph/mds_client.c | 2 +-
 fs/ceph/messenger.c  | 1 +
 fs/ceph/msgr.h       | 5 +++--
 4 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index 10de84896244..b90a33b948c7 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -138,6 +138,7 @@ static inline void ceph_encode_filepath(void **p, void *end,
 {
 	u32 len = path ? strlen(path) : 0;
 	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
+	ceph_encode_8(p, 1);
 	ceph_encode_64(p, ino);
 	ceph_encode_32(p, len);
 	if (len)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 93998a0678c4..4e3e8b229e67 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1325,7 +1325,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	}
 
 	len = sizeof(*head) +
-		pathlen1 + pathlen2 + 2*(sizeof(u32) + sizeof(u64));
+		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
 
 	/* calculate (max) length for cap releases */
 	len += sizeof(struct ceph_mds_request_release) *
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 81bc779adb90..e4e8d4439d3a 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1909,6 +1909,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
 		msgr->inst.addr = *myaddr;
 
 	/* select a random nonce */
+	msgr->inst.addr.type = 0;
 	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
 	encode_my_addr(msgr);
 
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 40b6189aa9e3..8aaab414f3f8 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -21,7 +21,7 @@
  * whenever the wire protocol changes.  try to keep this string length
  * constant.
  */
-#define CEPH_BANNER "ceph v026"
+#define CEPH_BANNER "ceph v027"
 #define CEPH_BANNER_MAX_LEN 30
 
 
@@ -61,7 +61,8 @@ extern const char *ceph_entity_type_name(int type);
  * entity_addr -- network address
  */
 struct ceph_entity_addr {
-	__le64 nonce;  /* unique id for process (e.g. pid) */
+	__le32 type;
+	__le32 nonce;  /* unique id for process (e.g. pid) */
 	struct sockaddr_storage in_addr;
 } __attribute__ ((packed));
 
-- 
cgit v1.2.2


From 0f26c4b21b684825a6dd41f2bc04d48ff62d72f8 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 29 Jan 2010 11:01:11 -0800
Subject: ceph: remove unreachable code

We never truncate to a smaller size without contacting the MDS.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 71e107fb4dbc..a4f573ab232e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1396,7 +1396,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	int release = 0, dirtied = 0;
 	int mask = 0;
 	int err = 0;
-	int queue_trunc = 0;
 
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
@@ -1510,11 +1509,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		if ((issued & CEPH_CAP_FILE_EXCL) &&
 		    attr->ia_size > inode->i_size) {
 			inode->i_size = attr->ia_size;
-			if (attr->ia_size < inode->i_size) {
-				ci->i_truncate_size = attr->ia_size;
-				ci->i_truncate_pending++;
-				queue_trunc = 1;
-			}
 			inode->i_blocks =
 				(attr->ia_size + (1 << 9) - 1) >> 9;
 			inode->i_ctime = attr->ia_ctime;
@@ -1567,9 +1561,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	release &= issued;
 	spin_unlock(&inode->i_lock);
 
-	if (queue_trunc)
-		__ceph_do_pending_vmtruncate(inode);
-
 	if (mask) {
 		req->r_inode = igrab(inode);
 		req->r_inode_drop = release;
-- 
cgit v1.2.2


From d7eecb483cc29e929bbc5515b8def830d7fc6ad2 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@caiaq.de>
Date: Thu, 28 Jan 2010 16:13:01 +0800
Subject: jfs_dmap.[ch]: trivial typo fix: s/heigth/height/g
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniel Mack <daniel@caiaq.de>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: Jiri Kosina <trivial@kernel.org>
Cc: André Goddard Rosa <andre.goddard@gmail.com>
Cc: jfs-discussion@lists.sourceforge.net
---
 fs/jfs/jfs_dmap.c | 16 ++++++++--------
 fs/jfs/jfs_dmap.h |  6 +++---
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..7e19d2fe0098 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -195,7 +195,7 @@ int dbMount(struct inode *ipbmap)
 	bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
 	bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
 	bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
-	bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
+	bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
 	bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
 	bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
 	bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -287,7 +287,7 @@ int dbSync(struct inode *ipbmap)
 	dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
 	dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
 	dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
-	dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
+	dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
 	dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
 	dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
 	dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1440,7 +1440,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 	 * tree index of this allocation group within the control page.
 	 */
 	agperlev =
-	    (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
+	    (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
 	ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
 
 	/* dmap control page trees fan-out by 4 and a single allocation
@@ -1459,7 +1459,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 		 * the subtree to find the leftmost leaf that describes this
 		 * free space.
 		 */
-		for (k = bmp->db_agheigth; k > 0; k--) {
+		for (k = bmp->db_agheight; k > 0; k--) {
 			for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
 				if (l2nb <= dcp->stree[m + n]) {
 					ti = m + n;
@@ -3606,7 +3606,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
 	}
 
 	/*
-	 * compute db_aglevel, db_agheigth, db_width, db_agstart:
+	 * compute db_aglevel, db_agheight, db_width, db_agstart:
 	 * an ag is covered in aglevel dmapctl summary tree,
 	 * at agheight level height (from leaf) with agwidth number of nodes
 	 * each, which starts at agstart index node of the smmary tree node
@@ -3615,9 +3615,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
 	bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
 	l2nl =
 	    bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
-	bmp->db_agheigth = l2nl >> 1;
-	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
-	for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
+	bmp->db_agheight = l2nl >> 1;
+	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
+	for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
 	     i--) {
 		bmp->db_agstart += n;
 		n <<= 2;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
 	__le32 dn_maxag;	/* 4: max active alloc group number	*/
 	__le32 dn_agpref;	/* 4: preferred alloc group (hint)	*/
 	__le32 dn_aglevel;	/* 4: dmapctl level holding the AG	*/
-	__le32 dn_agheigth;	/* 4: height in dmapctl of the AG	*/
+	__le32 dn_agheight;	/* 4: height in dmapctl of the AG	*/
 	__le32 dn_agwidth;	/* 4: width in dmapctl of the AG	*/
 	__le32 dn_agstart;	/* 4: start tree index at AG height	*/
 	__le32 dn_agl2size;	/* 4: l2 num of blks per alloc group	*/
@@ -229,7 +229,7 @@ struct dbmap {
 	int dn_maxag;		/* max active alloc group number	*/
 	int dn_agpref;		/* preferred alloc group (hint)		*/
 	int dn_aglevel;		/* dmapctl level holding the AG		*/
-	int dn_agheigth;	/* height in dmapctl of the AG		*/
+	int dn_agheight;	/* height in dmapctl of the AG		*/
 	int dn_agwidth;		/* width in dmapctl of the AG		*/
 	int dn_agstart;		/* start tree index at AG height	*/
 	int dn_agl2size;	/* l2 num of blks per alloc group	*/
@@ -255,7 +255,7 @@ struct bmap {
 #define	db_agsize	db_bmap.dn_agsize
 #define	db_agl2size	db_bmap.dn_agl2size
 #define	db_agwidth	db_bmap.dn_agwidth
-#define	db_agheigth	db_bmap.dn_agheigth
+#define	db_agheight	db_bmap.dn_agheight
 #define	db_agstart	db_bmap.dn_agstart
 #define	db_numag	db_bmap.dn_numag
 #define	db_maxlevel	db_bmap.dn_maxlevel
-- 
cgit v1.2.2


From 0c948992a00d478c17042f4790b7d6b35299cf94 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Mon, 1 Feb 2010 16:10:45 -0800
Subject: ceph: always send truncation info with read and write osd ops

This fixes a bug where the read/write ops arrive the osd after
a following truncation request.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h    |  2 +-
 fs/ceph/osd_client.c | 16 +++-------------
 fs/ceph/rados.h      |  6 ++----
 3 files changed, 6 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index f3bfc3c4f6e6..004aae59d4ba 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -38,7 +38,7 @@
 #define CEPH_OSD_PROTOCOL     8 /* cluster internal */
 #define CEPH_MDS_PROTOCOL     9 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
-#define CEPH_OSDC_PROTOCOL   23 /* server/client */
+#define CEPH_OSDC_PROTOCOL   24 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
 #define CEPH_MONC_PROTOCOL   15 /* server/client */
 
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index df2106839713..944759b3079f 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -199,11 +199,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	struct ceph_osd_request_head *head;
 	struct ceph_osd_op *op;
 	void *p;
-	int do_trunc = truncate_seq && (off + *plen > truncate_size);
-	int num_op = 1 + do_sync + do_trunc;
+	int num_op = 1 + do_sync;
 	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
 	int err, i;
-	u64 prevofs;
 
 	if (use_mempool) {
 		req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
@@ -268,22 +266,14 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 		req->r_request->hdr.data_len = cpu_to_le32(*plen);
 		op->payload_len = cpu_to_le32(*plen);
 	}
+	op->extent.truncate_size = cpu_to_le64(truncate_size);
+	op->extent.truncate_seq = cpu_to_le32(truncate_seq);
 
 	/* fill in oid */
 	head->object_len = cpu_to_le32(req->r_oid_len);
 	memcpy(p, req->r_oid, req->r_oid_len);
 	p += req->r_oid_len;
 
-	/* additional ops */
-	if (do_trunc) {
-		op++;
-		op->op = cpu_to_le16(opcode == CEPH_OSD_OP_READ ?
-			     CEPH_OSD_OP_MASKTRUNC : CEPH_OSD_OP_SETTRUNC);
-		op->trunc.truncate_seq = cpu_to_le32(truncate_seq);
-		prevofs = le64_to_cpu((op-1)->extent.offset);
-		op->trunc.truncate_size = cpu_to_le64(truncate_size -
-						      (off-prevofs));
-	}
 	if (do_sync) {
 		op++;
 		op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index c5614d4ae34a..123fd845459e 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -304,15 +304,13 @@ struct ceph_osd_op {
 	union {
 		struct {
 			__le64 offset, length;
+			__le64 truncate_size;
+			__le32 truncate_seq;
 		} __attribute__ ((packed)) extent;
 		struct {
 			__le32 name_len;
 			__le32 value_len;
 		} __attribute__ ((packed)) xattr;
-		struct {
-			__le64 truncate_size;
-			__le32 truncate_seq;
-		} __attribute__ ((packed)) trunc;
 		struct {
 			__u8 class_len;
 			__u8 method_len;
-- 
cgit v1.2.2


From 79788c698b290426320e60374ed1324e4b5c69eb Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 2 Feb 2010 16:34:04 -0800
Subject: ceph: release all pages after successful osd write response

We release all the pages, even if the osd response was
different than the number of pages written. This could only
happen due to truncation that arrives the osd in
different order, for which we want the pages released anyway.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a3bd9deb555c..8065dc92c611 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -524,9 +524,13 @@ static void writepages_finish(struct ceph_osd_request *req,
 	bytes = le64_to_cpu(op->extent.length);
 
 	if (rc >= 0) {
-		wrote = (bytes + (offset & ~PAGE_CACHE_MASK) + ~PAGE_CACHE_MASK)
-			>> PAGE_CACHE_SHIFT;
-		WARN_ON(wrote != req->r_num_pages);
+		/*
+		 * Assume we wrote the pages we originally sent.  The
+		 * osd might reply with fewer pages if our writeback
+		 * raced with a truncation and was adjusted at the osd,
+		 * so don't believe the reply.
+		 */
+		wrote = req->r_num_pages;
 	} else {
 		wrote = 0;
 		mapping_set_error(mapping, rc);
-- 
cgit v1.2.2


From c7e337d6490d6f2f5e66ddf1b04d00b0dbd10108 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 2 Feb 2010 16:11:19 -0800
Subject: ceph: buffer decoding helpers

Helper for decoding into a ceph_buffer, and other misc decoding helpers
we will need.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/buffer.c | 17 +++++++++++++++++
 fs/ceph/buffer.h |  2 ++
 fs/ceph/decode.h | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index 2576bd452cb8..b98086c7aeba 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -1,6 +1,7 @@
 
 #include "ceph_debug.h"
 #include "buffer.h"
+#include "decode.h"
 
 struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
 {
@@ -59,3 +60,19 @@ int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
 	return 0;
 }
 
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+	size_t len;
+
+	ceph_decode_need(p, end, sizeof(u32), bad);
+	len = ceph_decode_32(p);
+	dout("decode_buffer len %d\n", (int)len);
+	ceph_decode_need(p, end, len, bad);
+	*b = ceph_buffer_new(len, GFP_NOFS);
+	if (!*b)
+		return -ENOMEM;
+	ceph_decode_copy(p, (*b)->vec.iov_base, len);
+	return 0;
+bad:
+	return -EINVAL;
+}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
index 47b9514c5bbd..58d19014068f 100644
--- a/fs/ceph/buffer.h
+++ b/fs/ceph/buffer.h
@@ -34,4 +34,6 @@ static inline void ceph_buffer_put(struct ceph_buffer *b)
 	kref_put(&b->kref, ceph_buffer_release);
 }
 
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
+
 #endif
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index b90a33b948c7..65b3e022eaf5 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -2,6 +2,7 @@
 #define __CEPH_DECODE_H
 
 #include <asm/unaligned.h>
+#include <linux/time.h>
 
 #include "types.h"
 
@@ -65,6 +66,11 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
 		ceph_decode_need(p, end, sizeof(u16), bad);	\
 		v = ceph_decode_16(p);				\
 	} while (0)
+#define ceph_decode_8_safe(p, end, v, bad)			\
+	do {							\
+		ceph_decode_need(p, end, sizeof(u8), bad);	\
+		v = ceph_decode_8(p);				\
+	} while (0)
 
 #define ceph_decode_copy_safe(p, end, pv, n, bad)		\
 	do {							\
@@ -156,5 +162,33 @@ static inline void ceph_encode_string(void **p, void *end,
 	*p += len;
 }
 
+#define ceph_encode_need(p, end, n, bad)		\
+	do {						\
+		if (unlikely(*(p) + (n) > (end))) 	\
+			goto bad;			\
+	} while (0)
+
+#define ceph_encode_64_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u64), bad);	\
+		ceph_encode_64(p, v);				\
+	} while (0)
+#define ceph_encode_32_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u32), bad);	\
+		ceph_encode_32(p, v);			\
+	} while (0)
+#define ceph_encode_16_safe(p, end, v, bad)			\
+	do {							\
+		ceph_encode_need(p, end, sizeof(u16), bad);	\
+		ceph_encode_16(p, v);			\
+	} while (0)
+
+#define ceph_encode_copy_safe(p, end, pv, n, bad)		\
+	do {							\
+		ceph_encode_need(p, end, n, bad);		\
+		ceph_encode_copy(p, pv, n);			\
+	} while (0)
+
 
 #endif
-- 
cgit v1.2.2


From 8b6e4f2d8b21c25225b1ce8d53a2e03b92cc8522 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 2 Feb 2010 16:07:07 -0800
Subject: ceph: aes crypto and base64 encode/decode helpers

Helpers to encrypt/decrypt AES and base64.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/Kconfig  |   1 +
 fs/ceph/Makefile |   1 +
 fs/ceph/armor.c  |  99 ++++++++++++++
 fs/ceph/crypto.c | 408 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/crypto.h |  48 +++++++
 5 files changed, 557 insertions(+)
 create mode 100644 fs/ceph/armor.c
 create mode 100644 fs/ceph/crypto.c
 create mode 100644 fs/ceph/crypto.h

(limited to 'fs')

diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index bc1fbd956187..04b8280582a9 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -2,6 +2,7 @@ config CEPH_FS
         tristate "Ceph distributed file system (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
 	select LIBCRC32C
+	select CONFIG_CRYPTO_AES
 	help
 	  Choose Y or M here to include support for mounting the
 	  experimental Ceph distributed file system.  Ceph is an extremely
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 47caf2f1b75a..85a588e43e7d 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -14,6 +14,7 @@ ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
 	osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
 	debugfs.o \
 	auth.o auth_none.o \
+	crypto.o armor.o \
 	ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
 
 else
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
+
+#include <linux/errno.h>
+
+/*
+ * base64 encode/decode.
+ */
+
+const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+	return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+	if (c >= 'A' && c <= 'Z')
+		return c - 'A';
+	if (c >= 'a' && c <= 'z')
+		return c - 'a' + 26;
+	if (c >= '0' && c <= '9')
+		return c - '0' + 52;
+	if (c == '+')
+		return 62;
+	if (c == '/')
+		return 63;
+	if (c == '=')
+		return 0; /* just non-negative, please */
+	return -EINVAL;
+}
+
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+	int olen = 0;
+	int line = 0;
+
+	while (src < end) {
+		unsigned char a, b, c;
+
+		a = *src++;
+		*dst++ = encode_bits(a >> 2);
+		if (src < end) {
+			b = *src++;
+			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+			if (src < end) {
+				c = *src++;
+				*dst++ = encode_bits(((b & 15) << 2) |
+						     (c >> 6));
+				*dst++ = encode_bits(c & 63);
+			} else {
+				*dst++ = encode_bits((b & 15) << 2);
+				*dst++ = '=';
+			}
+		} else {
+			*dst++ = encode_bits(((a & 3) << 4));
+			*dst++ = '=';
+			*dst++ = '=';
+		}
+		olen += 4;
+		line += 4;
+		if (line == 64) {
+			line = 0;
+			*(dst++) = '\n';
+			olen++;
+		}
+	}
+	return olen;
+}
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+	int olen = 0;
+
+	while (src < end) {
+		int a, b, c, d;
+
+		if (src < end && src[0] == '\n')
+			src++;
+		if (src + 4 > end)
+			return -EINVAL;
+		a = decode_bits(src[0]);
+		b = decode_bits(src[1]);
+		c = decode_bits(src[2]);
+		d = decode_bits(src[3]);
+		if (a < 0 || b < 0 || c < 0 || d < 0)
+			return -EINVAL;
+
+		*dst++ = (a << 2) | (b >> 4);
+		if (src[2] == '=')
+			return olen + 1;
+		*dst++ = ((b & 15) << 4) | (c >> 2);
+		if (src[3] == '=')
+			return olen + 2;
+		*dst++ = ((c & 3) << 6) | d;
+		olen += 3;
+		src += 4;
+	}
+	return olen;
+}
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..291ac288e791
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,408 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <crypto/hash.h>
+
+#include "crypto.h"
+#include "decode.h"
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	if (*p + sizeof(u16) + sizeof(key->created) +
+	    sizeof(u16) + key->len > end)
+		return -ERANGE;
+	ceph_encode_16(p, key->type);
+	ceph_encode_copy(p, &key->created, sizeof(key->created));
+	ceph_encode_16(p, key->len);
+	ceph_encode_copy(p, key->key, key->len);
+	return 0;
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+	ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+	key->type = ceph_decode_16(p);
+	ceph_decode_copy(p, &key->created, sizeof(key->created));
+	key->len = ceph_decode_16(p);
+	ceph_decode_need(p, end, key->len, bad);
+	key->key = kmalloc(key->len, GFP_NOFS);
+	if (!key->key)
+		return -ENOMEM;
+	ceph_decode_copy(p, key->key, key->len);
+	return 0;
+
+bad:
+	dout("failed to decode crypto key\n");
+	return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+	int inlen = strlen(inkey);
+	int blen = inlen * 3 / 4;
+	void *buf, *p;
+	int ret;
+
+	dout("crypto_key_unarmor %s\n", inkey);
+	buf = kmalloc(blen, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+	blen = ceph_unarmor(buf, inkey, inkey+inlen);
+	if (blen < 0) {
+		kfree(buf);
+		return blen;
+	}
+
+	p = buf;
+	ret = ceph_crypto_key_decode(key, &p, p + blen);
+	kfree(buf);
+	if (ret)
+		return ret;
+	dout("crypto_key_unarmor key %p type %d len %d\n", key,
+	     key->type, key->len);
+	return 0;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+const u8 *aes_iv = "cephsageyudagreg";
+
+int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+		     const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[2], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - (src_len & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src_len + zero_padding;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 2);
+	sg_set_buf(&sg_in[0], src, src_len);
+	sg_set_buf(&sg_in[1], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+	/*
+	print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+			src, src_len, 1);
+	print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+			pad, zero_padding, 1);
+	*/
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src_len + zero_padding);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt failed %d\n", ret);
+	/*
+	print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
+		      const void *src1, size_t src1_len,
+		      const void *src2, size_t src2_len)
+{
+	struct scatterlist sg_in[3], sg_out[1];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+	int ret;
+	void *iv;
+	int ivsize;
+	size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+	char pad[16];
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	memset(pad, zero_padding, zero_padding);
+
+	*dst_len = src1_len + src2_len + zero_padding;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 3);
+	sg_set_buf(&sg_in[0], src1, src1_len);
+	sg_set_buf(&sg_in[1], src2, src2_len);
+	sg_set_buf(&sg_in[2], pad, zero_padding);
+	sg_init_table(sg_out, 1);
+	sg_set_buf(sg_out, dst, *dst_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+	/*
+	print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+			src1, src1_len, 1);
+	print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+			src2, src2_len, 1);
+	print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
+			pad, zero_padding, 1);
+	*/
+	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+				     src1_len + src2_len + zero_padding);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0)
+		pr_err("ceph_aes_crypt2 failed %d\n", ret);
+	/*
+	print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+		     const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[2];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	sg_init_table(sg_in, 1);
+	sg_init_table(sg_out, 2);
+	sg_set_buf(sg_in, src, src_len);
+	sg_set_buf(&sg_out[0], dst, *dst_len);
+	sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	/*
+	print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
+		       src, src_len, 1);
+	*/
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	if (src_len <= *dst_len)
+		last_byte = ((char *)dst)[src_len - 1];
+	else
+		last_byte = pad[src_len - *dst_len - 1];
+	if (last_byte <= 16 && src_len >= last_byte) {
+		*dst_len = src_len - last_byte;
+	} else {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+	/*
+	print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst, *dst_len, 1);
+	*/
+	return 0;
+}
+
+int ceph_aes_decrypt2(const void *key, int key_len,
+		      void *dst1, size_t *dst1_len,
+		      void *dst2, size_t *dst2_len,
+		      const void *src, size_t src_len)
+{
+	struct scatterlist sg_in[1], sg_out[3];
+	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+	struct blkcipher_desc desc = { .tfm = tfm };
+	char pad[16];
+	void *iv;
+	int ivsize;
+	int ret;
+	int last_byte;
+
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	sg_init_table(sg_in, 1);
+	sg_set_buf(sg_in, src, src_len);
+	sg_init_table(sg_out, 3);
+	sg_set_buf(&sg_out[0], dst1, *dst1_len);
+	sg_set_buf(&sg_out[1], dst2, *dst2_len);
+	sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
+	iv = crypto_blkcipher_crt(tfm)->iv;
+	ivsize = crypto_blkcipher_ivsize(tfm);
+
+	memcpy(iv, aes_iv, ivsize);
+
+	/*
+	print_hex_dump(KERN_ERR, "dec  key: ", DUMP_PREFIX_NONE, 16, 1,
+		       key, key_len, 1);
+	print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
+		       src, src_len, 1);
+	*/
+
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+	crypto_free_blkcipher(tfm);
+	if (ret < 0) {
+		pr_err("ceph_aes_decrypt failed %d\n", ret);
+		return ret;
+	}
+
+	if (src_len <= *dst1_len)
+		last_byte = ((char *)dst1)[src_len - 1];
+	else if (src_len <= *dst1_len + *dst2_len)
+		last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+	else
+		last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+	if (last_byte <= 16 && src_len >= last_byte) {
+		src_len -= last_byte;
+	} else {
+		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+		       last_byte, (int)src_len);
+		return -EPERM;  /* bad padding */
+	}
+
+	if (src_len < *dst1_len) {
+		*dst1_len = src_len;
+		*dst2_len = 0;
+	} else {
+		*dst2_len = src_len - *dst1_len;
+	}
+	/*
+	print_hex_dump(KERN_ERR, "dec  out1: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst1, *dst1_len, 1);
+	print_hex_dump(KERN_ERR, "dec  out2: ", DUMP_PREFIX_NONE, 16, 1,
+		       dst2, *dst2_len, 1);
+	*/
+
+	return 0;
+}
+
+
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src_len)
+			return -ERANGE;
+		memcpy(dst, src, src_len);
+		*dst_len = src_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_decrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len)
+{
+	size_t t;
+
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst1_len + *dst2_len < src_len)
+			return -ERANGE;
+		t = min(*dst1_len, src_len);
+		memcpy(dst1, src, t);
+		*dst1_len = t;
+		src += t;
+		src_len -= t;
+		if (src_len) {
+			t = min(*dst2_len, src_len);
+			memcpy(dst2, src, t);
+			*dst2_len = t;
+		}
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_decrypt2(secret->key, secret->len,
+					 dst1, dst1_len, dst2, dst2_len,
+					 src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		 const void *src, size_t src_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src_len)
+			return -ERANGE;
+		memcpy(dst, src, src_len);
+		*dst_len = src_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_encrypt(secret->key, secret->len, dst,
+					dst_len, src, src_len);
+
+	default:
+		return -EINVAL;
+	}
+}
+
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+		  const void *src1, size_t src1_len,
+		  const void *src2, size_t src2_len)
+{
+	switch (secret->type) {
+	case CEPH_CRYPTO_NONE:
+		if (*dst_len < src1_len + src2_len)
+			return -ERANGE;
+		memcpy(dst, src1, src1_len);
+		memcpy(dst + src1_len, src2, src2_len);
+		*dst_len = src1_len + src2_len;
+		return 0;
+
+	case CEPH_CRYPTO_AES:
+		return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+					 src1, src1_len, src2, src2_len);
+
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include "types.h"
+#include "buffer.h"
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+	int type;
+	struct ceph_timespec created;
+	int len;
+	void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+	kfree(key->key);
+}
+
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+				  void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+			void *dst, size_t *dst_len,
+			const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+			void *dst1, size_t *dst1_len,
+			void *dst2, size_t *dst2_len,
+			const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+			 void *dst, size_t *dst_len,
+			 const void *src1, size_t src1_len,
+			 const void *src2, size_t src2_len);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const void *src, const void *end);
+extern int ceph_unarmor(void *dst, const char *src, const char *end);
+
+#endif
-- 
cgit v1.2.2


From 9bd2e6f8ba71facf1cadb7154a7e0e4d345a6aba Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 2 Feb 2010 16:21:06 -0800
Subject: ceph: allow renewal of auth credentials

Add infrastructure to allow the mon_client to periodically renew its auth
credentials.  Also add a messenger callback that will force such a renewal
if a peer rejects our authenticator.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth.c       | 61 ++++++++++++++++++++++++++++++++++++----------------
 fs/ceph/auth.h       |  7 ++++++
 fs/ceph/mds_client.c | 13 +++++++++++
 fs/ceph/messenger.c  |  9 ++++++++
 fs/ceph/messenger.h  |  1 +
 fs/ceph/mon_client.c | 55 ++++++++++++++++++++++++++++++++++++++++------
 fs/ceph/mon_client.h |  3 +++
 fs/ceph/osd_client.c | 12 +++++++++++
 fs/ceph/super.c      | 12 +++++------
 fs/ceph/super.h      |  4 ++--
 10 files changed, 144 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 32f2e2a021ab..d5872d4f92bf 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -125,6 +125,30 @@ bad:
 	return -ERANGE;
 }
 
+int ceph_build_auth_request(struct ceph_auth_client *ac,
+			   void *msg_buf, size_t msg_len)
+{
+	struct ceph_mon_request_header *monhdr = msg_buf;
+	void *p = monhdr + 1;
+	void *end = msg_buf + msg_len;
+	int ret;
+
+	monhdr->have_version = 0;
+	monhdr->session_mon = cpu_to_le16(-1);
+	monhdr->session_mon_tid = 0;
+
+	ceph_encode_32(&p, ac->protocol);
+
+	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+	if (ret < 0) {
+		pr_err("error %d building request\n", ret);
+		return ret;
+	}
+	dout(" built request %d bytes\n", ret);
+	ceph_encode_32(&p, ret);
+	return p + ret - msg_buf;
+}
+
 /*
  * Handle auth message from monitor.
  */
@@ -188,28 +212,13 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 				goto out;
 			}
 		}
+
+		ac->negotiating = false;
 	}
 
 	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
 	if (ret == -EAGAIN) {
-		struct ceph_mon_request_header *monhdr = reply_buf;
-		void *p = reply_buf + 1;
-		void *end = reply_buf + reply_len;
-
-		monhdr->have_version = 0;
-		monhdr->session_mon = cpu_to_le16(-1);
-		monhdr->session_mon_tid = 0;
-
-		ceph_encode_32(&p, ac->protocol);
-
-		ret = ac->ops->build_request(ac, p + sizeof(u32), end);
-		if (ret < 0) {
-			pr_err("error %d building request\n", ret);
-			goto out;
-		}
-		dout(" built request %d bytes\n", ret);
-		ceph_encode_32(&p, ret);
-		return p + ret - reply_buf;
+		return ceph_build_auth_request(ac, reply_buf, reply_len);
 	} else if (ret) {
 		pr_err("authentication error %d\n", ret);
 		return ret;
@@ -222,4 +231,20 @@ out:
 	return ret;
 }
 
+int ceph_build_auth(struct ceph_auth_client *ac,
+		    void *msg_buf, size_t msg_len)
+{
+	if (!ac->protocol)
+		return ceph_auth_build_hello(ac, msg_buf, msg_len);
+	BUG_ON(!ac->ops);
+	if (!ac->ops->is_authenticated(ac))
+		return ceph_build_auth_request(ac, msg_buf, msg_len);
+	return 0;
+}
 
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+	if (!ac->ops)
+		return 0;
+	return ac->ops->is_authenticated(ac);
+}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index 4d8cdf6bb3b6..ca4f57cfb267 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -42,6 +42,8 @@ struct ceph_auth_client_ops {
 				       struct ceph_authorizer *a, size_t len);
 	void (*destroy_authorizer)(struct ceph_auth_client *ac,
 				   struct ceph_authorizer *a);
+	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
+				      int peer_type);
 
 	/* reset when we (re)connect to a monitor */
 	void (*reset)(struct ceph_auth_client *ac);
@@ -74,4 +76,9 @@ extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 				  void *reply_buf, size_t reply_len);
 extern int ceph_entity_name_encode(const char *name, void **p, void *end);
 
+extern int ceph_build_auth(struct ceph_auth_client *ac,
+		    void *msg_buf, size_t msg_len);
+
+extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+
 #endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4e3e8b229e67..aa8506bad42d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2946,12 +2946,25 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 	return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
 }
 
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mds_client *mdsc = s->s_mdsc;
+	struct ceph_auth_client *ac = mdsc->client->monc.auth;
+
+	if (ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
+
+	return ceph_monc_validate_auth(&mdsc->client->monc);
+}
+
 const static struct ceph_connection_operations mds_con_ops = {
 	.get = con_get,
 	.put = con_put,
 	.dispatch = dispatch,
 	.get_authorizer = get_authorizer,
 	.verify_authorizer_reply = verify_authorizer_reply,
+	.invalidate_authorizer = invalidate_authorizer,
 	.peer_reset = peer_reset,
 };
 
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index e4e8d4439d3a..c4341784ec8f 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1849,6 +1849,15 @@ static void ceph_fault(struct ceph_connection *con)
 		con->in_msg = NULL;
 	}
 
+	/*
+	 * in case we faulted due to authentication, invalidate our
+	 * current tickets so that we can get new ones.
+         */
+	if (con->auth_retry && con->ops->invalidate_authorizer) {
+		dout("calling invalidate_authorizer()\n");
+		con->ops->invalidate_authorizer(con);
+	}
+
 	/* If there are no messages in the queue, place the connection
 	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
 	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index c26a3d8aa78c..c9735378be3f 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -32,6 +32,7 @@ struct ceph_connection_operations {
 			       void **buf, int *len, int *proto,
 			       void **reply_buf, int *reply_len, int force_new);
 	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+	int (*invalidate_authorizer)(struct ceph_connection *con);
 
 	/* protocol version mismatch */
 	void (*bad_proto) (struct ceph_connection *con);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 3f7ae7f73c50..fec41a0eff86 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -29,6 +29,8 @@
 
 const static struct ceph_connection_operations mon_con_ops;
 
+static int __validate_auth(struct ceph_mon_client *monc);
+
 /*
  * Decode a monmap blob (e.g., during mount).
  */
@@ -103,6 +105,7 @@ static void __close_session(struct ceph_mon_client *monc)
 		ceph_con_revoke(monc->con, monc->m_auth);
 		ceph_con_close(monc->con);
 		monc->cur_mon = -1;
+		monc->pending_auth = 0;
 		ceph_auth_reset(monc->auth);
 	}
 }
@@ -334,7 +337,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 
 out:
 	mutex_unlock(&monc->mutex);
-	wake_up(&client->mount_wq);
+	wake_up(&client->auth_wq);
 }
 
 /*
@@ -477,6 +480,11 @@ static void delayed_work(struct work_struct *work)
 		__open_session(monc);  /* continue hunting */
 	} else {
 		ceph_con_keepalive(monc->con);
+		mutex_unlock(&monc->mutex);
+
+		__validate_auth(monc);
+
+		mutex_lock(&monc->mutex);
 		if (monc->auth->ops->is_authenticated(monc->auth))
 			__send_subscribe(monc);
 	}
@@ -557,6 +565,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 		goto out_pool2;
 
 	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+	monc->pending_auth = 0;
 	if (IS_ERR(monc->m_auth)) {
 		err = PTR_ERR(monc->m_auth);
 		monc->m_auth = NULL;
@@ -614,6 +623,15 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 	kfree(monc->monmap);
 }
 
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+	monc->pending_auth = 1;
+	monc->m_auth->front.iov_len = len;
+	monc->m_auth->hdr.front_len = cpu_to_le32(len);
+	ceph_msg_get(monc->m_auth);  /* keep our ref */
+	ceph_con_send(monc->con, monc->m_auth);
+}
+
 
 static void handle_auth_reply(struct ceph_mon_client *monc,
 			      struct ceph_msg *msg)
@@ -621,18 +639,16 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 	int ret;
 
 	mutex_lock(&monc->mutex);
+	monc->pending_auth = 0;
 	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
 				     msg->front.iov_len,
 				     monc->m_auth->front.iov_base,
 				     monc->m_auth->front_max);
 	if (ret < 0) {
-		monc->client->mount_err = ret;
-		wake_up(&monc->client->mount_wq);
+		monc->client->auth_err = ret;
+		wake_up(&monc->client->auth_wq);
 	} else if (ret > 0) {
-		monc->m_auth->front.iov_len = ret;
-		monc->m_auth->hdr.front_len = cpu_to_le32(ret);
-		ceph_msg_get(monc->m_auth);  /* keep our ref */
-		ceph_con_send(monc->con, monc->m_auth);
+		__send_prepared_auth_request(monc, ret);
 	} else if (monc->auth->ops->is_authenticated(monc->auth)) {
 		dout("authenticated, starting session\n");
 
@@ -645,6 +661,31 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 	mutex_unlock(&monc->mutex);
 }
 
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	if (monc->pending_auth)
+		return 0;
+
+	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+			      monc->m_auth->front_max);
+	if (ret <= 0)
+		return ret; /* either an error, or no need to authenticate */
+	__send_prepared_auth_request(monc, ret);
+	return 0;
+}
+
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = __validate_auth(monc);
+	mutex_unlock(&monc->mutex);
+	return ret;
+}
+
 /*
  * handle incoming message
  */
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index c75b53302ecc..5ca8e48d4379 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -61,6 +61,7 @@ struct ceph_mon_client {
 
 	struct ceph_auth_client *auth;
 	struct ceph_msg *m_auth;
+	int pending_auth;
 
 	bool hunting;
 	int cur_mon;                       /* last monitor i contacted */
@@ -110,6 +111,8 @@ extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
 
 extern int ceph_monc_open_session(struct ceph_mon_client *monc);
 
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+
 
 
 #endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 944759b3079f..35c8afea13ec 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1448,6 +1448,17 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
 	return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
 }
 
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_osd_client *osdc = o->o_osdc;
+	struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+	if (ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+
+	return ceph_monc_validate_auth(&osdc->client->monc);
+}
 
 const static struct ceph_connection_operations osd_con_ops = {
 	.get = get_osd_con,
@@ -1455,6 +1466,7 @@ const static struct ceph_connection_operations osd_con_ops = {
 	.dispatch = dispatch,
 	.get_authorizer = get_authorizer,
 	.verify_authorizer_reply = verify_authorizer_reply,
+	.invalidate_authorizer = invalidate_authorizer,
 	.alloc_msg = alloc_msg,
 	.fault = osd_reset,
 };
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index cd81c84e96fc..3a2548951fe6 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -542,7 +542,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 
 	mutex_init(&client->mount_mutex);
 
-	init_waitqueue_head(&client->mount_wq);
+	init_waitqueue_head(&client->auth_wq);
 
 	client->sb = NULL;
 	client->mount_state = CEPH_MOUNT_MOUNTING;
@@ -550,7 +550,7 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 
 	client->msgr = NULL;
 
-	client->mount_err = 0;
+	client->auth_err = 0;
 	atomic_long_set(&client->writeback_count, 0);
 
 	err = bdi_init(&client->backing_dev_info);
@@ -742,13 +742,13 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
 
 		/* wait */
 		dout("mount waiting for mon_map\n");
-		err = wait_event_interruptible_timeout(client->mount_wq, /* FIXME */
-			       have_mon_map(client) || (client->mount_err < 0),
+		err = wait_event_interruptible_timeout(client->auth_wq,
+			       have_mon_map(client) || (client->auth_err < 0),
 			       timeout);
 		if (err == -EINTR || err == -ERESTARTSYS)
 			goto out;
-		if (client->mount_err < 0) {
-			err = client->mount_err;
+		if (client->auth_err < 0) {
+			err = client->auth_err;
 			goto out;
 		}
 	}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 62d9ae482d72..770f7b507fce 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -123,9 +123,9 @@ struct ceph_client {
 	struct super_block *sb;
 
 	unsigned long mount_state;
-	wait_queue_head_t mount_wq;
+	wait_queue_head_t auth_wq;
 
-	int mount_err;
+	int auth_err;
 
 	struct ceph_messenger *msgr;   /* messenger instance */
 	struct ceph_mon_client monc;
-- 
cgit v1.2.2


From 07c8739c521cb029d0f3549556aae2d304513978 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 4 Feb 2010 09:42:20 -0800
Subject: ceph: add struct version to auth encoding

Inlucde struct version in encoding. This will streamline future protocol
changes.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth.c      | 3 +++
 fs/ceph/auth_none.c | 1 +
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index d5872d4f92bf..b34ce0e41b4c 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -107,8 +107,11 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
 	lenp = p;
 	p += sizeof(u32);
 
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	ceph_encode_8(&p, 1);
 	num = ARRAY_SIZE(supported_protocols);
 	ceph_encode_32(&p, num);
+	ceph_decode_need(&p, end, num * sizeof(u32), bad);
 	for (i = 0; i < num; i++)
 		ceph_encode_32(&p, supported_protocols[i]);
 
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 631017eb7117..b4ef6f0a6c85 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -62,6 +62,7 @@ static int ceph_auth_none_create_authorizer(
 	if (!ai->built_authorizer) {
 		p = au->buf;
 		end = p + sizeof(au->buf);
+		ceph_encode_8(&p, 1);
 		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
 		if (ret < 0)
 			goto bad;
-- 
cgit v1.2.2


From ec0994e48ea2aebf62ff08376227f3a9ccf46262 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 2 Feb 2010 16:25:35 -0800
Subject: ceph: add support for auth_x authentication protocol

The auth_x protocol implements support for a kerberos-like mutual
authentication infrastructure used by Ceph.  We do not simply use vanilla
kerberos because of scalability and performance issues when dealing with
a large cluster of nodes providing a single logical service.

Auth_x provides mutual authentication of client and server and protects
against replay and man in the middle attacks.  It does not encrypt
the full session over the wire, however, so data payload may still be
snooped.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/Makefile          |   1 +
 fs/ceph/auth.c            |   6 +-
 fs/ceph/auth_x.c          | 656 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/auth_x.h          |  49 ++++
 fs/ceph/auth_x_protocol.h |  90 +++++++
 5 files changed, 801 insertions(+), 1 deletion(-)
 create mode 100644 fs/ceph/auth_x.c
 create mode 100644 fs/ceph/auth_x.h
 create mode 100644 fs/ceph/auth_x_protocol.h

(limited to 'fs')

diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 85a588e43e7d..6a660e610be8 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -15,6 +15,7 @@ ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
 	debugfs.o \
 	auth.o auth_none.o \
 	crypto.o armor.o \
+	auth_x.o \
 	ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
 
 else
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index b34ce0e41b4c..abb204fea6c7 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -5,6 +5,7 @@
 
 #include "types.h"
 #include "auth_none.h"
+#include "auth_x.h"
 #include "decode.h"
 #include "super.h"
 
@@ -14,7 +15,8 @@
  * get protocol handler
  */
 static u32 supported_protocols[] = {
-	CEPH_AUTH_NONE
+	CEPH_AUTH_NONE,
+	CEPH_AUTH_CEPHX
 };
 
 int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
@@ -22,6 +24,8 @@ int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
 	switch (protocol) {
 	case CEPH_AUTH_NONE:
 		return ceph_auth_none_init(ac);
+	case CEPH_AUTH_CEPHX:
+		return ceph_x_init(ac);
 	default:
 		return -ENOENT;
 	}
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..f0318427b6da
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,656 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+#include "crypto.h"
+#include "auth.h"
+#include "decode.h"
+
+struct kmem_cache *ceph_x_ticketbuf_cachep;
+
+#define TEMP_TICKET_BUF_LEN	256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+
+	ceph_x_validate_tickets(ac, &need);
+	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+	     ac->want_keys, need, xi->have_keys);
+	return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+			  void *ibuf, int ilen, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head = {
+		.struct_v = 1,
+		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+	};
+	size_t len = olen - sizeof(u32);
+	int ret;
+
+	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+			    &head, sizeof(head), ibuf, ilen);
+	if (ret)
+		return ret;
+	ceph_encode_32(&obuf, len);
+	return len + sizeof(u32);
+}
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+			  void **p, void *end, void *obuf, size_t olen)
+{
+	struct ceph_x_encrypt_header head;
+	size_t head_len = sizeof(head);
+	int len, ret;
+
+	len = ceph_decode_32(p);
+	if (*p + len > end)
+		return -EINVAL;
+
+	dout("ceph_x_decrypt len %d\n", len);
+	ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+			    *p, len);
+	if (ret)
+		return ret;
+	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+		return -EPERM;
+	*p += len;
+	return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
+						 int service)
+{
+	struct ceph_x_ticket_handler *th;
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+	while (*p) {
+		parent = *p;
+		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+		if (service < th->service)
+			p = &(*p)->rb_left;
+		else if (service > th->service)
+			p = &(*p)->rb_right;
+		else
+			return th;
+	}
+
+	/* add it */
+	th = kzalloc(sizeof(*th), GFP_NOFS);
+	if (!th)
+		return ERR_PTR(-ENOMEM);
+	th->service = service;
+	rb_link_node(&th->node, parent, p);
+	rb_insert_color(&th->node, &xi->ticket_handlers);
+	return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+				  struct ceph_x_ticket_handler *th)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("remove_ticket_handler %p %d\n", th, th->service);
+	rb_erase(&th->node, &xi->ticket_handlers);
+	ceph_crypto_key_destroy(&th->session_key);
+	if (th->ticket_blob)
+		ceph_buffer_put(th->ticket_blob);
+	kfree(th);
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+				    struct ceph_crypto_key *secret,
+				    void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int num;
+	void *p = buf;
+	int ret;
+	char *dbuf;
+	char *ticket_buf;
+	u8 struct_v;
+
+	dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
+	if (!dbuf)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
+				      GFP_NOFS | GFP_ATOMIC);
+	if (!ticket_buf)
+		goto out_dbuf;
+
+	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+	struct_v = ceph_decode_8(&p);
+	if (struct_v != 1)
+		goto bad;
+	num = ceph_decode_32(&p);
+	dout("%d tickets\n", num);
+	while (num--) {
+		int type;
+		u8 struct_v;
+		struct ceph_x_ticket_handler *th;
+		void *dp, *dend;
+		int dlen;
+		char is_enc;
+		struct timespec validity;
+		struct ceph_crypto_key old_key;
+		void *tp, *tpend;
+
+		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+		type = ceph_decode_32(&p);
+		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+		struct_v = ceph_decode_8(&p);
+		if (struct_v != 1)
+			goto bad;
+
+		th = get_ticket_handler(ac, type);
+		if (IS_ERR(th)) {
+			ret = PTR_ERR(th);
+			goto out;
+		}
+
+		/* blob for me */
+		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+				      TEMP_TICKET_BUF_LEN);
+		if (dlen <= 0) {
+			ret = dlen;
+			goto out;
+		}
+		dout(" decrypted %d bytes\n", dlen);
+		dend = dbuf + dlen;
+		dp = dbuf;
+
+		struct_v = ceph_decode_8(&dp);
+		if (struct_v != 1)
+			goto bad;
+
+		memcpy(&old_key, &th->session_key, sizeof(old_key));
+		ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
+		if (ret)
+			goto out;
+
+		ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
+		ceph_decode_timespec(&validity, &th->validity);
+		th->expires = get_seconds() + validity.tv_sec;
+		th->renew_after = th->expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", th->expires,
+		     th->renew_after);
+
+		/* ticket blob for service */
+		ceph_decode_8_safe(&p, end, is_enc, bad);
+		tp = ticket_buf;
+		if (is_enc) {
+			/* encrypted */
+			dout(" encrypted ticket\n");
+			dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+					      TEMP_TICKET_BUF_LEN);
+			if (dlen < 0) {
+				ret = dlen;
+				goto out;
+			}
+			dlen = ceph_decode_32(&tp);
+		} else {
+			/* unencrypted */
+			ceph_decode_32_safe(&p, end, dlen, bad);
+			ceph_decode_need(&p, end, dlen, bad);
+			ceph_decode_copy(&p, ticket_buf, dlen);
+		}
+		tpend = tp + dlen;
+		dout(" ticket blob is %d bytes\n", dlen);
+		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
+		struct_v = ceph_decode_8(&tp);
+		th->secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
+		if (ret)
+			goto out;
+		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+		     type, ceph_entity_type_name(type), th->secret_id,
+		     (int)th->ticket_blob->vec.iov_len);
+		xi->have_keys |= th->service;
+	}
+
+	ret = 0;
+out:
+	kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
+out_dbuf:
+	kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
+	return ret;
+
+bad:
+	ret = -EINVAL;
+	goto out;
+}
+
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+				   struct ceph_x_ticket_handler *th,
+				   struct ceph_x_authorizer *au)
+{
+	int len;
+	struct ceph_x_authorize_a *msg_a;
+	struct ceph_x_authorize_b msg_b;
+	void *p, *end;
+	int ret;
+	int ticket_blob_len =
+		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+	dout("build_authorizer for %s %p\n",
+	     ceph_entity_type_name(th->service), au);
+
+	len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
+		ticket_blob_len + 16;
+	dout("  need len %d\n", len);
+	if (au->buf && au->buf->alloc_len < len) {
+		ceph_buffer_put(au->buf);
+		au->buf = NULL;
+	}
+	if (!au->buf) {
+		au->buf = ceph_buffer_new(len, GFP_NOFS);
+		if (!au->buf)
+			return -ENOMEM;
+	}
+	au->service = th->service;
+
+	msg_a = au->buf->vec.iov_base;
+	msg_a->struct_v = 1;
+	msg_a->global_id = cpu_to_le64(ac->global_id);
+	msg_a->service_id = cpu_to_le32(th->service);
+	msg_a->ticket_blob.struct_v = 1;
+	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+	if (ticket_blob_len) {
+		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+		       th->ticket_blob->vec.iov_len);
+	}
+	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+	     le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+	p = msg_a + 1;
+	p += ticket_blob_len;
+	end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+	get_random_bytes(&au->nonce, sizeof(au->nonce));
+	msg_b.struct_v = 1;
+	msg_b.nonce = cpu_to_le64(au->nonce);
+	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+			     p, end - p);
+	if (ret < 0)
+		goto out_buf;
+	p += ret;
+	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+	dout(" built authorizer nonce %llx len %d\n", au->nonce,
+	     (int)au->buf->vec.iov_len);
+	return 0;
+
+out_buf:
+	ceph_buffer_put(au->buf);
+	au->buf = NULL;
+	return ret;
+}
+
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+				void **p, void *end)
+{
+	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+	ceph_encode_8(p, 1);
+	ceph_encode_64(p, th->secret_id);
+	if (th->ticket_blob) {
+		const char *buf = th->ticket_blob->vec.iov_base;
+		u32 len = th->ticket_blob->vec.iov_len;
+
+		ceph_encode_32_safe(p, end, len, bad);
+		ceph_encode_copy_safe(p, end, buf, len, bad);
+	} else {
+		ceph_encode_32_safe(p, end, 0, bad);
+	}
+
+	return 0;
+bad:
+	return -ERANGE;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+	int want = ac->want_keys;
+	struct ceph_x_info *xi = ac->private;
+	int service;
+
+	*pneed = ac->want_keys & ~(xi->have_keys);
+
+	for (service = 1; service <= want; service <<= 1) {
+		struct ceph_x_ticket_handler *th;
+
+		if (!(ac->want_keys & service))
+			continue;
+
+		if (*pneed & service)
+			continue;
+
+		th = get_ticket_handler(ac, service);
+
+		if (!th) {
+			*pneed |= service;
+			continue;
+		}
+
+		if (get_seconds() >= th->renew_after)
+			*pneed |= service;
+		if (get_seconds() >= th->expires)
+			xi->have_keys &= ~service;
+	}
+}
+
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+				void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	int need;
+	struct ceph_x_request_header *head = buf;
+	int ret;
+	struct ceph_x_ticket_handler *th =
+		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+	ceph_x_validate_tickets(ac, &need);
+
+	dout("build_request want %x have %x need %x\n",
+	     ac->want_keys, xi->have_keys, need);
+
+	if (need & CEPH_ENTITY_TYPE_AUTH) {
+		struct ceph_x_authenticate *auth = (void *)(head + 1);
+		void *p = auth + 1;
+		struct ceph_x_challenge_blob tmp;
+		char tmp_enc[40];
+		u64 *u;
+
+		if (p > end)
+			return -ERANGE;
+
+		dout(" get_auth_session_key\n");
+		head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+		/* encrypt and hash */
+		get_random_bytes(&auth->client_challenge, sizeof(u64));
+		tmp.client_challenge = auth->client_challenge;
+		tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+		ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+				     tmp_enc, sizeof(tmp_enc));
+		if (ret < 0)
+			return ret;
+
+		auth->struct_v = 1;
+		auth->key = 0;
+		for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+			auth->key ^= *u;
+		dout(" server_challenge %llx client_challenge %llx key %llx\n",
+		     xi->server_challenge, le64_to_cpu(auth->client_challenge),
+		     le64_to_cpu(auth->key));
+
+		/* now encode the old ticket if exists */
+		ret = ceph_x_encode_ticket(th, &p, end);
+		if (ret < 0)
+			return ret;
+
+		return p - buf;
+	}
+
+	if (need) {
+		void *p = head + 1;
+		struct ceph_x_service_ticket_request *req;
+
+		if (p > end)
+			return -ERANGE;
+		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+		BUG_ON(!th);
+		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+		if (ret)
+			return ret;
+		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+				 xi->auth_authorizer.buf->vec.iov_len);
+
+		req = p;
+		req->keys = cpu_to_le32(need);
+		p += sizeof(*req);
+		return p - buf;
+	}
+
+	return 0;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+			       void *buf, void *end)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct ceph_x_reply_header *head = buf;
+	struct ceph_x_ticket_handler *th;
+	int len = end - buf;
+	int op;
+	int ret;
+
+	if (result)
+		return result;  /* XXX hmm? */
+
+	if (xi->starting) {
+		/* it's a hello */
+		struct ceph_x_server_challenge *sc = buf;
+
+		if (len != sizeof(*sc))
+			return -EINVAL;
+		xi->server_challenge = le64_to_cpu(sc->server_challenge);
+		dout("handle_reply got server challenge %llx\n",
+		     xi->server_challenge);
+		xi->starting = false;
+		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+		return -EAGAIN;
+	}
+
+	op = le32_to_cpu(head->op);
+	result = le32_to_cpu(head->result);
+	dout("handle_reply op %d result %d\n", op, result);
+	switch (op) {
+	case CEPHX_GET_AUTH_SESSION_KEY:
+		/* verify auth key */
+		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+					       buf + sizeof(*head), end);
+		break;
+
+	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+		BUG_ON(!th);
+		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+					       buf + sizeof(*head), end);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (ret)
+		return ret;
+	if (ac->want_keys == xi->have_keys)
+		return 0;
+	return -EAGAIN;
+}
+
+static int ceph_x_create_authorizer(
+	struct ceph_auth_client *ac, int peer_type,
+	struct ceph_authorizer **a,
+	void **buf, size_t *len,
+	void **reply_buf, size_t *reply_len)
+{
+	struct ceph_x_authorizer *au;
+	struct ceph_x_ticket_handler *th;
+	int ret;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	au = kzalloc(sizeof(*au), GFP_NOFS);
+	if (!au)
+		return -ENOMEM;
+
+	ret = ceph_x_build_authorizer(ac, th, au);
+	if (ret) {
+		kfree(au);
+		return ret;
+	}
+
+	*a = (struct ceph_authorizer *)au;
+	*buf = au->buf->vec.iov_base;
+	*len = au->buf->vec.iov_len;
+	*reply_buf = au->reply_buf;
+	*reply_len = sizeof(au->reply_buf);
+	return 0;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+					  struct ceph_authorizer *a, size_t len)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+	struct ceph_x_ticket_handler *th;
+	int ret = 0;
+	struct ceph_x_authorize_reply reply;
+	void *p = au->reply_buf;
+	void *end = p + sizeof(au->reply_buf);
+
+	th = get_ticket_handler(ac, au->service);
+	if (!th)
+		return -EIO;  /* hrm! */
+	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+	if (ret < 0)
+		return ret;
+	if (ret != sizeof(reply))
+		return -EPERM;
+
+	if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+		ret = -EPERM;
+	else
+		ret = 0;
+	dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+	     au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+	return ret;
+}
+
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+
+	ceph_buffer_put(au->buf);
+	kfree(au);
+}
+
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+
+	dout("reset\n");
+	xi->starting = true;
+	xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct rb_node *p;
+
+	dout("ceph_x_destroy %p\n", ac);
+	ceph_crypto_key_destroy(&xi->secret);
+
+	while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+		struct ceph_x_ticket_handler *th =
+			rb_entry(p, struct ceph_x_ticket_handler, node);
+		remove_ticket_handler(ac, th);
+	}
+
+	kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+
+	kfree(ac->private);
+	ac->private = NULL;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+				   int peer_type)
+{
+	struct ceph_x_ticket_handler *th;
+
+	th = get_ticket_handler(ac, peer_type);
+	if (th && !IS_ERR(th))
+		remove_ticket_handler(ac, th);
+}
+
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+	.is_authenticated = ceph_x_is_authenticated,
+	.build_request = ceph_x_build_request,
+	.handle_reply = ceph_x_handle_reply,
+	.create_authorizer = ceph_x_create_authorizer,
+	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+	.destroy_authorizer = ceph_x_destroy_authorizer,
+	.invalidate_authorizer = ceph_x_invalidate_authorizer,
+	.reset =  ceph_x_reset,
+	.destroy = ceph_x_destroy,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+	struct ceph_x_info *xi;
+	int ret;
+
+	dout("ceph_x_init %p\n", ac);
+	xi = kzalloc(sizeof(*xi), GFP_NOFS);
+	if (!xi)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
+				      TEMP_TICKET_BUF_LEN, 8,
+				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+				      NULL);
+	if (!ceph_x_ticketbuf_cachep)
+		goto done_nomem;
+	ret = -EINVAL;
+	if (!ac->secret) {
+		pr_err("no secret set (for auth_x protocol)\n");
+		goto done_nomem;
+	}
+
+	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+	if (ret)
+		goto done_nomem;
+
+	xi->starting = true;
+	xi->ticket_handlers = RB_ROOT;
+
+	ac->protocol = CEPH_AUTH_CEPHX;
+	ac->private = xi;
+	ac->ops = &ceph_x_ops;
+	return 0;
+
+done_nomem:
+	kfree(xi);
+	if (ceph_x_ticketbuf_cachep)
+		kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+	return ret;
+}
+
+
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include "crypto.h"
+#include "auth.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+	struct rb_node node;
+	unsigned service;
+
+	struct ceph_crypto_key session_key;
+	struct ceph_timespec validity;
+
+	u64 secret_id;
+	struct ceph_buffer *ticket_blob;
+
+	unsigned long renew_after, expires;
+};
+
+
+struct ceph_x_authorizer {
+	struct ceph_buffer *buf;
+	unsigned service;
+	u64 nonce;
+	char reply_buf[128];  /* big enough for encrypted blob */
+};
+
+struct ceph_x_info {
+	struct ceph_crypto_key secret;
+
+	bool starting;
+	u64 server_challenge;
+
+	unsigned have_keys;
+	struct rb_root ticket_handlers;
+
+	struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY      0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY          0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+	__u8 struct_v;
+	__le64 secret_id;
+	__le32 blob_len;
+	char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+	__le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+	__le16 op;
+	__le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+	__u8 struct_v;
+	__le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+	__u8 struct_v;
+	__le64 client_challenge;
+	__le64 key;
+	/* ticket blob */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+	__u8 struct_v;
+	__le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+	__le64 server_challenge;
+	__le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ *  a - service id, ticket blob
+ *  b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+	__u8 struct_v;
+	__le64 global_id;
+	__le32 service_id;
+	struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+	__u8 struct_v;
+	__le64 nonce;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+	__u8 struct_v;
+	__le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encyption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+	__u8 struct_v;
+	__le64 magic;
+} __attribute__ ((packed));
+
+#endif
-- 
cgit v1.2.2


From b056c8769d1da6a6a80ce780a4b8957b70434a41 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 3 Feb 2010 10:47:48 -0800
Subject: ceph: remove unused variable

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8065dc92c611..92f482150742 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -505,7 +505,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct ceph_osd_op *op;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned wrote;
-	loff_t offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
 	struct page *page;
 	int i;
 	struct ceph_snap_context *snapc = req->r_snapc;
-- 
cgit v1.2.2


From f5a2041bd96c9f05ff10172b9c814c14f247084e Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 3 Feb 2010 11:00:26 -0800
Subject: ceph: put unused osd connections on lru

Instead of removing osd connection immediately when the
requests list is empty, put the osd connection on an lru.
Only if that osd has not been used for more than a specified
time, will it be removed.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 76 +++++++++++++++++++++++++++++++++++++++++++++-------
 fs/ceph/osd_client.h |  4 +++
 fs/ceph/super.c      |  3 +++
 fs/ceph/super.h      |  2 ++
 4 files changed, 76 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 35c8afea13ec..7f8a26fdcc2c 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -389,6 +389,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
 	atomic_set(&osd->o_ref, 1);
 	osd->o_osdc = osdc;
 	INIT_LIST_HEAD(&osd->o_requests);
+	INIT_LIST_HEAD(&osd->o_osd_lru);
 	osd->o_incarnation = 1;
 
 	ceph_con_init(osdc->client->msgr, &osd->o_con);
@@ -422,25 +423,56 @@ static void put_osd(struct ceph_osd *osd)
 /*
  * remove an osd from our map
  */
-static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
-	dout("remove_osd %p\n", osd);
+	dout("__remove_osd %p\n", osd);
 	BUG_ON(!list_empty(&osd->o_requests));
 	rb_erase(&osd->o_node, &osdc->osds);
+	list_del_init(&osd->o_osd_lru);
 	ceph_con_close(&osd->o_con);
 	put_osd(osd);
 }
 
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+			      struct ceph_osd *osd)
+{
+	dout("__move_osd_to_lru %p\n", osd);
+	BUG_ON(!list_empty(&osd->o_osd_lru));
+	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+	osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+	dout("__remove_osd_from_lru %p\n", osd);
+	if (!list_empty(&osd->o_osd_lru))
+		list_del_init(&osd->o_osd_lru);
+}
+
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+	struct ceph_osd *osd, *nosd;
+
+	dout("__remove_old_osds %p\n", osdc);
+	mutex_lock(&osdc->request_mutex);
+	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+		if (!remove_all && time_before(jiffies, osd->lru_ttl))
+			break;
+		__remove_osd(osdc, osd);
+	}
+	mutex_unlock(&osdc->request_mutex);
+}
+
 /*
  * reset osd connect
  */
-static int reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
 	int ret = 0;
 
-	dout("reset_osd %p osd%d\n", osd, osd->o_osd);
+	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
 	if (list_empty(&osd->o_requests)) {
-		remove_osd(osdc, osd);
+		__remove_osd(osdc, osd);
 	} else {
 		ceph_con_close(&osd->o_con);
 		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
@@ -533,7 +565,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 
 		list_del_init(&req->r_osd_item);
 		if (list_empty(&req->r_osd->o_requests))
-			remove_osd(osdc, req->r_osd);
+			__move_osd_to_lru(osdc, req->r_osd);
 		req->r_osd = NULL;
 	}
 
@@ -611,7 +643,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
 		if (list_empty(&req->r_osd->o_requests)) {
 			/* try to re-use r_osd if possible */
 			newosd = get_osd(req->r_osd);
-			remove_osd(osdc, newosd);
+			__remove_osd(osdc, newosd);
 		}
 		req->r_osd = NULL;
 	}
@@ -636,8 +668,10 @@ static int __map_osds(struct ceph_osd_client *osdc,
 		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
 	}
 
-	if (req->r_osd)
+	if (req->r_osd) {
+		__remove_osd_from_lru(req->r_osd);
 		list_add(&req->r_osd_item, &req->r_osd->o_requests);
+	}
 	err = 1;   /* osd changed */
 
 out:
@@ -744,6 +778,23 @@ static void handle_timeout(struct work_struct *work)
 	up_read(&osdc->map_sem);
 }
 
+static void handle_osds_timeout(struct work_struct *work)
+{
+	struct ceph_osd_client *osdc =
+		container_of(work, struct ceph_osd_client,
+			     osds_timeout_work.work);
+	unsigned long delay =
+		osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
+
+	dout("osds timeout\n");
+	down_read(&osdc->map_sem);
+	remove_old_osds(osdc, 0);
+	up_read(&osdc->map_sem);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+			      round_jiffies_relative(delay));
+}
+
 /*
  * handle osd op reply.  either call the callback if it is specified,
  * or do the completion to wake up the waiting thread.
@@ -881,7 +932,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
 				   ceph_osd_addr(osdc->osdmap,
 						 osd->o_osd),
 				   sizeof(struct ceph_entity_addr)) != 0)
-				reset_osd(osdc, osd);
+				__reset_osd(osdc, osd);
 		}
 	}
 
@@ -1195,9 +1246,14 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->timeout_tid = 0;
 	osdc->last_tid = 0;
 	osdc->osds = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->osd_lru);
 	osdc->requests = RB_ROOT;
 	osdc->num_requests = 0;
 	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+	schedule_delayed_work(&osdc->osds_timeout_work,
+	   round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
 
 	err = -ENOMEM;
 	osdc->req_mempool = mempool_create_kmalloc_pool(10,
@@ -1219,10 +1275,12 @@ out:
 void ceph_osdc_stop(struct ceph_osd_client *osdc)
 {
 	cancel_delayed_work_sync(&osdc->timeout_work);
+	cancel_delayed_work_sync(&osdc->osds_timeout_work);
 	if (osdc->osdmap) {
 		ceph_osdmap_destroy(osdc->osdmap);
 		osdc->osdmap = NULL;
 	}
+	remove_old_osds(osdc, 1);
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
 }
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 8d533d9406ff..70f31b61f02c 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -31,9 +31,11 @@ struct ceph_osd {
 	struct rb_node o_node;
 	struct ceph_connection o_con;
 	struct list_head o_requests;
+	struct list_head o_osd_lru;
 	struct ceph_authorizer *o_authorizer;
 	void *o_authorizer_buf, *o_authorizer_reply_buf;
 	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
+	unsigned long lru_ttl;
 };
 
 /* an in-flight request */
@@ -90,11 +92,13 @@ struct ceph_osd_client {
 
 	struct mutex           request_mutex;
 	struct rb_root         osds;          /* osds */
+	struct list_head       osd_lru;       /* idle osds */
 	u64                    timeout_tid;   /* tid of timeout triggering rq */
 	u64                    last_tid;      /* tid of last request */
 	struct rb_root         requests;      /* pending requests */
 	int                    num_requests;
 	struct delayed_work    timeout_work;
+	struct delayed_work    osds_timeout_work;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry 	       *debugfs_file;
 #endif
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 3a2548951fe6..39aaf29a04a0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -293,6 +293,7 @@ enum {
 	Opt_rsize,
 	Opt_osdtimeout,
 	Opt_mount_timeout,
+	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
 	Opt_caps_wanted_delay_max,
 	Opt_readdir_max_entries,
@@ -322,6 +323,7 @@ static match_table_t arg_tokens = {
 	{Opt_rsize, "rsize=%d"},
 	{Opt_osdtimeout, "osdtimeout=%d"},
 	{Opt_mount_timeout, "mount_timeout=%d"},
+	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
 	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
 	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
@@ -367,6 +369,7 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 	args->flags = CEPH_OPT_DEFAULT;
 	args->osd_timeout = 5;    /* seconds */
 	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+	args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
 	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 	args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
 	args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 770f7b507fce..3930fb685f0b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -53,6 +53,7 @@ struct ceph_mount_args {
 	struct ceph_entity_addr *mon_addr;
 	int flags;
 	int mount_timeout;
+	int osd_idle_ttl;
 	int caps_wanted_delay_min, caps_wanted_delay_max;
 	struct ceph_fsid fsid;
 	struct ceph_entity_addr my_addr;
@@ -71,6 +72,7 @@ struct ceph_mount_args {
  * defaults
  */
 #define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+#define CEPH_OSD_IDLE_TTL_DEFAULT    60
 #define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
-- 
cgit v1.2.2


From 02f90c61096ec3ad691e808a4aa7ca5a06e550ec Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 4 Feb 2010 16:18:10 -0800
Subject: ceph: add uid field to ceph_pg_pool

Also verify encoding version as we go.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 20 ++++++++++++++++++++
 fs/ceph/rados.h  |  8 ++++++++
 2 files changed, 28 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index a143c51c2cfb..a6afe3836f7e 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -426,6 +426,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	map->pg_temp = RB_ROOT;
 
 	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_VERSION) {
+		pr_warning("got unknown v %d > %d of osdmap\n", version,
+			   CEPH_OSDMAP_VERSION);
+		goto bad;
+	}
 
 	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
 	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
@@ -447,6 +452,11 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		if (i >= map->num_pools)
 			goto bad;
 		ev = ceph_decode_8(p); /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			goto bad;
+		}
 		ceph_decode_copy(p, &map->pg_pool[i].v,
 				 sizeof(map->pg_pool->v));
 		calc_pg_masks(&map->pg_pool[i]);
@@ -552,6 +562,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	struct rb_node *rbp;
 
 	ceph_decode_16_safe(p, end, version, bad);
+	if (version > CEPH_OSDMAP_INC_VERSION) {
+		pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+			   CEPH_OSDMAP_INC_VERSION);
+		goto bad;
+	}
 
 	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
 			 bad);
@@ -624,6 +639,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		}
 		ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad);
 		ev = ceph_decode_8(p);  /* encoding version */
+		if (ev > CEPH_PG_POOL_VERSION) {
+			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+				   ev, CEPH_PG_POOL_VERSION);
+			goto bad;
+		}
 		ceph_decode_copy(p, &map->pg_pool[pool].v,
 				 sizeof(map->pg_pool->v));
 		calc_pg_masks(&map->pg_pool[pool]);
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 123fd845459e..1f4c78640541 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -8,6 +8,12 @@
 
 #include "msgr.h"
 
+/*
+ * osdmap encoding versions
+ */
+#define CEPH_OSDMAP_INC_VERSION 3
+#define CEPH_OSDMAP_VERSION     3
+
 /*
  * fs id
  */
@@ -80,6 +86,7 @@ struct ceph_pg {
  */
 #define CEPH_PG_TYPE_REP     1
 #define CEPH_PG_TYPE_RAID4   2
+#define CEPH_PG_POOL_VERSION 2
 struct ceph_pg_pool {
 	__u8 type;                /* CEPH_PG_TYPE_* */
 	__u8 size;                /* number of osds in each pg */
@@ -92,6 +99,7 @@ struct ceph_pg_pool {
 	__le32 snap_epoch;        /* epoch of last snap */
 	__le32 num_snaps;
 	__le32 num_removed_snap_intervals;
+	__le64 uid;
 } __attribute__ ((packed));
 
 /*
-- 
cgit v1.2.2


From 972f0d3ab1a15cb5d790dd8c53903066084b28f2 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Thu, 4 Feb 2010 13:41:41 -0800
Subject: ceph: fix short synchronous reads

Zeroing of holes was not done correctly: page_off was miscalculated and
zeroing the tail didn't not adjust the 'read' value to include the zeroed
portion.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2d88c805a56c..43bd2f2e51a5 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -395,23 +395,22 @@ static void zero_page_vector_range(int off, int len, struct page **pages)
 {
 	int i = off >> PAGE_CACHE_SHIFT;
 
+	off &= ~PAGE_CACHE_MASK;
+
 	dout("zero_page_vector_page %u~%u\n", off, len);
-	BUG_ON(len < PAGE_CACHE_SIZE);
 
 	/* leading partial page? */
-	if (off & ~PAGE_CACHE_MASK) {
+	if (off) {
+		int end = min((int)PAGE_CACHE_SIZE, off + len);
 		dout("zeroing %d %p head from %d\n", i, pages[i],
-		     (int)(off & ~PAGE_CACHE_MASK));
-		zero_user_segment(pages[i], off & ~PAGE_CACHE_MASK,
-				  PAGE_CACHE_SIZE);
-		off += PAGE_CACHE_SIZE;
-		off &= PAGE_CACHE_MASK;
+		     (int)off);
+		zero_user_segment(pages[i], off, end);
+		len -= (end - off);
 		i++;
 	}
 	while (len >= PAGE_CACHE_SIZE) {
 		dout("zeroing %d %p\n", i, pages[i]);
 		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-		off += PAGE_CACHE_SIZE;
 		len -= PAGE_CACHE_SIZE;
 		i++;
 	}
@@ -437,7 +436,7 @@ static int striped_read(struct inode *inode,
 	struct ceph_client *client = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 pos, this_len;
-	int page_off = off & ~PAGE_CACHE_SIZE; /* first byte's offset in page */
+	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
 	int left, pages_left;
 	int read;
 	struct page **page_pos;
@@ -493,6 +492,7 @@ more:
 			dout("zero tail\n");
 			zero_page_vector_range(page_off + read, len - read,
 					       pages);
+			read = len;
 			goto out;
 		}
 
-- 
cgit v1.2.2


From 4af6b2257ee0eb8f4bf3b1dc8acb643c0e8a887f Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 9 Feb 2010 11:02:51 -0800
Subject: ceph: refactor ceph_write_begin, fix ceph_page_mkwrite

Originally ceph_page_mkwrite called ceph_write_begin, hoping that
the returned locked page would be the page that it was requested
to mkwrite. Factored out relevant part of ceph_page_mkwrite and
we lock the right page anyway.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 80 +++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 92f482150742..89c5ff3b59d5 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -907,15 +907,13 @@ static int context_is_writeable_or_written(struct inode *inode,
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
  */
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
-			    loff_t pos, unsigned len, unsigned flags,
-			    struct page **pagep, void **fsdata)
+static int ceph_update_writeable_page(struct file *file,
+			    loff_t pos, unsigned len,
+			    struct page *page)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
-	struct page *page;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	loff_t page_off = pos & PAGE_CACHE_MASK;
 	int pos_in_page = pos & ~PAGE_CACHE_MASK;
 	int end_in_page = pos_in_page + len;
@@ -923,16 +921,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 	struct ceph_snap_context *snapc;
 	int r;
 
-	/* get a page*/
-retry:
-	page = grab_cache_page_write_begin(mapping, index, 0);
-	if (!page)
-		return -ENOMEM;
-	*pagep = page;
-
-	dout("write_begin file %p inode %p page %p %d~%d\n", file,
-	     inode, page, (int)pos, (int)len);
-
 retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
 	wait_on_page_writeback(page);
@@ -964,7 +952,7 @@ retry_locked:
 			wait_event_interruptible(ci->i_cap_wq,
 			       context_is_writeable_or_written(inode, snapc));
 			ceph_put_snap_context(snapc);
-			goto retry;
+			return -EAGAIN;
 		}
 
 		/* yay, writeable, do it now (without dropping page lock) */
@@ -1021,6 +1009,35 @@ fail_nosnap:
 	return r;
 }
 
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	int r;
+
+	do {
+		/* get a page*/
+		page = grab_cache_page_write_begin(mapping, index, 0);
+		if (!page)
+			return -ENOMEM;
+		*pagep = page;
+
+		dout("write_begin file %p inode %p page %p %d~%d\n", file,
+	     	inode, page, (int)pos, (int)len);
+
+		r = ceph_update_writeable_page(file, pos, len, page);
+	} while (r == -EAGAIN);
+
+	return r;
+}
+
 /*
  * we don't do anything in here that simple_write_end doesn't do
  * except adjust dirty page accounting and drop read lock on
@@ -1104,8 +1121,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
 	loff_t off = page->index << PAGE_CACHE_SHIFT;
 	loff_t size, len;
-	struct page *locked_page = NULL;
-	void *fsdata = NULL;
 	int ret;
 
 	size = i_size_read(inode);
@@ -1116,23 +1131,30 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
 	     off, len, page, page->index);
-	ret = ceph_write_begin(vma->vm_file, inode->i_mapping, off, len, 0,
-			       &locked_page, &fsdata);
-	WARN_ON(page != locked_page);
-	if (!ret) {
-		/*
-		 * doing the following, instead of calling
-		 * ceph_write_end. Note that we keep the
-		 * page locked
-		 */
+
+	lock_page(page);
+
+	ret = VM_FAULT_NOPAGE;
+	if ((off > size) ||
+	    (page->mapping != inode->i_mapping))
+		goto out;
+
+	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+	if (ret == 0) {
+		/* success.  we'll keep the page locked. */
 		set_page_dirty(page);
 		up_read(&mdsc->snap_rwsem);
-		page_cache_release(page);
 		ret = VM_FAULT_LOCKED;
 	} else {
-		ret = VM_FAULT_SIGBUS;
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else
+			ret = VM_FAULT_SIGBUS;
 	}
+out:
 	dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
+	if (ret != VM_FAULT_LOCKED)
+		unlock_page(page);
 	return ret;
 }
 
-- 
cgit v1.2.2


From 3d497d858ae6e5f23a28783030aecc69074e102d Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 9 Feb 2010 11:08:40 -0800
Subject: ceph: fix truncation when not holding caps

A truncation should occur when either we have the
specified caps for the file, or (in cases where we are
not the only ones referencing the file) when it is mapped
or when it is opened. The latter two cases were not
handled.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index a4f573ab232e..af85f2de2f7c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -416,9 +416,17 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 			dout("truncate_seq %u -> %u\n",
 			     ci->i_truncate_seq, truncate_seq);
 			ci->i_truncate_seq = truncate_seq;
-			if (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
+			/*
+			 * If we hold relevant caps, or in the case where we're
+			 * not the only client referencing this file and we
+			 * don't hold those caps, then we need to check whether
+			 * the file is either opened or mmaped
+			 */
+			if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
 				      CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
-				      CEPH_CAP_FILE_EXCL)) {
+				      CEPH_CAP_FILE_EXCL)) ||
+			    mapping_mapped(inode->i_mapping) ||
+			    __ceph_caps_file_wanted(ci)) {
 				ci->i_truncate_pending++;
 				queue_trunc = 1;
 			}
-- 
cgit v1.2.2


From 29065a513aa4c7e4b46b77cbcd25f814a4ca0bfe Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 9 Feb 2010 11:14:41 -0800
Subject: ceph: sync read/write considers page cache

In the cases where we either do a sync read or a write, we
need to make sure that everything in the page cache is flushed.
In the case of a sync write we invalidate the relevant pages,
so that subsequent read/write reflects the new data written.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 43bd2f2e51a5..bbf1ccf2d56e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -409,7 +409,7 @@ static void zero_page_vector_range(int off, int len, struct page **pages)
 		i++;
 	}
 	while (len >= PAGE_CACHE_SIZE) {
-		dout("zeroing %d %p\n", i, pages[i]);
+		dout("zeroing %d %p len=%d\n", i, pages[i], len);
 		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
 		len -= PAGE_CACHE_SIZE;
 		i++;
@@ -542,13 +542,16 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 		 * but it will at least behave sensibly when they are
 		 * in sequence.
 		 */
-		filemap_write_and_wait(inode->i_mapping);
 	} else {
 		pages = alloc_page_vector(num_pages);
 	}
 	if (IS_ERR(pages))
 		return PTR_ERR(pages);
 
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret < 0)
+		goto done;
+
 	ret = striped_read(inode, off, len, pages, num_pages);
 
 	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
@@ -556,6 +559,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	if (ret >= 0)
 		*poff = off + ret;
 
+done:
 	if (file->f_flags & O_DIRECT)
 		put_page_vector(pages, num_pages);
 	else
@@ -617,6 +621,16 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
 	else
 		pos = *offset;
 
+	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
+	if (ret < 0)
+		return ret;
+
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    pos >> PAGE_CACHE_SHIFT,
+					    (pos + left) >> PAGE_CACHE_SHIFT);
+	if (ret < 0)
+		dout("invalidate_inode_pages2_range returned %d\n", ret);
+
 	flags = CEPH_OSD_FLAG_ORDERSNAP |
 		CEPH_OSD_FLAG_ONDISK |
 		CEPH_OSD_FLAG_WRITE;
-- 
cgit v1.2.2


From cbd03635913a86afb7c2cfc0058932956b05b51e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Feb 2010 13:41:18 -0800
Subject: ceph: cap revocation fixes

Try to invalidate pages in ceph_check_caps() if FILE_CACHE is being
revoked.  If we fail, queue an immediate async invalidate if FILE_CACHE
is being revoked.  (If it's not being revoked, we just queue the caps
for later evaluation later, as per the old behavior.)

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 847ae64346fe..822f7d3632fe 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1374,12 +1374,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	int file_wanted, used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
 	int drop_session_lock = session ? 0 : 1;
-	int want, retain, revoking, flushing = 0;
+	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
 			   to avoid an infinite loop on retry */
 	struct rb_node *p;
 	int tried_invalidate = 0;
 	int delayed = 0, sent = 0, force_requeue = 0, num;
+	int queue_invalidate = 0;
 	int is_delayed = flags & CHECK_CAPS_NODELAY;
 
 	/* if we are unmounting, flush any unused caps immediately. */
@@ -1401,6 +1402,8 @@ retry_locked:
 	file_wanted = __ceph_caps_file_wanted(ci);
 	used = __ceph_caps_used(ci);
 	want = file_wanted | used;
+	issued = __ceph_caps_issued(ci, &implemented);
+	revoking = implemented & ~issued;
 
 	retain = want | CEPH_CAP_PIN;
 	if (!mdsc->stopping && inode->i_nlink > 0) {
@@ -1419,11 +1422,11 @@ retry_locked:
 	}
 
 	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
-	     " issued %s retain %s %s%s%s\n", inode,
+	     " issued %s revoking %s retain %s %s%s%s\n", inode,
 	     ceph_cap_string(file_wanted),
 	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
 	     ceph_cap_string(ci->i_flushing_caps),
-	     ceph_cap_string(__ceph_caps_issued(ci, NULL)),
+	     ceph_cap_string(issued), ceph_cap_string(revoking),
 	     ceph_cap_string(retain),
 	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
 	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
@@ -1437,7 +1440,8 @@ retry_locked:
 	if ((!is_delayed || mdsc->stopping) &&
 	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
 	    ci->i_rdcache_gen &&                     /* may have cached pages */
-	    file_wanted == 0 &&                      /* no open files */
+	    (file_wanted == 0 ||                     /* no open files */
+	     (revoking & CEPH_CAP_FILE_CACHE)) &&     /*  or revoking cache */
 	    !ci->i_truncate_pending &&
 	    !tried_invalidate) {
 		u32 invalidating_gen = ci->i_rdcache_gen;
@@ -1451,6 +1455,10 @@ retry_locked:
 			/* success. */
 			ci->i_rdcache_gen = 0;
 			ci->i_rdcache_revoking = 0;
+		} else if (revoking & CEPH_CAP_FILE_CACHE) {
+			dout("check_caps queuing invalidate\n");
+			queue_invalidate = 1;
+			ci->i_rdcache_revoking = ci->i_rdcache_gen;
 		} else {
 			dout("check_caps failed to invalidate pages\n");
 			/* we failed to invalidate pages.  check these
@@ -1476,7 +1484,7 @@ retry_locked:
 
 		revoking = cap->implemented & ~cap->issued;
 		if (revoking)
-			dout("mds%d revoking %s\n", cap->mds,
+			dout(" mds%d revoking %s\n", cap->mds,
 			     ceph_cap_string(revoking));
 
 		if (cap == ci->i_auth_cap &&
@@ -1591,6 +1599,10 @@ ack:
 
 	spin_unlock(&inode->i_lock);
 
+	if (queue_invalidate)
+		if (ceph_queue_page_invalidation(inode))
+			igrab(inode);
+
 	if (session && drop_session_lock)
 		mutex_unlock(&session->s_mutex);
 	if (took_snap_rwsem)
-- 
cgit v1.2.2


From 68c283236a1e0772e1a469dd2ffc17afc300b07b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Feb 2010 13:41:47 -0800
Subject: ceph: do not retain caps that are being revoked

Never retain caps in __send_cap() that are being revoked.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 822f7d3632fe..7f4841cd3a2b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1042,10 +1042,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	struct ceph_inode_info *ci = cap->ci;
 	struct inode *inode = &ci->vfs_inode;
 	u64 cap_id = cap->cap_id;
-	int held = cap->issued | cap->implemented;
-	int revoking = cap->implemented & ~cap->issued;
-	int dropping = cap->issued & ~retain;
-	int keep;
+	int held, revoking, dropping, keep;
 	u64 seq, issue_seq, mseq, time_warp_seq, follows;
 	u64 size, max_size;
 	struct timespec mtime, atime;
@@ -1060,6 +1057,11 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	int i;
 	int ret;
 
+	held = cap->issued | cap->implemented;
+	revoking = cap->implemented & ~cap->issued;
+	retain &= ~revoking;
+	dropping = cap->issued & ~retain;
+
 	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
 	     inode, cap, cap->session,
 	     ceph_cap_string(held), ceph_cap_string(held & retain),
-- 
cgit v1.2.2


From 6a026589ba333185c466c906376fe022a27a53f9 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Feb 2010 14:04:02 -0800
Subject: ceph: fix sync read eof check deadlock

If a sync read gets a short result from the OSD, it may need to do a
getattr to see if it is short due to reaching end-of-file.  The getattr
was being done while holding a reference to FILE_RD, which can lead to
a deadlock if the MDS is revoking that capability bit and can't process
the getattr until it does.

We fix this by setting a flag if EOF size validation is needed, and doing
the getattr in ceph_aio_read, after the RD cap ref is dropped.  If the
read needs to be continued, we loop and continue traversing the file.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index bbf1ccf2d56e..2c4ae4441cab 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -431,7 +431,8 @@ static void zero_page_vector_range(int off, int len, struct page **pages)
  */
 static int striped_read(struct inode *inode,
 			u64 off, u64 len,
-			struct page **pages, int num_pages)
+			struct page **pages, int num_pages,
+			int *checkeof)
 {
 	struct ceph_client *client = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -497,15 +498,7 @@ more:
 		}
 
 		/* check i_size */
-		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
-		if (ret < 0)
-			goto out;
-
-		/* hit EOF? */
-		if (pos >= inode->i_size)
-			goto out;
-
-		goto more;
+		*checkeof = 1;
 	}
 
 out:
@@ -522,7 +515,7 @@ out:
  * If the read spans object boundary, just do multiple reads.
  */
 static ssize_t ceph_sync_read(struct file *file, char __user *data,
-			      unsigned len, loff_t *poff)
+			      unsigned len, loff_t *poff, int *checkeof)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct page **pages;
@@ -552,7 +545,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
 	if (ret < 0)
 		goto done;
 
-	ret = striped_read(inode, off, len, pages, num_pages);
+	ret = striped_read(inode, off, len, pages, num_pages, checkeof);
 
 	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
 		ret = copy_page_vector_to_user(pages, data, off, ret);
@@ -746,11 +739,14 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	size_t len = iov->iov_len;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	void *base = iov->iov_base;
 	ssize_t ret;
 	int got = 0;
+	int checkeof = 0, read = 0;
 
 	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
 	     inode, ceph_vinop(inode), pos, (unsigned)len, inode);
+again:
 	__ceph_do_pending_vmtruncate(inode);
 	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
 			    &got, -1);
@@ -764,7 +760,7 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	    (iocb->ki_filp->f_flags & O_DIRECT) ||
 	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
 		/* hmm, this isn't really async... */
-		ret = ceph_sync_read(filp, iov->iov_base, len, ppos);
+		ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
 	else
 		ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
 
@@ -772,6 +768,23 @@ out:
 	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
 	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
 	ceph_put_cap_refs(ci, got);
+
+	if (checkeof && ret >= 0) {
+		int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+
+		/* hit EOF or hole? */
+		if (statret == 0 && *ppos < inode->i_size) {
+			dout("aio_read sync_read hit hole, reading more\n");
+			read += ret;
+			base += ret;
+			len -= ret;
+			checkeof = 0;
+			goto again;
+		}
+	}
+	if (ret >= 0)
+		ret += read;
+
 	return ret;
 }
 
-- 
cgit v1.2.2


From 3c6f6b79a64db7f1c7abf09d693db3b0066784fb Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Feb 2010 15:24:44 -0800
Subject: ceph: cleanup async writeback, truncation, invalidate helpers

Grab inode ref in helper.  Make work functions static, with consistent
naming.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c  |  3 +--
 fs/ceph/caps.c  | 25 ++++++++---------------
 fs/ceph/inode.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 fs/ceph/super.h | 19 ++++--------------
 4 files changed, 65 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 89c5ff3b59d5..71f5ad1c1e26 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -947,8 +947,7 @@ retry_locked:
 			 */
 			snapc = ceph_get_snap_context((void *)page->private);
 			unlock_page(page);
-			if (ceph_queue_writeback(inode))
-				igrab(inode);
+			ceph_queue_writeback(inode);
 			wait_event_interruptible(ci->i_cap_wq,
 			       context_is_writeable_or_written(inode, snapc));
 			ceph_put_snap_context(snapc);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7f4841cd3a2b..68ee78109224 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1602,8 +1602,7 @@ ack:
 	spin_unlock(&inode->i_lock);
 
 	if (queue_invalidate)
-		if (ceph_queue_page_invalidation(inode))
-			igrab(inode);
+		ceph_queue_invalidate(inode);
 
 	if (session && drop_session_lock)
 		mutex_unlock(&session->s_mutex);
@@ -2178,7 +2177,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	int wake = 0;
 	int writeback = 0;
 	int revoked_rdcache = 0;
-	int invalidate_async = 0;
+	int queue_invalidate = 0;
 	int tried_invalidate = 0;
 	int ret;
 
@@ -2205,7 +2204,7 @@ restart:
 			/* there were locked pages.. invalidate later
 			   in a separate thread. */
 			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
-				invalidate_async = 1;
+				queue_invalidate = 1;
 				ci->i_rdcache_revoking = ci->i_rdcache_gen;
 			}
 		} else {
@@ -2319,21 +2318,15 @@ restart:
 	}
 
 	spin_unlock(&inode->i_lock);
-	if (writeback) {
+	if (writeback)
 		/*
 		 * queue inode for writeback: we can't actually call
 		 * filemap_write_and_wait, etc. from message handler
 		 * context.
 		 */
-		dout("queueing %p for writeback\n", inode);
-		if (ceph_queue_writeback(inode))
-			igrab(inode);
-	}
-	if (invalidate_async) {
-		dout("queueing %p for page invalidation\n", inode);
-		if (ceph_queue_page_invalidation(inode))
-			igrab(inode);
-	}
+		ceph_queue_writeback(inode);
+	if (queue_invalidate)
+		ceph_queue_invalidate(inode);
 	if (wake)
 		wake_up(&ci->i_cap_wq);
 	return reply;
@@ -2479,9 +2472,7 @@ static void handle_cap_trunc(struct inode *inode,
 	spin_unlock(&inode->i_lock);
 
 	if (queue_trunc)
-		if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
-			       &ci->i_vmtruncate_work))
-			igrab(inode);
+		ceph_queue_vmtruncate(inode);
 }
 
 /*
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index af85f2de2f7c..58bdff09c2c1 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -28,7 +28,9 @@
 
 static const struct inode_operations ceph_symlink_iops;
 
-static void ceph_inode_invalidate_pages(struct work_struct *work);
+static void ceph_invalidate_work(struct work_struct *work);
+static void ceph_writeback_work(struct work_struct *work);
+static void ceph_vmtruncate_work(struct work_struct *work);
 
 /*
  * find or create an inode, given the ceph ino number
@@ -357,8 +359,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	INIT_LIST_HEAD(&ci->i_snap_realm_item);
 	INIT_LIST_HEAD(&ci->i_snap_flush_item);
 
-	INIT_WORK(&ci->i_wb_work, ceph_inode_writeback);
-	INIT_WORK(&ci->i_pg_inv_work, ceph_inode_invalidate_pages);
+	INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
+	INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
 
 	INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
 
@@ -675,9 +677,7 @@ no_change:
 
 	/* queue truncate if we saw i_size decrease */
 	if (queue_trunc)
-		if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
-			       &ci->i_vmtruncate_work))
-			igrab(inode);
+		ceph_queue_vmtruncate(inode);
 
 	/* populate frag tree */
 	/* FIXME: move me up, if/when version reflects fragtree changes */
@@ -1243,7 +1243,18 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
  * Write back inode data in a worker thread.  (This can't be done
  * in the message handler context.)
  */
-void ceph_inode_writeback(struct work_struct *work)
+void ceph_queue_writeback(struct inode *inode)
+{
+	if (queue_work(ceph_inode_to_client(inode)->wb_wq,
+		       &ceph_inode(inode)->i_wb_work)) {
+		dout("ceph_queue_invalidate %p\n", inode);
+		igrab(inode);
+	} else {
+		dout("ceph_queue_invalidate %p failed\n", inode);
+	}
+}
+
+static void ceph_writeback_work(struct work_struct *work)
 {
 	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
 						  i_wb_work);
@@ -1254,11 +1265,25 @@ void ceph_inode_writeback(struct work_struct *work)
 	iput(inode);
 }
 
+/*
+ * queue an async invalidation
+ */
+void ceph_queue_invalidate(struct inode *inode)
+{
+	if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
+		       &ceph_inode(inode)->i_pg_inv_work)) {
+		dout("ceph_queue_invalidate %p\n", inode);
+		igrab(inode);
+	} else {
+		dout("ceph_queue_invalidate %p failed\n", inode);
+	}
+}
+
 /*
  * Invalidate inode pages in a worker thread.  (This can't be done
  * in the message handler context.)
  */
-static void ceph_inode_invalidate_pages(struct work_struct *work)
+static void ceph_invalidate_work(struct work_struct *work)
 {
 	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
 						  i_pg_inv_work);
@@ -1307,7 +1332,7 @@ out:
  *
  * We also truncate in a separate thread as well.
  */
-void ceph_vmtruncate_work(struct work_struct *work)
+static void ceph_vmtruncate_work(struct work_struct *work)
 {
 	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
 						  i_vmtruncate_work);
@@ -1320,6 +1345,24 @@ void ceph_vmtruncate_work(struct work_struct *work)
 	iput(inode);
 }
 
+/*
+ * Queue an async vmtruncate.  If we fail to queue work, we will handle
+ * the truncation the next time we call __ceph_do_pending_vmtruncate.
+ */
+void ceph_queue_vmtruncate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+		       &ci->i_vmtruncate_work)) {
+		dout("ceph_queue_vmtruncate %p\n", inode);
+		igrab(inode);
+	} else {
+		dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
+		     inode, ci->i_truncate_pending);
+	}
+}
+
 /*
  * called with i_mutex held.
  *
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3930fb685f0b..b2adfccbab98 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -573,18 +573,6 @@ static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
 	return (struct ceph_client *)sb->s_fs_info;
 }
 
-static inline int ceph_queue_writeback(struct inode *inode)
-{
-	return queue_work(ceph_inode_to_client(inode)->wb_wq,
-		   &ceph_inode(inode)->i_wb_work);
-}
-
-static inline int ceph_queue_page_invalidation(struct inode *inode)
-{
-	return queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
-		   &ceph_inode(inode)->i_pg_inv_work);
-}
-
 
 /*
  * we keep buffered readdir results attached to file->private_data
@@ -772,10 +760,11 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 extern int ceph_inode_holds_cap(struct inode *inode, int mask);
 
 extern int ceph_inode_set_size(struct inode *inode, loff_t size);
-extern void ceph_inode_writeback(struct work_struct *work);
-extern void ceph_vmtruncate_work(struct work_struct *work);
 extern void __ceph_do_pending_vmtruncate(struct inode *inode);
-extern void __ceph_queue_vmtruncate(struct inode *inode);
+extern void ceph_queue_vmtruncate(struct inode *inode);
+
+extern void ceph_queue_invalidate(struct inode *inode);
+extern void ceph_queue_writeback(struct inode *inode);
 
 extern int ceph_do_getattr(struct inode *inode, int mask);
 extern int ceph_permission(struct inode *inode, int mask);
-- 
cgit v1.2.2


From 0840d8af3e6e40bcd5f2366698eb2755f88acfea Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Feb 2010 15:44:16 -0800
Subject: ceph: invalidate pages even if truncate is pending

There is no reason not to invalidate pages when a truncate is pending.
Both throw out page cache pages.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 68ee78109224..20b28dc0c97c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1444,7 +1444,6 @@ retry_locked:
 	    ci->i_rdcache_gen &&                     /* may have cached pages */
 	    (file_wanted == 0 ||                     /* no open files */
 	     (revoking & CEPH_CAP_FILE_CACHE)) &&     /*  or revoking cache */
-	    !ci->i_truncate_pending &&
 	    !tried_invalidate) {
 		u32 invalidating_gen = ci->i_rdcache_gen;
 		int ret;
-- 
cgit v1.2.2


From 8031049147c58d9d8b6226c3ac31a9d72d053e25 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Feb 2010 16:43:11 -0800
Subject: ceph: remove bogus invalidate_mapping_pages

We were invalidating mapping pages when dropping FILE_CACHE in
__send_cap().  But ceph_check_caps attempts to invalidate already, and
also checks for success, so we should never get to this point.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 20b28dc0c97c..ab9b571dda11 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1137,12 +1137,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 
 	spin_unlock(&inode->i_lock);
 
-	if (dropping & CEPH_CAP_FILE_CACHE) {
-		/* invalidate what we can */
-		dout("invalidating pages on %p\n", inode);
-		invalidate_mapping_pages(&inode->i_data, 0, -1);
-	}
-
 	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
 		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
 		size, max_size, &mtime, &atime, time_warp_seq,
-- 
cgit v1.2.2


From 6c5d1a49e5e88ee831117f4b2375829933ad15da Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 13 Feb 2010 20:29:31 -0800
Subject: ceph: fix msgr to keep sent messages until acked

The test was backwards from commit b3d1dbbd: keep the message if the
connection _isn't_ lossy.  This allows the client to continue when the
TCP connection drops for some reason (network glitch) but both ends
survive.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index c4341784ec8f..44bdaf439924 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -463,11 +463,11 @@ static void prepare_write_message(struct ceph_connection *con)
 		       struct ceph_msg, list_head);
 	con->out_msg = m;
 	if (test_bit(LOSSYTX, &con->state)) {
+		list_del_init(&m->list_head);
+	} else {
 		/* put message on sent list */
 		ceph_msg_get(m);
 		list_move_tail(&m->list_head, &con->out_sent);
-	} else {
-		list_del_init(&m->list_head);
 	}
 
 	m->hdr.seq = cpu_to_le64(++con->out_seq);
-- 
cgit v1.2.2


From 153a008bf7915ea9127341409170cb197d111282 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Feb 2010 12:11:51 -0800
Subject: ceph: reset osd connections after fault

A single osd connection fault (e.g. tcp disconnect) wasn't
reopening the connection, which causes all current and future
requests for that osd to hang.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 7f8a26fdcc2c..fa0f73703954 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -369,7 +369,6 @@ static void osd_reset(struct ceph_connection *con)
 		return;
 	dout("osd_reset osd%d\n", osd->o_osd);
 	osdc = osd->o_osdc;
-	osd->o_incarnation++;
 	down_read(&osdc->map_sem);
 	kick_requests(osdc, osd);
 	up_read(&osdc->map_sem);
@@ -921,7 +920,9 @@ static void kick_requests(struct ceph_osd_client *osdc,
 
 	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
 	mutex_lock(&osdc->request_mutex);
-	if (!kickosd) {
+	if (kickosd) {
+		__reset_osd(osdc, kickosd);
+	} else {
 		for (p = rb_first(&osdc->osds); p; p = n) {
 			struct ceph_osd *osd =
 				rb_entry(p, struct ceph_osd, o_node);
-- 
cgit v1.2.2


From e2663ab60de59d20fa33da3528f6d5359f8eb003 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Feb 2010 22:01:03 -0800
Subject: ceph: allow connection to be reopened by fault callback

Fix the messenger to allow a ceph_con_open() during the fault callback.
Previously the work wasn't getting queued on the connection because the
fault path avoids requeued work (normally spurious).  Loop on reopening by
checking for the OPENING state bit.

This fixes OSD reconnects when a TCP connection drops.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 44bdaf439924..acf383f6a9cd 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1808,7 +1808,7 @@ done:
 	clear_bit(BUSY, &con->state);
 	dout("con->state=%lu\n", con->state);
 	if (test_bit(QUEUED, &con->state)) {
-		if (!backoff) {
+		if (!backoff || test_bit(OPENING, &con->state)) {
 			dout("con_work %p QUEUED reset, looping\n", con);
 			goto more;
 		}
-- 
cgit v1.2.2


From 91e45ce38946a8efa21fefbc65d023ca3c0b434f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Feb 2010 12:05:09 -0800
Subject: ceph: cancel delayed work when closing connection

This ensures that if/when we reopen the connection, we can requeue work on
the connection immediately, without waiting for an old timer to expire.
Queue new delayed work inside con->mutex to avoid any race.

This fixes problems with clients failing to reconnect to the MDS due to
the client_reconnect message arriving too late (due to waiting for an old
delayed work timeout to expire).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index acf383f6a9cd..ca2ad0e5bb28 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -344,6 +344,7 @@ void ceph_con_close(struct ceph_connection *con)
 	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
 	mutex_lock(&con->mutex);
 	reset_connection(con);
+	cancel_delayed_work(&con->work);
 	mutex_unlock(&con->mutex);
 	queue_con(con);
 }
@@ -1841,6 +1842,8 @@ static void ceph_fault(struct ceph_connection *con)
 	clear_bit(BUSY, &con->state);  /* to avoid an improbable race */
 
 	mutex_lock(&con->mutex);
+	if (test_bit(CLOSED, &con->state))
+		goto out_unlock;
 
 	con_close_socket(con);
 
@@ -1876,8 +1879,6 @@ static void ceph_fault(struct ceph_connection *con)
 	else if (con->delay < MAX_DELAY_INTERVAL)
 		con->delay *= 2;
 
-	mutex_unlock(&con->mutex);
-
 	/* explicitly schedule work to try to reconnect again later. */
 	dout("fault queueing %p delay %lu\n", con, con->delay);
 	con->ops->get(con);
@@ -1885,6 +1886,8 @@ static void ceph_fault(struct ceph_connection *con)
 			       round_jiffies_relative(con->delay)) == 0)
 		con->ops->put(con);
 
+out_unlock:
+	mutex_unlock(&con->mutex);
 out:
 	if (con->ops->fault)
 		con->ops->fault(con);
-- 
cgit v1.2.2


From 44ca18f2682eb1cfbed153849adedb79e3e19790 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Feb 2010 12:08:46 -0800
Subject: ceph: use rbtree for mds requests

The rbtree is a more appropriate data structure than a radix_tree.  It
avoids extra memory usage and simplifies the code.

It also fixes a bug where the debugfs 'mdsc' file wasn't including the
most recent mds request.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c    |  13 ++---
 fs/ceph/mds_client.c | 149 +++++++++++++++++++++++++++++++--------------------
 fs/ceph/mds_client.h |   4 +-
 3 files changed, 97 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fba44b2a6086..cd5dd805e4be 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -142,21 +142,16 @@ static int monc_show(struct seq_file *s, void *p)
 static int mdsc_show(struct seq_file *s, void *p)
 {
 	struct ceph_client *client = s->private;
-	struct ceph_mds_request *req;
-	u64 nexttid = 0;
-	int got;
 	struct ceph_mds_client *mdsc = &client->mdsc;
+	struct ceph_mds_request *req;
+	struct rb_node *rp;
 	int pathlen;
 	u64 pathbase;
 	char *path;
 
 	mutex_lock(&mdsc->mutex);
-	while (nexttid < mdsc->last_tid) {
-		got = radix_tree_gang_lookup(&mdsc->request_tree,
-					     (void **)&req, nexttid, 1);
-		if (got == 0)
-			break;
-		nexttid = req->r_tid + 1;
+	for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
+		req = rb_entry(rp, struct ceph_mds_request, r_node);
 
 		if (req->r_request)
 			seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index aa8506bad42d..81840d6b68a4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -255,6 +255,7 @@ static const char *session_state_name(int s)
 	case CEPH_MDS_SESSION_OPEN: return "open";
 	case CEPH_MDS_SESSION_HUNG: return "hung";
 	case CEPH_MDS_SESSION_CLOSING: return "closing";
+	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
 	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
 	default: return "???";
 	}
@@ -448,10 +449,42 @@ static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
 					     u64 tid)
 {
 	struct ceph_mds_request *req;
-	req = radix_tree_lookup(&mdsc->request_tree, tid);
-	if (req)
-		ceph_mdsc_get_request(req);
-	return req;
+	struct rb_node *n = mdsc->request_tree.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_mds_request, r_node);
+		if (tid < req->r_tid)
+			n = n->rb_left;
+		else if (tid > req->r_tid)
+			n = n->rb_right;
+		else {
+			ceph_mdsc_get_request(req);
+			return req;
+		}
+	}
+	return NULL;
+}
+
+static void __insert_request(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_request *new)
+{
+	struct rb_node **p = &mdsc->request_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_mds_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_mds_request, r_node);
+		if (new->r_tid < req->r_tid)
+			p = &(*p)->rb_left;
+		else if (new->r_tid > req->r_tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->r_node, parent, p);
+	rb_insert_color(&new->r_node, &mdsc->request_tree);
 }
 
 /*
@@ -469,7 +502,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
 		ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
 	dout("__register_request %p tid %lld\n", req, req->r_tid);
 	ceph_mdsc_get_request(req);
-	radix_tree_insert(&mdsc->request_tree, req->r_tid, (void *)req);
+	__insert_request(mdsc, req);
 
 	if (dir) {
 		struct ceph_inode_info *ci = ceph_inode(dir);
@@ -485,7 +518,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 				 struct ceph_mds_request *req)
 {
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
-	radix_tree_delete(&mdsc->request_tree, req->r_tid);
+	rb_erase(&req->r_node, &mdsc->request_tree);
 	ceph_mdsc_put_request(req);
 
 	if (req->r_unsafe_dir) {
@@ -1115,17 +1148,25 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
 }
 
 /*
- * return oldest (lowest) tid in request tree, 0 if none.
+ * return oldest (lowest) request, tid in request tree, 0 if none.
  *
  * called under mdsc->mutex.
  */
+static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
+{
+	if (RB_EMPTY_ROOT(&mdsc->request_tree))
+		return NULL;
+	return rb_entry(rb_first(&mdsc->request_tree),
+			struct ceph_mds_request, r_node);
+}
+
 static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
 {
-	struct ceph_mds_request *first;
-	if (radix_tree_gang_lookup(&mdsc->request_tree,
-				   (void **)&first, 0, 1) <= 0)
-		return 0;
-	return first->r_tid;
+	struct ceph_mds_request *req = __get_oldest_req(mdsc);
+
+	if (req)
+		return req->r_tid;
+	return 0;
 }
 
 /*
@@ -1540,26 +1581,19 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
  */
 static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
 {
-	struct ceph_mds_request *reqs[10];
-	u64 nexttid = 0;
-	int i, got;
+	struct ceph_mds_request *req;
+	struct rb_node *p;
 
 	dout("kick_requests mds%d\n", mds);
-	while (nexttid <= mdsc->last_tid) {
-		got = radix_tree_gang_lookup(&mdsc->request_tree,
-					     (void **)&reqs, nexttid, 10);
-		if (got == 0)
-			break;
-		nexttid = reqs[got-1]->r_tid + 1;
-		for (i = 0; i < got; i++) {
-			if (reqs[i]->r_got_unsafe)
-				continue;
-			if (reqs[i]->r_session &&
-			    reqs[i]->r_session->s_mds == mds) {
-				dout(" kicking tid %llu\n", reqs[i]->r_tid);
-				put_request_session(reqs[i]);
-				__do_request(mdsc, reqs[i]);
-			}
+	for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mds_request, r_node);
+		if (req->r_got_unsafe)
+			continue;
+		if (req->r_session &&
+		    req->r_session->s_mds == mds) {
+			dout(" kicking tid %llu\n", req->r_tid);
+			put_request_session(req);
+			__do_request(mdsc, req);
 		}
 	}
 }
@@ -1748,7 +1782,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 			list_del_init(&req->r_unsafe_item);
 
 			/* last unsafe request during umount? */
-			if (mdsc->stopping && !__get_oldest_tid(mdsc))
+			if (mdsc->stopping && !__get_oldest_req(mdsc))
 				complete(&mdsc->safe_umount_waiters);
 			mutex_unlock(&mdsc->mutex);
 			goto out;
@@ -2573,7 +2607,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	INIT_LIST_HEAD(&mdsc->snap_empty);
 	spin_lock_init(&mdsc->snap_empty_lock);
 	mdsc->last_tid = 0;
-	INIT_RADIX_TREE(&mdsc->request_tree, GFP_NOFS);
+	mdsc->request_tree = RB_ROOT;
 	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
 	mdsc->last_renew_caps = jiffies;
 	INIT_LIST_HEAD(&mdsc->cap_delay_list);
@@ -2600,20 +2634,19 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 	struct ceph_client *client = mdsc->client;
 
 	mutex_lock(&mdsc->mutex);
-	if (__get_oldest_tid(mdsc)) {
+	if (__get_oldest_req(mdsc)) {
 		mutex_unlock(&mdsc->mutex);
+
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
 				    client->mount_args->mount_timeout * HZ);
-		mutex_lock(&mdsc->mutex);
 
 		/* tear down remaining requests */
-		while (radix_tree_gang_lookup(&mdsc->request_tree,
-					      (void **)&req, 0, 1)) {
+		mutex_lock(&mdsc->mutex);
+		while ((req = __get_oldest_req(mdsc))) {
 			dout("wait_requests timed out on tid %llu\n",
 			     req->r_tid);
-			radix_tree_delete(&mdsc->request_tree, req->r_tid);
-			ceph_mdsc_put_request(req);
+			__unregister_request(mdsc, req);
 		}
 	}
 	mutex_unlock(&mdsc->mutex);
@@ -2639,31 +2672,29 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
  */
 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
 {
-	struct ceph_mds_request *req;
-	u64 next_tid = 0;
-	int got;
+	struct ceph_mds_request *req = NULL;
+	struct rb_node *n;
 
 	mutex_lock(&mdsc->mutex);
 	dout("wait_unsafe_requests want %lld\n", want_tid);
-	while (1) {
-		got = radix_tree_gang_lookup(&mdsc->request_tree, (void **)&req,
-					     next_tid, 1);
-		if (!got)
-			break;
-		if (req->r_tid > want_tid)
+	req = __get_oldest_req(mdsc);
+	while (req && req->r_tid <= want_tid) {
+		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+			/* write op */
+			ceph_mdsc_get_request(req);
+			mutex_unlock(&mdsc->mutex);
+			dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
+			     req->r_tid, want_tid);
+			wait_for_completion(&req->r_safe_completion);
+			mutex_lock(&mdsc->mutex);
+			n = rb_next(&req->r_node);
+			ceph_mdsc_put_request(req);
+		} else {
+			n = rb_next(&req->r_node);
+		}
+		if (!n)
 			break;
-
-		next_tid = req->r_tid + 1;
-		if ((req->r_op & CEPH_MDS_OP_WRITE) == 0)
-			continue;  /* not a write op */
-
-		ceph_mdsc_get_request(req);
-		mutex_unlock(&mdsc->mutex);
-		dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
-		     req->r_tid, want_tid);
-		wait_for_completion(&req->r_safe_completion);
-		mutex_lock(&mdsc->mutex);
-		ceph_mdsc_put_request(req);
+		req = rb_entry(n, struct ceph_mds_request, r_node);
 	}
 	mutex_unlock(&mdsc->mutex);
 	dout("wait_unsafe_requests done\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ee71495e27c4..98f09cd06006 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -6,6 +6,7 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 #include <linux/spinlock.h>
 
 #include "types.h"
@@ -150,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
  */
 struct ceph_mds_request {
 	u64 r_tid;                   /* transaction id */
+	struct rb_node r_node;
 
 	int r_op;                    /* mds op code */
 	int r_mds;
@@ -249,7 +251,7 @@ struct ceph_mds_client {
 	spinlock_t              snap_empty_lock;  /* protect snap_empty */
 
 	u64                    last_tid;      /* most recent mds request */
-	struct radix_tree_root request_tree;  /* pending mds requests */
+	struct rb_root         request_tree;  /* pending mds requests */
 	struct delayed_work    delayed_work;  /* delayed work */
 	unsigned long    last_renew_caps;  /* last time we renewed our caps */
 	struct list_head cap_delay_list;   /* caps with delayed release */
-- 
cgit v1.2.2


From a105f00cf17d711e876b3dc67e15f9a89b7de5a3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Feb 2010 14:37:55 -0800
Subject: ceph: use rbtree for snap_realms

Switch from radix tree to rbtree for snap realms.  This is much more
appropriate given that realm keys are few and far between.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 16 +++++-----------
 fs/ceph/mds_client.h |  3 +--
 fs/ceph/snap.c       | 51 ++++++++++++++++++++++++++++++++++++++++-----------
 fs/ceph/super.h      |  2 ++
 4 files changed, 48 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 81840d6b68a4..02834cecc3a0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2097,9 +2097,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 {
 	struct ceph_mds_session *session = NULL;
 	struct ceph_msg *reply;
+	struct rb_node *p;
 	int err;
-	int got;
-	u64 next_snap_ino = 0;
 	struct ceph_pagelist *pagelist;
 
 	pr_info("reconnect to recovering mds%d\n", mds);
@@ -2155,14 +2154,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 	 * parent for all of our realms.  If the mds has any newer info,
 	 * it will tell us.
 	 */
-	next_snap_ino = 0;
-	while (1) {
-		struct ceph_snap_realm *realm;
+	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+		struct ceph_snap_realm *realm =
+			rb_entry(p, struct ceph_snap_realm, node);
 		struct ceph_mds_snaprealm_reconnect sr_rec;
-		got = radix_tree_gang_lookup(&mdsc->snap_realms,
-					     (void **)&realm, next_snap_ino, 1);
-		if (!got)
-			break;
 
 		dout(" adding snap realm %llx seq %lld parent %llx\n",
 		     realm->ino, realm->seq, realm->parent_ino);
@@ -2172,7 +2167,6 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
 		if (err)
 			goto fail;
-		next_snap_ino = realm->ino + 1;
 	}
 
 send:
@@ -2603,7 +2597,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 	mdsc->max_sessions = 0;
 	mdsc->stopping = 0;
 	init_rwsem(&mdsc->snap_rwsem);
-	INIT_RADIX_TREE(&mdsc->snap_realms, GFP_NOFS);
+	mdsc->snap_realms = RB_ROOT;
 	INIT_LIST_HEAD(&mdsc->snap_empty);
 	spin_lock_init(&mdsc->snap_empty_lock);
 	mdsc->last_tid = 0;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 98f09cd06006..9d6b90173879 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -5,7 +5,6 @@
 #include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
-#include <linux/radix-tree.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
 
@@ -246,7 +245,7 @@ struct ceph_mds_client {
 	 * should be destroyed.
 	 */
 	struct rw_semaphore     snap_rwsem;
-	struct radix_tree_root  snap_realms;
+	struct rb_root          snap_realms;
 	struct list_head        snap_empty;
 	spinlock_t              snap_empty_lock;  /* protect snap_empty */
 
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index dcf18d92130a..49d0c4c59d81 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,6 +1,5 @@
 #include "ceph_debug.h"
 
-#include <linux/radix-tree.h>
 #include <linux/sort.h>
 
 #include "super.h"
@@ -77,6 +76,28 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
 	atomic_inc(&realm->nref);
 }
 
+static void __insert_snap_realm(struct rb_root *root,
+				struct ceph_snap_realm *new)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_snap_realm *r = NULL;
+
+	while (*p) {
+		parent = *p;
+		r = rb_entry(parent, struct ceph_snap_realm, node);
+		if (new->ino < r->ino)
+			p = &(*p)->rb_left;
+		else if (new->ino > r->ino)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+}
+
 /*
  * create and get the realm rooted at @ino and bump its ref count.
  *
@@ -92,8 +113,6 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
 	if (!realm)
 		return ERR_PTR(-ENOMEM);
 
-	radix_tree_insert(&mdsc->snap_realms, ino, realm);
-
 	atomic_set(&realm->nref, 0);    /* tree does not take a ref */
 	realm->ino = ino;
 	INIT_LIST_HEAD(&realm->children);
@@ -101,24 +120,34 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
 	INIT_LIST_HEAD(&realm->empty_item);
 	INIT_LIST_HEAD(&realm->inodes_with_caps);
 	spin_lock_init(&realm->inodes_with_caps_lock);
+	__insert_snap_realm(&mdsc->snap_realms, realm);
 	dout("create_snap_realm %llx %p\n", realm->ino, realm);
 	return realm;
 }
 
 /*
- * find and get (if found) the realm rooted at @ino and bump its ref count.
+ * lookup the realm rooted at @ino.
  *
  * caller must hold snap_rwsem for write.
  */
 struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
 					       u64 ino)
 {
-	struct ceph_snap_realm *realm;
-
-	realm = radix_tree_lookup(&mdsc->snap_realms, ino);
-	if (realm)
-		dout("lookup_snap_realm %llx %p\n", realm->ino, realm);
-	return realm;
+	struct rb_node *n = mdsc->snap_realms.rb_node;
+	struct ceph_snap_realm *r;
+
+	while (n) {
+		r = rb_entry(n, struct ceph_snap_realm, node);
+		if (ino < r->ino)
+			n = n->rb_left;
+		else if (ino > r->ino)
+			n = n->rb_right;
+		else {
+			dout("lookup_snap_realm %llx %p\n", r->ino, r);
+			return r;
+		}
+	}
+	return NULL;
 }
 
 static void __put_snap_realm(struct ceph_mds_client *mdsc,
@@ -132,7 +161,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
 {
 	dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
 
-	radix_tree_delete(&mdsc->snap_realms, realm->ino);
+	rb_erase(&realm->node, &mdsc->snap_realms);
 
 	if (realm->parent) {
 		list_del_init(&realm->child_item);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b2adfccbab98..1f3928785e12 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -656,6 +656,8 @@ static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
 struct ceph_snap_realm {
 	u64 ino;
 	atomic_t nref;
+	struct rb_node node;
+
 	u64 created, seq;
 	u64 parent_ino;
 	u64 parent_since;   /* snapid when our current parent became so */
-- 
cgit v1.2.2


From 85ff03f6bfef7d5b59ab3aefd4773f497ffad8a4 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Feb 2010 14:47:28 -0800
Subject: ceph: use rbtree for mon statfs requests

An rbtree is lighter weight, particularly given we will generally have
very few in-flight statfs requests.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c    | 14 ++++-------
 fs/ceph/mon_client.c | 67 ++++++++++++++++++++++++++++++++++++----------------
 fs/ceph/mon_client.h |  5 ++--
 3 files changed, 54 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index cd5dd805e4be..b58bd9188692 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -112,9 +112,8 @@ static int monc_show(struct seq_file *s, void *p)
 {
 	struct ceph_client *client = s->private;
 	struct ceph_mon_statfs_request *req;
-	u64 nexttid = 0;
-	int got;
 	struct ceph_mon_client *monc = &client->monc;
+	struct rb_node *rp;
 
 	mutex_lock(&monc->mutex);
 
@@ -125,17 +124,12 @@ static int monc_show(struct seq_file *s, void *p)
 	if (monc->want_next_osdmap)
 		seq_printf(s, "want next osdmap\n");
 
-	while (nexttid < monc->last_tid) {
-		got = radix_tree_gang_lookup(&monc->statfs_request_tree,
-					     (void **)&req, nexttid, 1);
-		if (got == 0)
-			break;
-		nexttid = req->tid + 1;
-
+	for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
+		req = rb_entry(rp, struct ceph_mon_statfs_request, node);
 		seq_printf(s, "%lld statfs\n", req->tid);
 	}
-	mutex_unlock(&monc->mutex);
 
+	mutex_unlock(&monc->mutex);
 	return 0;
 }
 
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index fec41a0eff86..542276e60798 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -343,6 +343,46 @@ out:
 /*
  * statfs
  */
+static struct ceph_mon_statfs_request *__lookup_statfs(
+	struct ceph_mon_client *monc, u64 tid)
+{
+	struct ceph_mon_statfs_request *req;
+	struct rb_node *n = monc->statfs_request_tree.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_mon_statfs_request, node);
+		if (tid < req->tid)
+			n = n->rb_left;
+		else if (tid > req->tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+static void __insert_statfs(struct ceph_mon_client *monc,
+			    struct ceph_mon_statfs_request *new)
+{
+	struct rb_node **p = &monc->statfs_request_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_mon_statfs_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_mon_statfs_request, node);
+		if (new->tid < req->tid)
+			p = &(*p)->rb_left;
+		else if (new->tid > req->tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, &monc->statfs_request_tree);
+}
+
 static void handle_statfs_reply(struct ceph_mon_client *monc,
 				struct ceph_msg *msg)
 {
@@ -356,7 +396,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
 	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
 
 	mutex_lock(&monc->mutex);
-	req = radix_tree_lookup(&monc->statfs_request_tree, tid);
+	req = __lookup_statfs(monc, tid);
 	if (req) {
 		*req->buf = reply->st;
 		req->result = 0;
@@ -416,11 +456,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 	req.tid = ++monc->last_tid;
 	req.last_attempt = jiffies;
 	req.delay = BASE_DELAY_INTERVAL;
-	if (radix_tree_insert(&monc->statfs_request_tree, req.tid, &req) < 0) {
-		mutex_unlock(&monc->mutex);
-		pr_err("ENOMEM in do_statfs\n");
-		return -ENOMEM;
-	}
+	__insert_statfs(monc, &req);
 	monc->num_statfs_requests++;
 	mutex_unlock(&monc->mutex);
 
@@ -430,7 +466,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
 		err = wait_for_completion_interruptible(&req.completion);
 
 	mutex_lock(&monc->mutex);
-	radix_tree_delete(&monc->statfs_request_tree, req.tid);
+	rb_erase(&req.node, &monc->statfs_request_tree);
 	monc->num_statfs_requests--;
 	ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
 	mutex_unlock(&monc->mutex);
@@ -445,20 +481,11 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
  */
 static void __resend_statfs(struct ceph_mon_client *monc)
 {
-	u64 next_tid = 0;
-	int got;
-	int did = 0;
 	struct ceph_mon_statfs_request *req;
+	struct rb_node *p;
 
-	while (1) {
-		got = radix_tree_gang_lookup(&monc->statfs_request_tree,
-					     (void **)&req,
-					     next_tid, 1);
-		if (got == 0)
-			break;
-		did++;
-		next_tid = req->tid + 1;
-
+	for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mon_statfs_request, node);
 		send_statfs(monc, req);
 	}
 }
@@ -578,7 +605,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 	monc->sub_sent = 0;
 
 	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-	INIT_RADIX_TREE(&monc->statfs_request_tree, GFP_NOFS);
+	monc->statfs_request_tree = RB_ROOT;
 	monc->num_statfs_requests = 0;
 	monc->last_tid = 0;
 
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 5ca8e48d4379..b958ad5afa06 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,7 +2,7 @@
 #define _FS_CEPH_MON_CLIENT_H
 
 #include <linux/completion.h>
-#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 
 #include "messenger.h"
 #include "msgpool.h"
@@ -45,6 +45,7 @@ struct ceph_mon_request {
  */
 struct ceph_mon_statfs_request {
 	u64 tid;
+	struct rb_node node;
 	int result;
 	struct ceph_statfs *buf;
 	struct completion completion;
@@ -75,7 +76,7 @@ struct ceph_mon_client {
 	struct ceph_msgpool msgpool_auth_reply;
 
 	/* pending statfs requests */
-	struct radix_tree_root statfs_request_tree;
+	struct rb_root statfs_request_tree;
 	int num_statfs_requests;
 	u64 last_tid;
 
-- 
cgit v1.2.2


From 5ce6e9dbe6805ab8ee67e21936d17f431adc63c6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Feb 2010 16:22:28 -0800
Subject: ceph: fix authentication races, auth_none oops

Call __validate_auth() under monc->mutex, and use helper for
initial hello so that the pending_auth flag is set.  This fixes
possible races in which we have an authentication request (hello
or otherwise) pending and send another one.  In particular, with
auth_none, we _never_ want to call ceph_build_auth() from
__validate_auth(), since the ->build_request() method is NULL.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mon_client.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 542276e60798..40d7d90bbed1 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -95,6 +95,18 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
 	return 0;
 }
 
+/*
+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+	monc->pending_auth = 1;
+	monc->m_auth->front.iov_len = len;
+	monc->m_auth->hdr.front_len = cpu_to_le32(len);
+	ceph_msg_get(monc->m_auth);  /* keep our ref */
+	ceph_con_send(monc->con, monc->m_auth);
+}
+
 /*
  * Close monitor session, if any.
  */
@@ -137,10 +149,7 @@ static int __open_session(struct ceph_mon_client *monc)
 		ret = ceph_auth_build_hello(monc->auth,
 					    monc->m_auth->front.iov_base,
 					    monc->m_auth->front_max);
-		monc->m_auth->front.iov_len = ret;
-		monc->m_auth->hdr.front_len = cpu_to_le32(ret);
-		ceph_msg_get(monc->m_auth);  /* keep our ref */
-		ceph_con_send(monc->con, monc->m_auth);
+		__send_prepared_auth_request(monc, ret);
 	} else {
 		dout("open_session mon%d already open\n", monc->cur_mon);
 	}
@@ -507,11 +516,9 @@ static void delayed_work(struct work_struct *work)
 		__open_session(monc);  /* continue hunting */
 	} else {
 		ceph_con_keepalive(monc->con);
-		mutex_unlock(&monc->mutex);
 
 		__validate_auth(monc);
 
-		mutex_lock(&monc->mutex);
 		if (monc->auth->ops->is_authenticated(monc->auth))
 			__send_subscribe(monc);
 	}
@@ -650,16 +657,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
 	kfree(monc->monmap);
 }
 
-static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
-{
-	monc->pending_auth = 1;
-	monc->m_auth->front.iov_len = len;
-	monc->m_auth->hdr.front_len = cpu_to_le32(len);
-	ceph_msg_get(monc->m_auth);  /* keep our ref */
-	ceph_con_send(monc->con, monc->m_auth);
-}
-
-
 static void handle_auth_reply(struct ceph_mon_client *monc,
 			      struct ceph_msg *msg)
 {
-- 
cgit v1.2.2


From 85ccce43a3fc15a40ded6ae1603e3f68a17f4d24 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 17 Feb 2010 10:02:43 -0800
Subject: ceph: clean up readdir caps reservation

Use a global counter for the minimum number of allocated caps instead of
hard coding a check against readdir_max.  This takes into account multiple
client instances, and avoids examining the superblock mount options when a
cap is dropped.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c    | 23 +++++++++++++++++------
 fs/ceph/debugfs.c | 13 +++++++------
 fs/ceph/super.c   |  5 +++++
 fs/ceph/super.h   |  5 ++++-
 4 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ab9b571dda11..f94b56faba3b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -128,6 +128,7 @@ static int caps_total_count;        /* total caps allocated */
 static int caps_use_count;          /* in use */
 static int caps_reserve_count;      /* unused, reserved */
 static int caps_avail_count;        /* unused, unreserved */
+static int caps_min_count;          /* keep at least this many (unreserved) */
 
 void __init ceph_caps_init(void)
 {
@@ -149,6 +150,15 @@ void ceph_caps_finalize(void)
 	caps_avail_count = 0;
 	caps_use_count = 0;
 	caps_reserve_count = 0;
+	caps_min_count = 0;
+	spin_unlock(&caps_list_lock);
+}
+
+void ceph_adjust_min_caps(int delta)
+{
+	spin_lock(&caps_list_lock);
+	caps_min_count += delta;
+	BUG_ON(caps_min_count < 0);
 	spin_unlock(&caps_list_lock);
 }
 
@@ -265,12 +275,10 @@ static void put_cap(struct ceph_cap *cap,
 	     caps_reserve_count, caps_avail_count);
 	caps_use_count--;
 	/*
-	 * Keep some preallocated caps around, at least enough to do a
-	 * readdir (which needs to preallocate lots of them), to avoid
-	 * lots of free/alloc churn.
+	 * Keep some preallocated caps around (ceph_min_count), to
+	 * avoid lots of free/alloc churn.
 	 */
-	if (caps_avail_count >= caps_reserve_count +
-	    ceph_client(cap->ci->vfs_inode.i_sb)->mount_args->max_readdir) {
+	if (caps_avail_count >= caps_reserve_count + caps_min_count) {
 		caps_total_count--;
 		kmem_cache_free(ceph_cap_cachep, cap);
 	} else {
@@ -289,7 +297,8 @@ static void put_cap(struct ceph_cap *cap,
 }
 
 void ceph_reservation_status(struct ceph_client *client,
-			     int *total, int *avail, int *used, int *reserved)
+			     int *total, int *avail, int *used, int *reserved,
+			     int *min)
 {
 	if (total)
 		*total = caps_total_count;
@@ -299,6 +308,8 @@ void ceph_reservation_status(struct ceph_client *client,
 		*used = caps_use_count;
 	if (reserved)
 		*reserved = caps_reserve_count;
+	if (min)
+		*min = caps_min_count;
 }
 
 /*
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index b58bd9188692..1a47b5c25b5f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -255,14 +255,15 @@ static int osdc_show(struct seq_file *s, void *pp)
 static int caps_show(struct seq_file *s, void *p)
 {
 	struct ceph_client *client = p;
-	int total, avail, used, reserved;
+	int total, avail, used, reserved, min;
 
-	ceph_reservation_status(client, &total, &avail, &used, &reserved);
+	ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
 	seq_printf(s, "total\t\t%d\n"
-		      "avail\t\t%d\n"
-		      "used\t\t%d\n"
-		      "reserved\t%d\n",
-		   total, avail, used, reserved);
+		   "avail\t\t%d\n"
+		   "used\t\t%d\n"
+		   "reserved\t%d\n"
+		   "min\t%d\n",
+		   total, avail, used, reserved, min);
 	return 0;
 }
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 39aaf29a04a0..74953be75f8f 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -578,6 +578,9 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
 	if (!client->wb_pagevec_pool)
 		goto fail_trunc_wq;
 
+	/* caps */
+	client->min_caps = args->max_readdir;
+	ceph_adjust_min_caps(client->min_caps);
 
 	/* subsystems */
 	err = ceph_monc_init(&client->monc, client);
@@ -619,6 +622,8 @@ static void ceph_destroy_client(struct ceph_client *client)
 	ceph_monc_stop(&client->monc);
 	ceph_osdc_stop(&client->osdc);
 
+	ceph_adjust_min_caps(-client->min_caps);
+
 	ceph_debugfs_client_cleanup(client);
 	destroy_workqueue(client->wb_wq);
 	destroy_workqueue(client->pg_inv_wq);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1f3928785e12..3b5faf9980f8 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -129,6 +129,8 @@ struct ceph_client {
 
 	int auth_err;
 
+	int min_caps;                  /* min caps i added */
+
 	struct ceph_messenger *msgr;   /* messenger instance */
 	struct ceph_mon_client monc;
 	struct ceph_mds_client mdsc;
@@ -557,11 +559,12 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
 
 extern void ceph_caps_init(void);
 extern void ceph_caps_finalize(void);
+extern void ceph_adjust_min_caps(int delta);
 extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
 extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
 extern void ceph_reservation_status(struct ceph_client *client,
 				    int *total, int *avail, int *used,
-				    int *reserved);
+				    int *reserved, int *min);
 
 static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
 {
-- 
cgit v1.2.2


From 7c1332b8cb5b27656cf6ab1f5fe808a8eb8bb2c0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Feb 2010 11:39:45 -0800
Subject: ceph: fix iterate_caps removal race

We need to be able to iterate over all caps on a session with a
possibly slow callback on each cap.  To allow this, we used to
prevent cap reordering while we were iterating.  However, we were
not safe from races with removal: removing the 'next' cap would
make the next pointer from list_for_each_entry_safe be invalid,
and cause a lock up or similar badness.

Instead, we keep an iterator pointer in the session pointing to
the current cap.  As before, we avoid reordering.  For removal,
if the cap isn't the current cap we are iterating over, we are
fine.  If it is, we clear cap->ci (to mark the cap as pending
removal) but leave it in the session list.  In iterate_caps, we
can safely finish removal and get the next cap pointer.

While we're at it, clean up put_cap to not take a cap reservation
context, as it was never used.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 47 ++++++++++++++++++++++++-----------------------
 fs/ceph/mds_client.c | 51 ++++++++++++++++++++++++++++++++++++++++++---------
 fs/ceph/mds_client.h |  2 +-
 fs/ceph/super.h      |  6 +++---
 4 files changed, 70 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f94b56faba3b..4958a2ef3e04 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -266,12 +266,11 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
 	return cap;
 }
 
-static void put_cap(struct ceph_cap *cap,
-		    struct ceph_cap_reservation *ctx)
+void ceph_put_cap(struct ceph_cap *cap)
 {
 	spin_lock(&caps_list_lock);
-	dout("put_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
-	     ctx, ctx ? ctx->count : 0, caps_total_count, caps_use_count,
+	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
+	     cap, caps_total_count, caps_use_count,
 	     caps_reserve_count, caps_avail_count);
 	caps_use_count--;
 	/*
@@ -282,12 +281,7 @@ static void put_cap(struct ceph_cap *cap,
 		caps_total_count--;
 		kmem_cache_free(ceph_cap_cachep, cap);
 	} else {
-		if (ctx) {
-			ctx->count++;
-			caps_reserve_count++;
-		} else {
-			caps_avail_count++;
-		}
+		caps_avail_count++;
 		list_add(&cap->caps_item, &caps_list);
 	}
 
@@ -709,7 +703,7 @@ static void __touch_cap(struct ceph_cap *cap)
 	struct ceph_mds_session *s = cap->session;
 
 	spin_lock(&s->s_cap_lock);
-	if (!s->s_iterating_caps) {
+	if (s->s_cap_iterator == NULL) {
 		dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
 		     s->s_mds);
 		list_move_tail(&cap->session_caps, &s->s_caps);
@@ -865,8 +859,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
  * caller should hold i_lock, and session s_mutex.
  * returns true if this is the last cap.  if so, caller should iput.
  */
-void __ceph_remove_cap(struct ceph_cap *cap,
-		       struct ceph_cap_reservation *ctx)
+void __ceph_remove_cap(struct ceph_cap *cap)
 {
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
@@ -874,19 +867,27 @@ void __ceph_remove_cap(struct ceph_cap *cap,
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
 
-	/* remove from session list */
-	spin_lock(&session->s_cap_lock);
-	list_del_init(&cap->session_caps);
-	session->s_nr_caps--;
-	spin_unlock(&session->s_cap_lock);
-
 	/* remove from inode list */
 	rb_erase(&cap->ci_node, &ci->i_caps);
-	cap->session = NULL;
+	cap->ci = NULL;
 	if (ci->i_auth_cap == cap)
 		ci->i_auth_cap = NULL;
 
-	put_cap(cap, ctx);
+	/* remove from session list */
+	spin_lock(&session->s_cap_lock);
+	if (session->s_cap_iterator == cap) {
+		/* not yet, we are iterating over this very cap */
+		dout("__ceph_remove_cap  delaying %p removal from session %p\n",
+		     cap, cap->session);
+	} else {
+		list_del_init(&cap->session_caps);
+		session->s_nr_caps--;
+		cap->session = NULL;
+	}
+	spin_unlock(&session->s_cap_lock);
+
+	if (cap->session == NULL)
+		ceph_put_cap(cap);
 
 	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
 		struct ceph_snap_realm *realm = ci->i_snap_realm;
@@ -1022,7 +1023,7 @@ void ceph_queue_caps_release(struct inode *inode)
 		}
 		spin_unlock(&session->s_cap_lock);
 		p = rb_next(p);
-		__ceph_remove_cap(cap, NULL);
+		__ceph_remove_cap(cap);
 
 	}
 	spin_unlock(&inode->i_lock);
@@ -2521,7 +2522,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			ci->i_cap_exporting_mseq = mseq;
 			ci->i_cap_exporting_issued = cap->issued;
 		}
-		__ceph_remove_cap(cap, NULL);
+		__ceph_remove_cap(cap);
 	} else {
 		WARN_ON(!cap);
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 02834cecc3a0..124c0c17a14a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -344,7 +344,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	INIT_LIST_HEAD(&s->s_waiting);
 	INIT_LIST_HEAD(&s->s_unsafe);
 	s->s_num_cap_releases = 0;
-	s->s_iterating_caps = false;
+	s->s_cap_iterator = NULL;
 	INIT_LIST_HEAD(&s->s_cap_releases);
 	INIT_LIST_HEAD(&s->s_cap_releases_done);
 	INIT_LIST_HEAD(&s->s_cap_flushing);
@@ -729,28 +729,61 @@ static int iterate_session_caps(struct ceph_mds_session *session,
 				 int (*cb)(struct inode *, struct ceph_cap *,
 					    void *), void *arg)
 {
-	struct ceph_cap *cap, *ncap;
-	struct inode *inode;
+	struct list_head *p;
+	struct ceph_cap *cap;
+	struct inode *inode, *last_inode = NULL;
+	struct ceph_cap *old_cap = NULL;
 	int ret;
 
 	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
 	spin_lock(&session->s_cap_lock);
-	session->s_iterating_caps = true;
-	list_for_each_entry_safe(cap, ncap, &session->s_caps, session_caps) {
+	p = session->s_caps.next;
+	while (p != &session->s_caps) {
+		cap = list_entry(p, struct ceph_cap, session_caps);
 		inode = igrab(&cap->ci->vfs_inode);
-		if (!inode)
+		if (!inode) {
+			p = p->next;
 			continue;
+		}
+		session->s_cap_iterator = cap;
 		spin_unlock(&session->s_cap_lock);
+
+		if (last_inode) {
+			iput(last_inode);
+			last_inode = NULL;
+		}
+		if (old_cap) {
+			ceph_put_cap(old_cap);
+			old_cap = NULL;
+		}
+
 		ret = cb(inode, cap, arg);
-		iput(inode);
+		last_inode = inode;
+
 		spin_lock(&session->s_cap_lock);
+		p = p->next;
+		if (cap->ci == NULL) {
+			dout("iterate_session_caps  finishing cap %p removal\n",
+			     cap);
+			BUG_ON(cap->session != session);
+			list_del_init(&cap->session_caps);
+			session->s_nr_caps--;
+			cap->session = NULL;
+			old_cap = cap;  /* put_cap it w/o locks held */
+		}
 		if (ret < 0)
 			goto out;
 	}
 	ret = 0;
 out:
-	session->s_iterating_caps = false;
+	session->s_cap_iterator = NULL;
 	spin_unlock(&session->s_cap_lock);
+
+	if (last_inode)
+		iput(last_inode);
+	if (old_cap)
+		ceph_put_cap(old_cap);
+
 	return ret;
 }
 
@@ -942,7 +975,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 	session->s_trim_caps--;
 	if (oissued) {
 		/* we aren't the only cap.. just remove us */
-		__ceph_remove_cap(cap, NULL);
+		__ceph_remove_cap(cap);
 	} else {
 		/* try to drop referring dentries */
 		spin_unlock(&inode->i_lock);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 9d6b90173879..961cc6f65878 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -114,7 +114,7 @@ struct ceph_mds_session {
 	int               s_num_cap_releases;
 	struct list_head  s_cap_releases; /* waiting cap_release messages */
 	struct list_head  s_cap_releases_done; /* ready to send */
-	bool              s_iterating_caps;
+	struct ceph_cap  *s_cap_iterator;
 
 	/* protected by mutex */
 	struct list_head  s_cap_flushing;     /* inodes w/ flushing caps */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3b5faf9980f8..384f0e2e7c68 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -795,15 +795,15 @@ extern int ceph_add_cap(struct inode *inode,
 			int fmode, unsigned issued, unsigned wanted,
 			unsigned cap, unsigned seq, u64 realmino, int flags,
 			struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap,
-			      struct ceph_cap_reservation *ctx);
+extern void __ceph_remove_cap(struct ceph_cap *cap);
 static inline void ceph_remove_cap(struct ceph_cap *cap)
 {
 	struct inode *inode = &cap->ci->vfs_inode;
 	spin_lock(&inode->i_lock);
-	__ceph_remove_cap(cap, NULL);
+	__ceph_remove_cap(cap);
 	spin_unlock(&inode->i_lock);
 }
+extern void ceph_put_cap(struct ceph_cap *cap);
 
 extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, int unused);
-- 
cgit v1.2.2


From 9794b146fa7b93f8ab74fb62d67fdefad760769f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Feb 2010 15:53:32 -0800
Subject: ceph: fix memory leak when destroying osdmap with pg_temp mappings

Also move _lookup_pg_mapping into a helper.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 49 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index a6afe3836f7e..443fdcdb19c4 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -321,8 +321,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
 	dout("osdmap_destroy %p\n", map);
 	if (map->crush)
 		crush_destroy(map->crush);
-	while (!RB_EMPTY_ROOT(&map->pg_temp))
-		rb_erase(rb_first(&map->pg_temp), &map->pg_temp);
+	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(rb_first(&map->pg_temp),
+				 struct ceph_pg_mapping, node);
+		rb_erase(&pg->node, &map->pg_temp);
+		kfree(pg);
+	}
 	kfree(map->osd_state);
 	kfree(map->osd_weight);
 	kfree(map->pg_pool);
@@ -367,7 +372,8 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
 }
 
 /*
- * Insert a new pg_temp mapping
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
  */
 static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
 {
@@ -406,6 +412,26 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
 	return 0;
 }
 
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+						   struct ceph_pg pgid)
+{
+	struct rb_node *n = root->rb_node;
+	struct ceph_pg_mapping *pg;
+	int c;
+
+	while (n) {
+		pg = rb_entry(n, struct ceph_pg_mapping, node);
+		c = pgid_cmp(pgid, pg->pgid);
+		if (c < 0)
+			n = n->rb_left;
+		else if (c > 0)
+			n = n->rb_right;
+		else
+			return pg;
+	}
+	return NULL;
+}
+
 /*
  * decode a full map.
  */
@@ -870,26 +896,17 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 			int *osds, int *num)
 {
-	struct rb_node *n = osdmap->pg_temp.rb_node;
 	struct ceph_pg_mapping *pg;
 	struct ceph_pg_pool_info *pool;
 	int ruleno;
 	unsigned poolid, ps, pps;
 	int preferred;
-	int c;
 
 	/* pg_temp? */
-	while (n) {
-		pg = rb_entry(n, struct ceph_pg_mapping, node);
-		c = pgid_cmp(pgid, pg->pgid);
-		if (c < 0)
-			n = n->rb_left;
-		else if (c > 0)
-			n = n->rb_right;
-		else {
-			*num = pg->len;
-			return pg->osds;
-		}
+	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+	if (pg) {
+		*num = pg->len;
+		return pg->osds;
 	}
 
 	/* crush */
-- 
cgit v1.2.2


From 4fc51be8fa7043ff9a1e34fef0e99214373332ac Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Feb 2010 15:55:03 -0800
Subject: ceph: use rbtree for pg pools; decode new osdmap format

Since we can now create and destroy pg pools, the pool ids will be sparse,
and an array no longer makes sense for looking up by pool id.  Use an
rbtree instead.

The OSDMap encoding also no longer has a max pool count (previously used to
allocate the array).  There is a new pool_max, that is the largest pool id
we've ever used, although we don't actually need it in the client.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c |   7 +--
 fs/ceph/osdmap.c  | 136 +++++++++++++++++++++++++++++++++++++-----------------
 fs/ceph/osdmap.h  |   7 +--
 fs/ceph/rados.h   |   4 +-
 4 files changed, 104 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 1a47b5c25b5f..e159f1415110 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -78,6 +78,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 {
 	int i;
 	struct ceph_client *client = s->private;
+	struct rb_node *n;
 
 	if (client->osdc.osdmap == NULL)
 		return 0;
@@ -87,11 +88,11 @@ static int osdmap_show(struct seq_file *s, void *p)
 		   " NEARFULL" : "",
 		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
 		   " FULL" : "");
-	for (i = 0; i < client->osdc.osdmap->num_pools; i++) {
+	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
 		struct ceph_pg_pool_info *pool =
-			&client->osdc.osdmap->pg_pool[i];
+			rb_entry(n, struct ceph_pg_pool_info, node);
 		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
-			   i, pool->v.pg_num, pool->pg_num_mask,
+			   pool->id, pool->v.pg_num, pool->pg_num_mask,
 			   pool->v.lpg_num, pool->lpg_num_mask);
 	}
 	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 443fdcdb19c4..34b5696c84fd 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -328,9 +328,15 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
 		rb_erase(&pg->node, &map->pg_temp);
 		kfree(pg);
 	}
+	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rb_first(&map->pg_pools),
+				 struct ceph_pg_pool_info, node);
+		rb_erase(&pi->node, &map->pg_pools);
+		kfree(pi);
+	}
 	kfree(map->osd_state);
 	kfree(map->osd_weight);
-	kfree(map->pg_pool);
 	kfree(map->osd_addr);
 	kfree(map);
 }
@@ -432,6 +438,48 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
 	return NULL;
 }
 
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_pg_pool_info *pi = NULL;
+
+	while (*p) {
+		parent = *p;
+		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+		if (new->id < pi->id)
+			p = &(*p)->rb_left;
+		else if (new->id > pi->id)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+	return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+	struct ceph_pg_pool_info *pi;
+	struct rb_node *n = root->rb_node;
+
+	while (n) {
+		pi = rb_entry(n, struct ceph_pg_pool_info, node);
+		if (id < pi->id)
+			n = n->rb_left;
+		else if (id > pi->id)
+			n = n->rb_right;
+		else
+			return pi;
+	}
+	return NULL;
+}
+
 /*
  * decode a full map.
  */
@@ -443,6 +491,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	u8 ev;
 	int err = -EINVAL;
 	void *start = *p;
+	struct ceph_pg_pool_info *pi;
 
 	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
 
@@ -464,32 +513,27 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_copy(p, &map->created, sizeof(map->created));
 	ceph_decode_copy(p, &map->modified, sizeof(map->modified));
 
-	map->num_pools = ceph_decode_32(p);
-	map->pg_pool = kcalloc(map->num_pools, sizeof(*map->pg_pool),
-			       GFP_NOFS);
-	if (!map->pg_pool) {
-		err = -ENOMEM;
-		goto bad;
-	}
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
-		ceph_decode_need(p, end, 4+1+sizeof(map->pg_pool->v), bad);
-		i = ceph_decode_32(p);
-		if (i >= map->num_pools)
+		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+		pi = kmalloc(sizeof(*pi), GFP_NOFS);
+		if (!pi)
 			goto bad;
+		pi->id = ceph_decode_32(p);
 		ev = ceph_decode_8(p); /* encoding version */
 		if (ev > CEPH_PG_POOL_VERSION) {
 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
 				   ev, CEPH_PG_POOL_VERSION);
 			goto bad;
 		}
-		ceph_decode_copy(p, &map->pg_pool[i].v,
-				 sizeof(map->pg_pool->v));
-		calc_pg_masks(&map->pg_pool[i]);
-		p += le32_to_cpu(map->pg_pool[i].v.num_snaps) * sizeof(u64);
-		p += le32_to_cpu(map->pg_pool[i].v.num_removed_snap_intervals)
+		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+		__insert_pg_pool(&map->pg_pools, pi);
+		calc_pg_masks(pi);
+		p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+		p += le32_to_cpu(pi->v.num_removed_snap_intervals)
 			* sizeof(u64) * 2;
 	}
+	ceph_decode_32_safe(p, end, map->pool_max, bad);
 
 	ceph_decode_32_safe(p, end, map->flags, bad);
 
@@ -581,7 +625,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	u32 epoch = 0;
 	struct ceph_timespec modified;
 	u32 len, pool;
-	__s32 new_flags, max;
+	__s32 new_pool_max, new_flags, max;
 	void *start = *p;
 	int err = -EINVAL;
 	u16 version;
@@ -600,6 +644,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	epoch = ceph_decode_32(p);
 	BUG_ON(epoch != map->epoch+1);
 	ceph_decode_copy(p, &modified, sizeof(modified));
+	new_pool_max = ceph_decode_32(p);
 	new_flags = ceph_decode_32(p);
 
 	/* full map? */
@@ -623,6 +668,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	/* new flags? */
 	if (new_flags >= 0)
 		map->flags = new_flags;
+	if (new_pool_max >= 0)
+		map->pool_max = new_pool_max;
 
 	ceph_decode_need(p, end, 5*sizeof(u32), bad);
 
@@ -647,37 +694,42 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	ceph_decode_32_safe(p, end, len, bad);
 	while (len--) {
 		__u8 ev;
+		struct ceph_pg_pool_info *pi;
 
 		ceph_decode_32_safe(p, end, pool, bad);
-		if (pool >= map->num_pools) {
-			void *pg_pool = kcalloc(pool + 1,
-						sizeof(*map->pg_pool),
-						GFP_NOFS);
-			if (!pg_pool) {
-				err = -ENOMEM;
-				goto bad;
-			}
-			memcpy(pg_pool, map->pg_pool,
-			       map->num_pools * sizeof(*map->pg_pool));
-			kfree(map->pg_pool);
-			map->pg_pool = pg_pool;
-			map->num_pools = pool+1;
-		}
-		ceph_decode_need(p, end, 1 + sizeof(map->pg_pool->v), bad);
+		ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
 		ev = ceph_decode_8(p);  /* encoding version */
 		if (ev > CEPH_PG_POOL_VERSION) {
 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
 				   ev, CEPH_PG_POOL_VERSION);
 			goto bad;
 		}
-		ceph_decode_copy(p, &map->pg_pool[pool].v,
-				 sizeof(map->pg_pool->v));
-		calc_pg_masks(&map->pg_pool[pool]);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (!pi) {
+			pi = kmalloc(sizeof(*pi), GFP_NOFS);
+			if (!pi) {
+				err = -ENOMEM;
+				goto bad;
+			}
+			pi->id = pool;
+			__insert_pg_pool(&map->pg_pools, pi);
+		}
+		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+		calc_pg_masks(pi);
 	}
 
-	/* old_pool (ignore) */
+	/* old_pool */
 	ceph_decode_32_safe(p, end, len, bad);
-	*p += len * sizeof(u32);
+	while (len--) {
+		struct ceph_pg_pool_info *pi;
+
+		ceph_decode_32_safe(p, end, pool, bad);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi) {
+			rb_erase(&pi->node, &map->pg_pools);
+			kfree(pi);
+		}
+	}
 
 	/* new_up */
 	err = -EINVAL;
@@ -861,10 +913,10 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
 	unsigned ps;
 
 	BUG_ON(!osdmap);
-	if (poolid >= osdmap->num_pools)
-		return -EIO;
 
-	pool = &osdmap->pg_pool[poolid];
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
+		return -EIO;
 	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
 	if (preferred >= 0) {
 		ps += preferred;
@@ -919,9 +971,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	    preferred >= osdmap->crush->max_devices)
 		preferred = -1;
 
-	if (poolid >= osdmap->num_pools)
+	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+	if (!pool)
 		return NULL;
-	pool = &osdmap->pg_pool[poolid];
 	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
 				 pool->v.type, pool->v.size);
 	if (ruleno < 0) {
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index c4af8418aa00..1fb55afb2642 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -19,6 +19,8 @@
  * the change between two successive epochs, or as a fully encoded map.
  */
 struct ceph_pg_pool_info {
+	struct rb_node node;
+	int id;
 	struct ceph_pg_pool v;
 	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
 };
@@ -44,9 +46,8 @@ struct ceph_osdmap {
 	struct ceph_entity_addr *osd_addr;
 
 	struct rb_root pg_temp;
-
-	u32 num_pools;
-	struct ceph_pg_pool_info *pg_pool;
+	struct rb_root pg_pools;
+	u32 pool_max;
 
 	/* the CRUSH map specifies the mapping of placement groups to
 	 * the list of osds that store+replicate them. */
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 1f4c78640541..26ac8b89a676 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -11,8 +11,8 @@
 /*
  * osdmap encoding versions
  */
-#define CEPH_OSDMAP_INC_VERSION 3
-#define CEPH_OSDMAP_VERSION     3
+#define CEPH_OSDMAP_INC_VERSION 4
+#define CEPH_OSDMAP_VERSION     4
 
 /*
  * fs id
-- 
cgit v1.2.2


From a17d6473cc9eb64a5b41c568310aa73824ebaa64 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 17 Feb 2010 13:56:07 -0800
Subject: ceph: v0.19 release

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/ceph_fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 004aae59d4ba..0c2241ef3653 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
  * Ceph release version
  */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 18
+#define CEPH_VERSION_MINOR 19
 #define CEPH_VERSION_PATCH 0
 
 #define _CEPH_STRINGIFY(x) #x
-- 
cgit v1.2.2


From 2c27c9a57c93a0757b9b4b0e7dc1abeaf1db1ce2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 17 Feb 2010 15:45:51 -0800
Subject: ceph: fix typo in ceph_queue_writeback debug output

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 58bdff09c2c1..d7d5d4923772 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1247,10 +1247,10 @@ void ceph_queue_writeback(struct inode *inode)
 {
 	if (queue_work(ceph_inode_to_client(inode)->wb_wq,
 		       &ceph_inode(inode)->i_wb_work)) {
-		dout("ceph_queue_invalidate %p\n", inode);
+		dout("ceph_queue_writeback %p\n", inode);
 		igrab(inode);
 	} else {
-		dout("ceph_queue_invalidate %p failed\n", inode);
+		dout("ceph_queue_writeback %p failed\n", inode);
 	}
 }
 
-- 
cgit v1.2.2


From 5ecad6fd7bfd30b3eaea51345f546b81de7a6473 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 17 Feb 2010 10:43:37 -0800
Subject: ceph: fix check for invalidate_mapping_pages success

We need to know whether there was any page left behind, and not the
return value (the total number of pages invalidated).  Look at the mapping
to see if we were successful or not.

Move it all into a helper to simplify the two callers.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 82 +++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 50 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 4958a2ef3e04..e1e6df0f549e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1361,6 +1361,41 @@ static int __mark_caps_flushing(struct inode *inode,
 	return flushing;
 }
 
+/*
+ * try to invalidate mapping pages without blocking.
+ */
+static int mapping_is_empty(struct address_space *mapping)
+{
+	struct page *page = find_get_page(mapping, 0);
+
+	if (!page)
+		return 1;
+
+	put_page(page);
+	return 0;
+}
+
+static int try_nonblocking_invalidate(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	u32 invalidating_gen = ci->i_rdcache_gen;
+
+	spin_unlock(&inode->i_lock);
+	invalidate_mapping_pages(&inode->i_data, 0, -1);
+	spin_lock(&inode->i_lock);
+
+	if (mapping_is_empty(&inode->i_data) &&
+	    invalidating_gen == ci->i_rdcache_gen) {
+		/* success. */
+		dout("try_nonblocking_invalidate %p success\n", inode);
+		ci->i_rdcache_gen = 0;
+		ci->i_rdcache_revoking = 0;
+		return 0;
+	}
+	dout("try_nonblocking_invalidate %p failed\n", inode);
+	return -1;
+}
+
 /*
  * Swiss army knife function to examine currently used and wanted
  * versus held caps.  Release, flush, ack revoked caps to mds as
@@ -1451,27 +1486,19 @@ retry_locked:
 	    (file_wanted == 0 ||                     /* no open files */
 	     (revoking & CEPH_CAP_FILE_CACHE)) &&     /*  or revoking cache */
 	    !tried_invalidate) {
-		u32 invalidating_gen = ci->i_rdcache_gen;
-		int ret;
-
 		dout("check_caps trying to invalidate on %p\n", inode);
-		spin_unlock(&inode->i_lock);
-		ret = invalidate_mapping_pages(&inode->i_data, 0, -1);
-		spin_lock(&inode->i_lock);
-		if (ret == 0 && invalidating_gen == ci->i_rdcache_gen) {
-			/* success. */
-			ci->i_rdcache_gen = 0;
-			ci->i_rdcache_revoking = 0;
-		} else if (revoking & CEPH_CAP_FILE_CACHE) {
-			dout("check_caps queuing invalidate\n");
-			queue_invalidate = 1;
-			ci->i_rdcache_revoking = ci->i_rdcache_gen;
-		} else {
-			dout("check_caps failed to invalidate pages\n");
-			/* we failed to invalidate pages.  check these
-			   caps again later. */
-			force_requeue = 1;
-			__cap_set_timeouts(mdsc, ci);
+		if (try_nonblocking_invalidate(inode) < 0) {
+			if (revoking & CEPH_CAP_FILE_CACHE) {
+				dout("check_caps queuing invalidate\n");
+				queue_invalidate = 1;
+				ci->i_rdcache_revoking = ci->i_rdcache_gen;
+			} else {
+				dout("check_caps failed to invalidate pages\n");
+				/* we failed to invalidate pages.  check these
+				   caps again later. */
+				force_requeue = 1;
+				__cap_set_timeouts(mdsc, ci);
+			}
 		}
 		tried_invalidate = 1;
 		goto retry_locked;
@@ -2184,7 +2211,6 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	int revoked_rdcache = 0;
 	int queue_invalidate = 0;
 	int tried_invalidate = 0;
-	int ret;
 
 	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
 	     inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2199,24 +2225,16 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 restart:
 	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
 	    !ci->i_wrbuffer_ref && !tried_invalidate) {
-		dout("CACHE invalidation\n");
-		spin_unlock(&inode->i_lock);
 		tried_invalidate = 1;
-
-		ret = invalidate_mapping_pages(&inode->i_data, 0, -1);
-		spin_lock(&inode->i_lock);
-		if (ret < 0) {
+		if (try_nonblocking_invalidate(inode) == 0) {
+			revoked_rdcache = 1;
+		} else {
 			/* there were locked pages.. invalidate later
 			   in a separate thread. */
 			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
 				queue_invalidate = 1;
 				ci->i_rdcache_revoking = ci->i_rdcache_gen;
 			}
-		} else {
-			/* we successfully invalidated those pages */
-			revoked_rdcache = 1;
-			ci->i_rdcache_gen = 0;
-			ci->i_rdcache_revoking = 0;
 		}
 		goto restart;
 	}
-- 
cgit v1.2.2


From e63dc5c780ba32d6d8b3662eecce2b8d96489b41 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 19 Feb 2010 00:07:01 +0000
Subject: ceph: remove page upon writeback completion if lost cache cap

This page should have been removed earlier when the cache cap was
revoked, but a writeback was in flight, so it was skipped. We truncate
it here just as the writeback finishes, while it's still locked.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 71f5ad1c1e26..25360d517d1b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -514,6 +514,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 	u64 bytes = 0;
 	struct ceph_client *client = ceph_inode_to_client(inode);
 	long writeback_stat;
+	unsigned issued = __ceph_caps_issued(ci, NULL);
 
 	/* parse reply */
 	replyhead = msg->front.iov_base;
@@ -559,6 +560,16 @@ static void writepages_finish(struct ceph_osd_request *req,
 		ceph_put_snap_context(snapc);
 		dout("unlocking %d %p\n", i, page);
 		end_page_writeback(page);
+
+		/*
+		 * We lost the cache cap, need to truncate the page before
+		 * it is unlocked, otherwise we'd truncate it later in the
+		 * page truncation thread, possibly losing some data that
+		 * raced its way in
+		 */
+		if ((issued & CEPH_CAP_FILE_CACHE) == 0)
+			generic_error_remove_page(inode->i_mapping, page);
+
 		unlock_page(page);
 	}
 	dout("%p wrote+cleaned %d pages\n", inode, wrote);
-- 
cgit v1.2.2


From c9af9fb68e01eb2c2165e1bc45cfeeed510c64e6 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 19 Feb 2010 00:10:11 +0000
Subject: ceph: don't truncate dirty pages in invalidate work thread

Instead of truncating the whole range of pages, we skip those
pages that are dirty or in the middle of writeback. Those pages
will be cleared later when the writeback completes.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index d7d5d4923772..7abe1aed819b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -10,6 +10,7 @@
 #include <linux/namei.h>
 #include <linux/writeback.h>
 #include <linux/vmalloc.h>
+#include <linux/pagevec.h>
 
 #include "super.h"
 #include "decode.h"
@@ -1279,6 +1280,49 @@ void ceph_queue_invalidate(struct inode *inode)
 	}
 }
 
+/*
+ * invalidate any pages that are not dirty or under writeback.  this
+ * includes pages that are clean and mapped.
+ */
+static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	pgoff_t next = 0;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t index;
+			int skip_page =
+				(PageDirty(page) || PageWriteback(page));
+
+			if (!skip_page)
+				skip_page = !trylock_page(page);
+
+			/*
+			 * We really shouldn't be looking at the ->index of an
+			 * unlocked page.  But we're not allowed to lock these
+			 * pages.  So we rely upon nobody altering the ->index
+			 * of this (pinned-by-us) page.
+			 */
+			index = page->index;
+			if (index > next)
+				next = index;
+			next++;
+
+			if (skip_page)
+				continue;
+
+			generic_error_remove_page(mapping, page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
 /*
  * Invalidate inode pages in a worker thread.  (This can't be done
  * in the message handler context.)
@@ -1305,7 +1349,7 @@ static void ceph_invalidate_work(struct work_struct *work)
 	orig_gen = ci->i_rdcache_gen;
 	spin_unlock(&inode->i_lock);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	ceph_invalidate_nondirty_pages(inode->i_mapping);
 
 	spin_lock(&inode->i_lock);
 	if (orig_gen == ci->i_rdcache_gen) {
-- 
cgit v1.2.2


From bcd2cbd10ce31c950a40c08d7f601f8ff23537b8 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 19 Feb 2010 00:12:21 +0000
Subject: ceph: cleanup redundant code in handle_cap_grant

There is no state in local vars that requires us to loop after temporarily
dropping i_lock.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e1e6df0f549e..289f6c65a17e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2210,7 +2210,6 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	int writeback = 0;
 	int revoked_rdcache = 0;
 	int queue_invalidate = 0;
-	int tried_invalidate = 0;
 
 	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
 	     inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2222,10 +2221,8 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	 * try to invalidate (once).  (If there are dirty buffers, we
 	 * will invalidate _after_ writeback.)
 	 */
-restart:
 	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
-	    !ci->i_wrbuffer_ref && !tried_invalidate) {
-		tried_invalidate = 1;
+	    !ci->i_wrbuffer_ref) {
 		if (try_nonblocking_invalidate(inode) == 0) {
 			revoked_rdcache = 1;
 		} else {
@@ -2236,7 +2233,6 @@ restart:
 				ci->i_rdcache_revoking = ci->i_rdcache_gen;
 			}
 		}
-		goto restart;
 	}
 
 	/* side effects now are allowed */
-- 
cgit v1.2.2


From 5b3a4db3e4009aff918abb1353eb3f4925393a7b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 19 Feb 2010 21:43:23 -0800
Subject: ceph: fix up unexpected message handling

Fix skipping of unexpected message types from osd, mon.

Clean up pr_info and debug output.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c  |  5 +++--
 fs/ceph/mon_client.c | 14 +++++++++-----
 fs/ceph/osd_client.c | 41 +++++++++++++++++++++++++++++++----------
 3 files changed, 43 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index ca2ad0e5bb28..fdda707aa137 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1361,7 +1361,7 @@ static int read_partial_message(struct ceph_connection *con)
 		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
 		if (skip) {
 			/* skip this message */
-			pr_err("alloc_msg returned NULL, skipping message\n");
+			dout("alloc_msg returned NULL, skipping message\n");
 			con->in_base_pos = -front_len - middle_len - data_len -
 				sizeof(m->footer);
 			con->in_tag = CEPH_MSGR_TAG_READY;
@@ -1370,7 +1370,8 @@ static int read_partial_message(struct ceph_connection *con)
 		if (IS_ERR(con->in_msg)) {
 			ret = PTR_ERR(con->in_msg);
 			con->in_msg = NULL;
-			con->error_msg = "error allocating memory for incoming message";
+			con->error_msg =
+				"error allocating memory for incoming message";
 			return ret;
 		}
 		m = con->in_msg;
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 40d7d90bbed1..890597c09d43 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -763,7 +763,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	struct ceph_mon_client *monc = con->private;
 	int type = le16_to_cpu(hdr->type);
 	int front_len = le32_to_cpu(hdr->front_len);
-	struct ceph_msg *m;
+	struct ceph_msg *m = NULL;
 
 	*skip = 0;
 
@@ -777,13 +777,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
 	case CEPH_MSG_AUTH_REPLY:
 		m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
 		break;
-	default:
-		return NULL;
+	case CEPH_MSG_MON_MAP:
+	case CEPH_MSG_MDS_MAP:
+	case CEPH_MSG_OSD_MAP:
+		m = ceph_msg_new(type, front_len, 0, 0, NULL);
+		break;
 	}
 
-	if (!m)
+	if (!m) {
+		pr_info("alloc_msg unknown type %d\n", type);
 		*skip = 1;
-
+	}
 	return m;
 }
 
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index fa0f73703954..ffd819c5a5dd 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1396,31 +1396,30 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 	ceph_msg_put(msg);
 }
 
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+/*
+ * lookup and return message for incoming reply
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
 				  struct ceph_msg_header *hdr,
 				  int *skip)
 {
 	struct ceph_osd *osd = con->private;
 	struct ceph_osd_client *osdc = osd->o_osdc;
-	int type = le16_to_cpu(hdr->type);
-	int front = le32_to_cpu(hdr->front_len);
-	int data_len = le32_to_cpu(hdr->data_len);
 	struct ceph_msg *m;
 	struct ceph_osd_request *req;
+	int front = le32_to_cpu(hdr->front_len);
+	int data_len = le32_to_cpu(hdr->data_len);
 	u64 tid;
 	int err;
 
-	*skip = 0;
-	if (type != CEPH_MSG_OSD_OPREPLY)
-		return NULL;
-
 	tid = le64_to_cpu(hdr->tid);
 	mutex_lock(&osdc->request_mutex);
 	req = __lookup_request(osdc, tid);
 	if (!req) {
 		*skip = 1;
 		m = NULL;
-		dout("alloc_msg unknown tid %llu\n", tid);
+		pr_info("alloc_msg unknown tid %llu from osd%d\n", tid,
+			osd->o_osd);
 		goto out;
 	}
 	m = __get_next_reply(con, req, front);
@@ -1437,11 +1436,33 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 			m = ERR_PTR(err);
 		}
 	}
+	*skip = 0;
 
 out:
 	mutex_unlock(&osdc->request_mutex);
-
 	return m;
+
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+				  struct ceph_msg_header *hdr,
+				  int *skip)
+{
+	struct ceph_osd *osd = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front = le32_to_cpu(hdr->front_len);
+
+	switch (type) {
+	case CEPH_MSG_OSD_MAP:
+		return ceph_msg_new(type, front, 0, 0, NULL);
+	case CEPH_MSG_OSD_OPREPLY:
+		return get_reply(con, hdr, skip);
+	default:
+		pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+			osd->o_osd);
+		*skip = 1;
+		return NULL;
+	}
 }
 
 /*
-- 
cgit v1.2.2


From 4ce1e9adabbad8f2c45ceeeb6de56cc368484297 Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Mon, 22 Feb 2010 17:17:44 +0300
Subject: ceph: move dereference after NULL test

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 25360d517d1b..23bb0ceabe31 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -144,7 +144,7 @@ static int ceph_set_page_dirty(struct page *page)
  */
 static void ceph_invalidatepage(struct page *page, unsigned long offset)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode;
 	struct ceph_inode_info *ci;
 	struct ceph_snap_context *snapc = (void *)page->private;
 
@@ -153,6 +153,8 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
 	BUG_ON(!PagePrivate(page));
 	BUG_ON(!page->mapping);
 
+	inode = page->mapping->host;
+
 	/*
 	 * We can get non-dirty pages here due to races between
 	 * set_page_dirty and truncate_complete_page; just spit out a
-- 
cgit v1.2.2


From a6369741c48815fedfce7072b7a9cd98b5bafd56 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 22 Feb 2010 13:59:00 -0800
Subject: ceph: fix comments, locking in destroy_inode

The destroy_inode path needs no inode locks since there are no
inode references.  Update __ceph_remove_cap comment to reflect
that it is called without cap->session->s_mutex in this case.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 289f6c65a17e..b6154ffe70df 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -856,8 +856,8 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
 }
 
 /*
- * caller should hold i_lock, and session s_mutex.
- * returns true if this is the last cap.  if so, caller should iput.
+ * caller should hold i_lock.
+ * caller will not hold session s_mutex if called from destroy_inode.
  */
 void __ceph_remove_cap(struct ceph_cap *cap)
 {
@@ -974,15 +974,14 @@ static int send_cap_msg(struct ceph_mds_session *session,
 }
 
 /*
- * Queue cap releases when an inode is dropped from our
- * cache.
+ * Queue cap releases when an inode is dropped from our cache.  Since
+ * inode is about to be destroyed, there is no need for i_lock.
  */
 void ceph_queue_caps_release(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct rb_node *p;
 
-	spin_lock(&inode->i_lock);
 	p = rb_first(&ci->i_caps);
 	while (p) {
 		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
@@ -1024,9 +1023,7 @@ void ceph_queue_caps_release(struct inode *inode)
 		spin_unlock(&session->s_cap_lock);
 		p = rb_next(p);
 		__ceph_remove_cap(cap);
-
 	}
-	spin_unlock(&inode->i_lock);
 }
 
 /*
-- 
cgit v1.2.2


From 2600d2dd5085ab6fb09540226138a60055abf335 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 22 Feb 2010 15:12:16 -0800
Subject: ceph: drop messages on unregistered mds sessions; cleanup

Verify the mds session is currently registered before handling
incoming messages.  Clean up message handlers to pull mds out
of session->s_mds instead of less trustworthy src field.

Clean up con_{get,put} debug output.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       |  2 +-
 fs/ceph/mds_client.c | 85 ++++++++++++++++++++++++++--------------------------
 fs/ceph/snap.c       | 17 ++---------
 fs/ceph/super.h      |  1 +
 4 files changed, 46 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b6154ffe70df..bb846164addc 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2600,7 +2600,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	struct inode *inode;
 	struct ceph_cap *cap;
 	struct ceph_mds_caps *h;
-	int mds = le64_to_cpu(msg->hdr.src.name.num);
+	int mds = session->s_mds;
 	int op;
 	u32 seq;
 	struct ceph_vino vino;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 124c0c17a14a..4d00ea2af000 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -309,6 +309,15 @@ static bool __have_session(struct ceph_mds_client *mdsc, int mds)
 	return mdsc->sessions[mds];
 }
 
+static int __verify_registered_session(struct ceph_mds_client *mdsc,
+				       struct ceph_mds_session *s)
+{
+	if (s->s_mds >= mdsc->max_sessions ||
+	    mdsc->sessions[s->s_mds] != s)
+		return -ENOENT;
+	return 0;
+}
+
 /*
  * create+register a new session for given mds.
  * called under mdsc->mutex.
@@ -382,10 +391,11 @@ fail_realloc:
 /*
  * called under mdsc->mutex
  */
-static void unregister_session(struct ceph_mds_client *mdsc,
+static void __unregister_session(struct ceph_mds_client *mdsc,
 			       struct ceph_mds_session *s)
 {
-	dout("unregister_session mds%d %p\n", s->s_mds, s);
+	dout("__unregister_session mds%d %p\n", s->s_mds, s);
+	BUG_ON(mdsc->sessions[s->s_mds] != s);
 	mdsc->sessions[s->s_mds] = NULL;
 	ceph_con_close(&s->s_con);
 	ceph_put_mds_session(s);
@@ -1740,10 +1750,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
 	u64 tid;
 	int err, result;
-	int mds;
+	int mds = session->s_mds;
 
-	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
-		return;
 	if (msg->front.iov_len < sizeof(*head)) {
 		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
 		ceph_msg_dump(msg);
@@ -1760,7 +1768,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		return;
 	}
 	dout("handle_reply %p\n", req);
-	mds = le64_to_cpu(msg->hdr.src.name.num);
 
 	/* correct session? */
 	if (!req->r_session && req->r_session != session) {
@@ -1884,7 +1891,9 @@ out:
 /*
  * handle mds notification that our request has been forwarded.
  */
-static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+static void handle_forward(struct ceph_mds_client *mdsc,
+			   struct ceph_mds_session *session,
+			   struct ceph_msg *msg)
 {
 	struct ceph_mds_request *req;
 	u64 tid;
@@ -1894,11 +1903,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 	int err = -EINVAL;
 	void *p = msg->front.iov_base;
 	void *end = p + msg->front.iov_len;
-	int from_mds, state;
-
-	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
-		goto bad;
-	from_mds = le64_to_cpu(msg->hdr.src.name.num);
+	int state;
 
 	ceph_decode_need(&p, end, sizeof(u64)+2*sizeof(u32), bad);
 	tid = ceph_decode_64(&p);
@@ -1915,6 +1920,9 @@ static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 		goto out;  /* dup reply? */
 	}
 
+	if (next_mds >= mdsc->max_sessions)
+		goto out;
+
 	state = mdsc->sessions[next_mds]->s_state;
 	if (fwd_seq <= req->r_num_fwd) {
 		dout("forward %llu to mds%d - old seq %d <= %d\n",
@@ -1945,14 +1953,10 @@ static void handle_session(struct ceph_mds_session *session,
 	struct ceph_mds_client *mdsc = session->s_mdsc;
 	u32 op;
 	u64 seq;
-	int mds;
+	int mds = session->s_mds;
 	struct ceph_mds_session_head *h = msg->front.iov_base;
 	int wake = 0;
 
-	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
-		return;
-	mds = le64_to_cpu(msg->hdr.src.name.num);
-
 	/* decode */
 	if (msg->front.iov_len != sizeof(*h))
 		goto bad;
@@ -1960,6 +1964,8 @@ static void handle_session(struct ceph_mds_session *session,
 	seq = le64_to_cpu(h->seq);
 
 	mutex_lock(&mdsc->mutex);
+	if (op == CEPH_SESSION_CLOSE)
+		__unregister_session(mdsc, session);
 	/* FIXME: this ttl calculation is generous */
 	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
 	mutex_unlock(&mdsc->mutex);
@@ -1990,7 +1996,6 @@ static void handle_session(struct ceph_mds_session *session,
 		break;
 
 	case CEPH_SESSION_CLOSE:
-		unregister_session(mdsc, session);
 		remove_session_caps(session);
 		wake = 1; /* for good measure */
 		complete(&mdsc->session_close_waiters);
@@ -2269,7 +2274,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
 				/* the session never opened, just close it
 				 * out now */
 				__wake_requests(mdsc, &s->s_waiting);
-				unregister_session(mdsc, s);
+				__unregister_session(mdsc, s);
 			} else {
 				/* just close it */
 				mutex_unlock(&mdsc->mutex);
@@ -2329,24 +2334,22 @@ void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
 	di->lease_session = NULL;
 }
 
-static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+static void handle_lease(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *session,
+			 struct ceph_msg *msg)
 {
 	struct super_block *sb = mdsc->client->sb;
 	struct inode *inode;
-	struct ceph_mds_session *session;
 	struct ceph_inode_info *ci;
 	struct dentry *parent, *dentry;
 	struct ceph_dentry_info *di;
-	int mds;
+	int mds = session->s_mds;
 	struct ceph_mds_lease *h = msg->front.iov_base;
 	struct ceph_vino vino;
 	int mask;
 	struct qstr dname;
 	int release = 0;
 
-	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
-		return;
-	mds = le64_to_cpu(msg->hdr.src.name.num);
 	dout("handle_lease from mds%d\n", mds);
 
 	/* decode */
@@ -2360,15 +2363,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 	if (dname.len != get_unaligned_le32(h+1))
 		goto bad;
 
-	/* find session */
-	mutex_lock(&mdsc->mutex);
-	session = __ceph_lookup_mds_session(mdsc, mds);
-	mutex_unlock(&mdsc->mutex);
-	if (!session) {
-		pr_err("handle_lease got lease but no session mds%d\n", mds);
-		return;
-	}
-
 	mutex_lock(&session->s_mutex);
 	session->s_seq++;
 
@@ -2437,7 +2431,6 @@ release:
 out:
 	iput(inode);
 	mutex_unlock(&session->s_mutex);
-	ceph_put_mds_session(session);
 	return;
 
 bad:
@@ -2794,7 +2787,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	for (i = 0; i < mdsc->max_sessions; i++) {
 		if (mdsc->sessions[i]) {
 			session = get_session(mdsc->sessions[i]);
-			unregister_session(mdsc, session);
+			__unregister_session(mdsc, session);
 			mutex_unlock(&mdsc->mutex);
 			mutex_lock(&session->s_mutex);
 			remove_session_caps(session);
@@ -2891,8 +2884,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
 	struct ceph_mds_session *s = con->private;
 
 	if (get_session(s)) {
-		dout("mdsc con_get %p %d -> %d\n", s,
-		     atomic_read(&s->s_ref) - 1, atomic_read(&s->s_ref));
+		dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
 		return con;
 	}
 	dout("mdsc con_get %p FAIL\n", s);
@@ -2903,9 +2895,8 @@ static void con_put(struct ceph_connection *con)
 {
 	struct ceph_mds_session *s = con->private;
 
-	dout("mdsc con_put %p %d -> %d\n", s, atomic_read(&s->s_ref),
-	     atomic_read(&s->s_ref) - 1);
 	ceph_put_mds_session(s);
+	dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
 }
 
 /*
@@ -2926,6 +2917,13 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 	int type = le16_to_cpu(msg->hdr.type);
 
+	mutex_lock(&mdsc->mutex);
+	if (__verify_registered_session(mdsc, s) < 0) {
+		mutex_unlock(&mdsc->mutex);
+		goto out;
+	}
+	mutex_unlock(&mdsc->mutex);
+
 	switch (type) {
 	case CEPH_MSG_MDS_MAP:
 		ceph_mdsc_handle_map(mdsc, msg);
@@ -2937,22 +2935,23 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		handle_reply(s, msg);
 		break;
 	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
-		handle_forward(mdsc, msg);
+		handle_forward(mdsc, s, msg);
 		break;
 	case CEPH_MSG_CLIENT_CAPS:
 		ceph_handle_caps(s, msg);
 		break;
 	case CEPH_MSG_CLIENT_SNAP:
-		ceph_handle_snap(mdsc, msg);
+		ceph_handle_snap(mdsc, s, msg);
 		break;
 	case CEPH_MSG_CLIENT_LEASE:
-		handle_lease(mdsc, msg);
+		handle_lease(mdsc, s, msg);
 		break;
 
 	default:
 		pr_err("received unknown message type %d %s\n", type,
 		       ceph_msg_type_name(type));
 	}
+out:
 	ceph_msg_put(msg);
 }
 
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 49d0c4c59d81..bf2a5f3846a4 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -713,11 +713,11 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
  * directory into another realm.
  */
 void ceph_handle_snap(struct ceph_mds_client *mdsc,
+		      struct ceph_mds_session *session,
 		      struct ceph_msg *msg)
 {
 	struct super_block *sb = mdsc->client->sb;
-	struct ceph_mds_session *session;
-	int mds;
+	int mds = session->s_mds;
 	u64 split;
 	int op;
 	int trace_len;
@@ -730,10 +730,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 	int i;
 	int locked_rwsem = 0;
 
-	if (msg->hdr.src.name.type != CEPH_ENTITY_TYPE_MDS)
-		return;
-	mds = le64_to_cpu(msg->hdr.src.name.num);
-
 	/* decode */
 	if (msg->front.iov_len < sizeof(*h))
 		goto bad;
@@ -749,15 +745,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 	dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
 	     ceph_snap_op_name(op), split, trace_len);
 
-	/* find session */
-	mutex_lock(&mdsc->mutex);
-	session = __ceph_lookup_mds_session(mdsc, mds);
-	mutex_unlock(&mdsc->mutex);
-	if (!session) {
-		dout("WTF, got snap but no session for mds%d\n", mds);
-		return;
-	}
-
 	mutex_lock(&session->s_mutex);
 	session->s_seq++;
 	mutex_unlock(&session->s_mutex);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 384f0e2e7c68..ff7aaa32736c 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -707,6 +707,7 @@ extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
 extern int ceph_update_snap_trace(struct ceph_mds_client *m,
 				  void *p, void *e, bool deletion);
 extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
+			     struct ceph_mds_session *session,
 			     struct ceph_msg *msg);
 extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
 				struct ceph_snap_context *snapc);
-- 
cgit v1.2.2


From a1ea787c7b6ec036d169d84e08cca7b6e399ba70 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 23 Feb 2010 14:02:44 -0800
Subject: ceph: fix client_request_forward decoding

The tid is in the message header, not body.  Broken since 6df058c0.

No need to look at next mds session; just mark the request and be done.
(The old error path was broken too, but now it's gone.)

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4d00ea2af000..bec8a7aeb300 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1896,17 +1896,15 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 			   struct ceph_msg *msg)
 {
 	struct ceph_mds_request *req;
-	u64 tid;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
 	u32 next_mds;
 	u32 fwd_seq;
 	u8 must_resend;
 	int err = -EINVAL;
 	void *p = msg->front.iov_base;
 	void *end = p + msg->front.iov_len;
-	int state;
 
-	ceph_decode_need(&p, end, sizeof(u64)+2*sizeof(u32), bad);
-	tid = ceph_decode_64(&p);
+	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
 	next_mds = ceph_decode_32(&p);
 	fwd_seq = ceph_decode_32(&p);
 	must_resend = ceph_decode_8(&p);
@@ -1920,10 +1918,6 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 		goto out;  /* dup reply? */
 	}
 
-	if (next_mds >= mdsc->max_sessions)
-		goto out;
-
-	state = mdsc->sessions[next_mds]->s_state;
 	if (fwd_seq <= req->r_num_fwd) {
 		dout("forward %llu to mds%d - old seq %d <= %d\n",
 		     tid, next_mds, req->r_num_fwd, fwd_seq);
-- 
cgit v1.2.2


From 88d892a37fc231ab2aa3b1c40ca9d67224616594 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 23 Feb 2010 18:16:23 +0000
Subject: ceph: don't clobber write return value when using O_SYNC

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 2c4ae4441cab..88932c9145e9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -807,7 +807,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
 	loff_t endoff = pos + iov->iov_len;
 	int got = 0;
-	int ret;
+	int ret, err;
 
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
@@ -838,9 +838,12 @@ retry_snap:
 
 		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
 		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
-		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)))
-			ret = vfs_fsync_range(file, file->f_path.dentry,
+		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
+			err = vfs_fsync_range(file, file->f_path.dentry,
 					      pos, pos + ret - 1, 1);
+			if (err < 0)
+				ret = err;
+		}
 	}
 	if (ret >= 0) {
 		spin_lock(&inode->i_lock);
-- 
cgit v1.2.2


From 161fd65ac934608345aed35226fc889ea3b0b500 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 25 Feb 2010 12:38:57 -0800
Subject: ceph: invalidate_authorizer without con->mutex held

This fixes lock ABBA inversion, as the ->invalidate_authorizer()
op may need to take a lock (or even call back into the
messenger).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index fdda707aa137..9ea7b763c8dc 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1853,14 +1853,6 @@ static void ceph_fault(struct ceph_connection *con)
 		con->in_msg = NULL;
 	}
 
-	/*
-	 * in case we faulted due to authentication, invalidate our
-	 * current tickets so that we can get new ones.
-         */
-	if (con->auth_retry && con->ops->invalidate_authorizer) {
-		dout("calling invalidate_authorizer()\n");
-		con->ops->invalidate_authorizer(con);
-	}
 
 	/* If there are no messages in the queue, place the connection
 	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
@@ -1890,6 +1882,15 @@ static void ceph_fault(struct ceph_connection *con)
 out_unlock:
 	mutex_unlock(&con->mutex);
 out:
+	/*
+	 * in case we faulted due to authentication, invalidate our
+	 * current tickets so that we can get new ones.
+         */
+	if (con->auth_retry && con->ops->invalidate_authorizer) {
+		dout("calling invalidate_authorizer()\n");
+		con->ops->invalidate_authorizer(con);
+	}
+
 	if (con->ops->fault)
 		con->ops->fault(con);
 }
-- 
cgit v1.2.2


From e80a52d14f868059e8ec790c9fae88cdb8a1df98 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 25 Feb 2010 12:40:45 -0800
Subject: ceph: fix connection fault STANDBY check

Move any out_sent messages to out_queue _before_ checking if
out_queue is empty and going to STANDBY, or else we may drop
something that was never acked.

And clean up the code a bit (less goto).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 9ea7b763c8dc..0ddc2c75f6b4 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1853,32 +1853,27 @@ static void ceph_fault(struct ceph_connection *con)
 		con->in_msg = NULL;
 	}
 
+	/* Requeue anything that hasn't been acked */
+	list_splice_init(&con->out_sent, &con->out_queue);
 
 	/* If there are no messages in the queue, place the connection
 	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
 	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
 		dout("fault setting STANDBY\n");
 		set_bit(STANDBY, &con->state);
-		mutex_unlock(&con->mutex);
-		goto out;
+	} else {
+		/* retry after a delay. */
+		if (con->delay == 0)
+			con->delay = BASE_DELAY_INTERVAL;
+		else if (con->delay < MAX_DELAY_INTERVAL)
+			con->delay *= 2;
+		dout("fault queueing %p delay %lu\n", con, con->delay);
+		con->ops->get(con);
+		if (queue_delayed_work(ceph_msgr_wq, &con->work,
+				       round_jiffies_relative(con->delay)) == 0)
+			con->ops->put(con);
 	}
 
-	/* Requeue anything that hasn't been acked, and retry after a
-	 * delay. */
-	list_splice_init(&con->out_sent, &con->out_queue);
-
-	if (con->delay == 0)
-		con->delay = BASE_DELAY_INTERVAL;
-	else if (con->delay < MAX_DELAY_INTERVAL)
-		con->delay *= 2;
-
-	/* explicitly schedule work to try to reconnect again later. */
-	dout("fault queueing %p delay %lu\n", con, con->delay);
-	con->ops->get(con);
-	if (queue_delayed_work(ceph_msgr_wq, &con->work,
-			       round_jiffies_relative(con->delay)) == 0)
-		con->ops->put(con);
-
 out_unlock:
 	mutex_unlock(&con->mutex);
 out:
-- 
cgit v1.2.2


From c99eb1c7263a44e63161a041a778b345b5cf0b6a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 26 Feb 2010 09:37:33 -0800
Subject: ceph: remove fragile __map_osds optimization

We used to try to avoid freeing and then reallocating the osd
struct.  This is a bit fragile due to potential interactions with
other references (beyond o_requests), and may be the cause of
this crash:

[120633.442358] BUG: unable to handle kernel NULL pointer dereference at (null)
[120633.443292] IP: [<ffffffff812549b6>] rb_erase+0x11d/0x277
[120633.443292] PGD f7ff3067 PUD f7f53067 PMD 0
[120633.443292] Oops: 0000 [#1] PREEMPT SMP
[120633.443292] last sysfs file: /sys/kernel/uevent_seqnum
[120633.443292] CPU 1
[120633.443292] Modules linked in: ceph fan ac battery psmouse ehci_hcd ide_pci_generic ohci_hcd thermal processor button
[120633.443292] Pid: 3023, comm: ceph-msgr/1 Not tainted 2.6.32-rc2 #12 H8SSL
[120633.443292] RIP: 0010:[<ffffffff812549b6>]  [<ffffffff812549b6>] rb_erase+0x11d/0x277
[120633.443292] RSP: 0018:ffff8800f7b13a50  EFLAGS: 00010246
[120633.443292] RAX: ffff880022907819 RBX: ffff880022907818 RCX: 0000000000000000
[120633.443292] RDX: ffff8800f7b13a80 RSI: ffff8800f587eb48 RDI: 0000000000000000
[120633.443292] RBP: ffff8800f7b13a60 R08: 0000000000000000 R09: 0000000000000004
[120633.443292] R10: 0000000000000000 R11: ffff8800c4441000 R12: ffff8800f587eb48
[120633.443292] R13: ffff8800f58eaa00 R14: ffff8800f413c000 R15: 0000000000000001
[120633.443292] FS:  00007fbef6e226e0(0000) GS:ffff880009200000(0000) knlGS:0000000000000000
[120633.443292] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[120633.443292] CR2: 0000000000000000 CR3: 00000000f7c53000 CR4: 00000000000006e0
[120633.443292] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[120633.443292] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[120633.443292] Process ceph-msgr/1 (pid: 3023, threadinfo ffff8800f7b12000, task ffff8800f5858b40)
[120633.443292] Stack:
[120633.443292]  ffff8800f413c000 ffff8800f587e9c0 ffff8800f7b13a80 ffffffffa0098a86
[120633.443292] <0> 00000000000006f1 0000000000000000 ffff8800f7b13af0 ffffffffa009959b
[120633.443292] <0> ffff8800f413c000 ffff880022a68400 ffff880022a68400 ffff8800f587e9c0
[120633.443292] Call Trace:
[120633.443292]  [<ffffffffa0098a86>] __remove_osd+0x4d/0xbc [ceph]
[120633.443292]  [<ffffffffa009959b>] __map_osds+0x199/0x4fa [ceph]
[120633.443292]  [<ffffffffa00999f4>] ? __send_request+0xf8/0x186 [ceph]
[120633.443292]  [<ffffffffa0099beb>] kick_requests+0x169/0x3cb [ceph]
[120633.443292]  [<ffffffffa009a8c1>] ceph_osdc_handle_map+0x370/0x522 [ceph]

Since we're probably screwed anyway if a small kmalloc is
failing, don't bother with trying to be clever here.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index ffd819c5a5dd..3a631f27cc9e 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -615,7 +615,6 @@ static int __map_osds(struct ceph_osd_client *osdc,
 	struct ceph_pg pgid;
 	int o = -1;
 	int err;
-	struct ceph_osd *newosd = NULL;
 
 	dout("map_osds %p tid %lld\n", req, req->r_tid);
 	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
@@ -639,25 +638,15 @@ static int __map_osds(struct ceph_osd_client *osdc,
 	if (req->r_osd) {
 		__cancel_request(req);
 		list_del_init(&req->r_osd_item);
-		if (list_empty(&req->r_osd->o_requests)) {
-			/* try to re-use r_osd if possible */
-			newosd = get_osd(req->r_osd);
-			__remove_osd(osdc, newosd);
-		}
 		req->r_osd = NULL;
 	}
 
 	req->r_osd = __lookup_osd(osdc, o);
 	if (!req->r_osd && o >= 0) {
-		if (newosd) {
-			req->r_osd = newosd;
-			newosd = NULL;
-		} else {
-			err = -ENOMEM;
-			req->r_osd = create_osd(osdc);
-			if (!req->r_osd)
-				goto out;
-		}
+		err = -ENOMEM;
+		req->r_osd = create_osd(osdc);
+		if (!req->r_osd)
+			goto out;
 
 		dout("map_osds osd %p is osd%d\n", req->r_osd, o);
 		req->r_osd->o_osd = o;
@@ -674,8 +663,6 @@ static int __map_osds(struct ceph_osd_client *osdc,
 	err = 1;   /* osd changed */
 
 out:
-	if (newosd)
-		put_osd(newosd);
 	return err;
 }
 
-- 
cgit v1.2.2


From 080af17e9c6360c5a835528e8de3141a46273ed2 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 25 Feb 2010 16:40:07 -0800
Subject: ceph: remove bogus mds forward warning

The must_resend flag is always true, not false.  In any case, we can
just ignore it anyway.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bec8a7aeb300..a2600101ec22 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1899,7 +1899,6 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 	u64 tid = le64_to_cpu(msg->hdr.tid);
 	u32 next_mds;
 	u32 fwd_seq;
-	u8 must_resend;
 	int err = -EINVAL;
 	void *p = msg->front.iov_base;
 	void *end = p + msg->front.iov_len;
@@ -1907,14 +1906,11 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
 	next_mds = ceph_decode_32(&p);
 	fwd_seq = ceph_decode_32(&p);
-	must_resend = ceph_decode_8(&p);
-
-	WARN_ON(must_resend);  /* shouldn't happen. */
 
 	mutex_lock(&mdsc->mutex);
 	req = __lookup_request(mdsc, tid);
 	if (!req) {
-		dout("forward %llu dne\n", tid);
+		dout("forward %llu to mds%d - req dne\n", tid, next_mds);
 		goto out;  /* dup reply? */
 	}
 
-- 
cgit v1.2.2


From 1679f876a641d209e7b22e43ebda0693c71003cf Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 26 Feb 2010 13:55:51 -0800
Subject: ceph: reset bits on connection close

Clear LOSSYTX bit, so that if/when we reconnect, said reconnect
will retry on failure.

Clear _PENDING bits too, to avoid polluting subsequent
connection state.

Drop unused REGISTERED bit.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 3 +++
 fs/ceph/messenger.h | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 0ddc2c75f6b4..bf4590c77cf6 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -342,6 +342,9 @@ void ceph_con_close(struct ceph_connection *con)
 	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
 	set_bit(CLOSED, &con->state);  /* in case there's queued work */
 	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
+	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
+	clear_bit(KEEPALIVE_PENDING, &con->state);
+	clear_bit(WRITE_PENDING, &con->state);
 	mutex_lock(&con->mutex);
 	reset_connection(con);
 	cancel_delayed_work(&con->work);
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index c9735378be3f..4caaa5911110 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -119,7 +119,6 @@ struct ceph_msg_pos {
 			    * state with the peer. */
 #define CLOSED		10 /* we've closed the connection */
 #define SOCK_CLOSED	11 /* socket state changed to closed */
-#define REGISTERED      12 /* connection appears in con_tree */
 #define OPENING         13 /* open connection w/ (possibly new) peer */
 #define DEAD            14 /* dead, about to kfree */
 
-- 
cgit v1.2.2


From c16e786927b977cb880873214bbd815e8d5ec4ba Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 1 Mar 2010 13:02:00 -0800
Subject: ceph: use single osd op reply msg

Use a single ceph_msg for the osd reply, even when we are getting multiple
replies.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 138 +++++++++++++++++----------------------------------
 fs/ceph/osd_client.h |   5 +-
 2 files changed, 46 insertions(+), 97 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3a631f27cc9e..ffe1f4064ccd 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -13,7 +13,8 @@
 #include "decode.h"
 #include "auth.h"
 
-#define OSD_REPLY_RESERVE_FRONT_LEN	512
+#define OSD_OP_FRONT_LEN	4096
+#define OSD_OPREPLY_FRONT_LEN	512
 
 const static struct ceph_connection_operations osd_con_ops;
 
@@ -75,17 +76,6 @@ static void calc_layout(struct ceph_osd_client *osdc,
 	     req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
 }
 
-static void remove_replies(struct ceph_osd_request *req)
-{
-	int i;
-	int max = ARRAY_SIZE(req->replies);
-
-	for (i=0; i<max; i++) {
-		if (req->replies[i])
-			ceph_msg_put(req->replies[i]);
-	}
-}
-
 /*
  * requests
  */
@@ -99,7 +89,6 @@ void ceph_osdc_release_request(struct kref *kref)
 		ceph_msg_put(req->r_request);
 	if (req->r_reply)
 		ceph_msg_put(req->r_reply);
-	remove_replies(req);
 	if (req->r_con_filling_msg) {
 		dout("release_request revoking pages %p from con %p\n",
 		     req->r_pages, req->r_con_filling_msg);
@@ -117,60 +106,6 @@ void ceph_osdc_release_request(struct kref *kref)
 		kfree(req);
 }
 
-static int alloc_replies(struct ceph_osd_request *req, int num_reply)
-{
-	int i;
-	int max = ARRAY_SIZE(req->replies);
-
-	BUG_ON(num_reply > max);
-
-	for (i=0; i<num_reply; i++) {
-		req->replies[i] = ceph_msg_new(0, OSD_REPLY_RESERVE_FRONT_LEN, 0, 0, NULL);
-		if (IS_ERR(req->replies[i])) {
-			int j;
-			int err = PTR_ERR(req->replies[i]);
-			for (j = 0; j<=i; j++) {
-				ceph_msg_put(req->replies[j]);
-			}
-			return err;
-		}
-	}
-
-	for (; i<max; i++) {
-		req->replies[i] = NULL;
-	}
-
-	req->cur_reply = 0;
-
-	return 0;
-}
-
-static struct ceph_msg *__get_next_reply(struct ceph_connection *con,
-				       struct ceph_osd_request *req,
-				       int front_len)
-{
-	struct ceph_msg *reply;
-	if (req->r_con_filling_msg) {
-		dout("revoking reply msg %p from old con %p\n", req->r_reply,
-		     req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
-		ceph_con_put(req->r_con_filling_msg);
-		req->cur_reply = 0;
-	}
-	reply = req->replies[req->cur_reply];
-	if (!reply || front_len > OSD_REPLY_RESERVE_FRONT_LEN) {
-		/* maybe we can allocate it now? */
-		reply = ceph_msg_new(0, front_len, 0, 0, NULL);
-		if (!reply || IS_ERR(reply)) {
-			pr_err(" reply alloc failed, front_len=%d\n", front_len);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
-	req->r_con_filling_msg = ceph_con_get(con);
-	req->r_reply = ceph_msg_get(reply); /* for duration of read over socket */
-	return ceph_msg_get(reply);
-}
-
 /*
  * build new request AND message, calculate layout, and adjust file
  * extent as needed.
@@ -201,7 +136,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	void *p;
 	int num_op = 1 + do_sync;
 	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
-	int err, i;
+	int i;
 
 	if (use_mempool) {
 		req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
@@ -212,13 +147,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (req == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	err = alloc_replies(req, num_reply);
-	if (err) {
-		ceph_osdc_put_request(req);
-		return ERR_PTR(-ENOMEM);
-	}
-	req->r_num_prealloc_reply = num_reply;
-
 	req->r_osdc = osdc;
 	req->r_mempool = use_mempool;
 	kref_init(&req->r_kref);
@@ -229,7 +157,19 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 
 	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
 
-	/* create message; allow space for oid */
+	/* create reply message */
+	if (use_mempool)
+		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+	else
+		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+				   OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
+	if (IS_ERR(msg)) {
+		ceph_osdc_put_request(req);
+		return ERR_PTR(PTR_ERR(msg));
+	}
+	req->r_reply = msg;
+
+	/* create request message; allow space for oid */
 	msg_size += 40;
 	if (snapc)
 		msg_size += sizeof(u64) * snapc->num_snaps;
@@ -819,21 +759,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	 * avoid a (safe but slower) revoke later.
 	 */
 	if (req->r_con_filling_msg == con && req->r_reply == msg) {
-		dout(" got pages, dropping con_filling_msg ref %p\n", con);
+		dout(" dropping con_filling_msg ref %p\n", con);
 		req->r_con_filling_msg = NULL;
 		ceph_con_put(con);
 	}
 
-	if (req->r_reply) {
-		/*
-		 * once we see the message has been received, we don't
-		 * need a ref (which is only needed for revoking
-		 * pages)
-		 */
-		ceph_msg_put(req->r_reply);
-		req->r_reply = NULL;
-	}
-
 	if (!req->r_got_reply) {
 		unsigned bytes;
 
@@ -1249,11 +1179,17 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	if (!osdc->req_mempool)
 		goto out;
 
-	err = ceph_msgpool_init(&osdc->msgpool_op, 4096, 10, true);
+	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
 	if (err < 0)
 		goto out_mempool;
+	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+				OSD_OPREPLY_FRONT_LEN, 10, true);
+	if (err < 0)
+		goto out_msgpool;
 	return 0;
 
+out_msgpool:
+	ceph_msgpool_destroy(&osdc->msgpool_op);
 out_mempool:
 	mempool_destroy(osdc->req_mempool);
 out:
@@ -1271,6 +1207,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
 	remove_old_osds(osdc, 1);
 	mempool_destroy(osdc->req_mempool);
 	ceph_msgpool_destroy(&osdc->msgpool_op);
+	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
 }
 
 /*
@@ -1405,16 +1342,29 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	if (!req) {
 		*skip = 1;
 		m = NULL;
-		pr_info("alloc_msg unknown tid %llu from osd%d\n", tid,
+		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
 			osd->o_osd);
 		goto out;
 	}
-	m = __get_next_reply(con, req, front);
-	if (!m || IS_ERR(m)) {
-		*skip = 1;
-		goto out;
+
+	if (req->r_con_filling_msg) {
+		dout("get_reply revoking msg %p from old con %p\n",
+		     req->r_reply, req->r_con_filling_msg);
+		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+		ceph_con_put(req->r_con_filling_msg);
 	}
 
+	if (front > req->r_reply->front.iov_len) {
+		pr_warning("get_reply front %d > preallocated %d\n",
+			   front, (int)req->r_reply->front.iov_len);
+		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
+		if (IS_ERR(m))
+			goto out;
+		ceph_msg_put(req->r_reply);
+		req->r_reply = m;
+	}
+	m = ceph_msg_get(req->r_reply);
+
 	if (data_len > 0) {
 		err = __prepare_pages(con, hdr, req, tid, m);
 		if (err < 0) {
@@ -1424,6 +1374,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 		}
 	}
 	*skip = 0;
+	req->r_con_filling_msg = ceph_con_get(con);
+	dout("get_reply tid %lld %p\n", tid, m);
 
 out:
 	mutex_unlock(&osdc->request_mutex);
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 70f31b61f02c..f256eba6fe7a 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -53,7 +53,6 @@ struct ceph_osd_request {
 	int               r_flags;     /* any additional flags for the osd */
 	u32               r_sent;      /* >0 if r_request is sending/sent */
 	int               r_got_reply;
-	int		  r_num_prealloc_reply;
 
 	struct ceph_osd_client *r_osdc;
 	struct kref       r_kref;
@@ -77,9 +76,6 @@ struct ceph_osd_request {
 	struct page     **r_pages;            /* pages for data payload */
 	int               r_pages_from_pool;
 	int               r_own_pages;        /* if true, i own page list */
-
-	struct ceph_msg   *replies[2];
-	int		  cur_reply;
 };
 
 struct ceph_osd_client {
@@ -106,6 +102,7 @@ struct ceph_osd_client {
 	mempool_t              *req_mempool;
 
 	struct ceph_msgpool	msgpool_op;
+	struct ceph_msgpool	msgpool_op_reply;
 };
 
 extern int ceph_osdc_init(struct ceph_osd_client *osdc,
-- 
cgit v1.2.2


From 70edb55bdfa8922c8ad40bc5a67abb6d9fee8d47 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 1 Mar 2010 13:20:50 -0800
Subject: ceph: fix snaptrace decoding on cap migration between mds

This was simply broken.  Apparently at some point we thought about putting
the snaptrace in the middle section, but didn't.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index bb846164addc..9afa8d37a6e3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2608,6 +2608,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u64 size, max_size;
 	u64 tid;
 	int check_caps = 0;
+	void *snaptrace;
 	int r;
 
 	dout("handle_caps from mds%d\n", mds);
@@ -2617,6 +2618,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	if (msg->front.iov_len < sizeof(*h))
 		goto bad;
 	h = msg->front.iov_base;
+	snaptrace = h + 1;
 	op = le32_to_cpu(h->op);
 	vino.ino = le64_to_cpu(h->ino);
 	vino.snap = CEPH_NOSNAP;
@@ -2651,8 +2653,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
-				  msg->middle,
-				  le32_to_cpu(h->snap_trace_len));
+				  snaptrace, le32_to_cpu(h->snap_trace_len));
 		check_caps = 1; /* we may have sent a RELEASE to the old auth */
 		goto done;
 	}
-- 
cgit v1.2.2


From 3ca02ef96e119d36bc1752baeae7dd0c59c2f325 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 1 Mar 2010 15:25:00 -0800
Subject: ceph: reset front len on return to msgpool; BUG on mismatched front
 iov

Reset msg front len when a message is returned to the pool: the caller
may have changed it.

BUG if we try to send a message with a hdr.front_len that doesn't match
the front iov.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 2 ++
 fs/ceph/msgpool.c   | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index bf4590c77cf6..781656a49bf8 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1954,6 +1954,8 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 	msg->hdr.src.addr = con->msgr->my_enc_addr;
 	msg->hdr.orig_src = msg->hdr.src;
 
+	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
 	/* queue */
 	mutex_lock(&con->mutex);
 	BUG_ON(!list_empty(&msg->list_head));
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index 2f04e0fc4666..ca3b44a89f2d 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -166,6 +166,10 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
 {
 	spin_lock(&pool->lock);
 	if (pool->num < pool->min) {
+		/* reset msg front_len; user may have changed it */
+		msg->front.iov_len = pool->front_len;
+		msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
 		kref_set(&msg->kref, 1);  /* retake a single ref */
 		list_add(&msg->list_head, &pool->msgs);
 		pool->num++;
-- 
cgit v1.2.2


From 6f863e712d4114e8ae2f02de64ebeac0546ebaa0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 1 Mar 2010 15:26:41 -0800
Subject: ceph: set osd request message front length correctly

We didn't set the front length correctly.  When messages used
the message pool we ended up with the conservative max (4 KB), and
the rest of the time the slightly less conservative estimate.  Even
though the OSD ignores the extra data, set it to the right value to avoid
sending extra data over the network.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index ffe1f4064ccd..c4763bff97b4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -228,6 +228,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	}
 
 	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+	msg_size = p - msg->front.iov_base;
+	msg->front.iov_len = msg_size;
+	msg->hdr.front_len = cpu_to_le32(msg_size);
 	return req;
 }
 
-- 
cgit v1.2.2


From 195d3ce2cc9a8ec69827f6369c41b269345b9988 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 1 Mar 2010 09:57:54 -0800
Subject: ceph: return EBADF if waiting for caps on closed file

Verify the file is actually open for the given caps when we are
waiting for caps.  This ensures we will wake up and return EBADF
if another thread closes the file out from under us.

Note that EBADF is also the correct return code from write(2)
when called on a file handle opened for reading (although the
vfs should catch that).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 9 ++++++---
 fs/ceph/file.c | 3 +++
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 9afa8d37a6e3..06f197983be6 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1923,14 +1923,17 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
 	struct inode *inode = &ci->vfs_inode;
 	int ret = 0;
 	int have, implemented;
+	int file_wanted;
 
 	dout("get_cap_refs %p need %s want %s\n", inode,
 	     ceph_cap_string(need), ceph_cap_string(want));
 	spin_lock(&inode->i_lock);
 
-	/* make sure we _have_ some caps! */
-	if (!__ceph_is_any_caps(ci)) {
-		dout("get_cap_refs %p no real caps\n", inode);
+	/* make sure file is actually open */
+	file_wanted = __ceph_caps_file_wanted(ci);
+	if ((file_wanted & need) == 0) {
+		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
+		     ceph_cap_string(need), ceph_cap_string(file_wanted));
 		*err = -EBADF;
 		ret = 1;
 		goto out;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 88932c9145e9..5d2af8464f6a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -262,6 +262,9 @@ int ceph_release(struct inode *inode, struct file *file)
 	kfree(cf->dir_info);
 	dput(cf->dentry);
 	kmem_cache_free(ceph_file_cachep, cf);
+
+	/* wake up anyone waiting for caps on this inode */
+	wake_up(&ci->i_cap_wq);
 	return 0;
 }
 
-- 
cgit v1.2.2


From e53a8fd773065628b24605b289a9a40ee4a35d83 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 1 Mar 2010 14:50:05 -0800
Subject: ceph: fix osdmap decoding when pools include (removed) snaps

Add missing pointer dereference (p is a void **).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 34b5696c84fd..b83f2692b835 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -529,8 +529,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
 		__insert_pg_pool(&map->pg_pools, pi);
 		calc_pg_masks(pi);
-		p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
-		p += le32_to_cpu(pi->v.num_removed_snap_intervals)
+		*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+		*p += le32_to_cpu(pi->v.num_removed_snap_intervals)
 			* sizeof(u64) * 2;
 	}
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
-- 
cgit v1.2.2


From 7af8f1e4aa86720840d3318e4dc225c3c7e5a6d0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 1 Mar 2010 15:17:34 -0800
Subject: ceph: include migrating caps in issued set

We should include caps that are mid-migration (we've received the EXPORT,
but not the IMPORT) in the issued caps set.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 06f197983be6..295b7e547a31 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -655,7 +655,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
  */
 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
 {
-	int have = ci->i_snap_caps;
+	int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
 	struct ceph_cap *cap;
 	struct rb_node *p;
 
-- 
cgit v1.2.2


From e9964c102312967a4bc1fd501cb628c4a3b19034 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 1 Mar 2010 15:16:56 -0800
Subject: ceph: fix flush_dirty_caps race with caps migration

The flush_dirty_caps() used to loop over the first entry of the cap_dirty
dirty list on the assumption that after calling ceph_check_caps() it would
be removed from the list.  This isn't true for caps that are being
migrated between MDSs, where we've received the EXPORT but not the IMPORT.

Instead, do a safe list iteration, and pin the next inode on the list via
the CEPH_I_NOFLUSH flag.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c  | 45 ++++++++++++++++++++++++++++++++++++++-------
 fs/ceph/super.h |  1 +
 2 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 295b7e547a31..8b89b9123252 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1573,6 +1573,11 @@ retry_locked:
 		}
 
 ack:
+		if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+			dout(" skipping %p I_NOFLUSH set\n", inode);
+			continue;
+		}
+
 		if (session && session != cap->session) {
 			dout("oops, wrong session %p mutex\n", session);
 			mutex_unlock(&session->s_mutex);
@@ -1652,6 +1657,10 @@ static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
 
 retry:
 	spin_lock(&inode->i_lock);
+	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
+		goto out;
+	}
 	if (ci->i_dirty_caps && ci->i_auth_cap) {
 		struct ceph_cap *cap = ci->i_auth_cap;
 		int used = __ceph_caps_used(ci);
@@ -2747,16 +2756,38 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
  */
 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 {
-	struct ceph_inode_info *ci;
-	struct inode *inode;
+	struct ceph_inode_info *ci, *nci = NULL;
+	struct inode *inode, *ninode = NULL;
+	struct list_head *p, *n;
 
 	dout("flush_dirty_caps\n");
 	spin_lock(&mdsc->cap_dirty_lock);
-	while (!list_empty(&mdsc->cap_dirty)) {
-		ci = list_first_entry(&mdsc->cap_dirty,
-				      struct ceph_inode_info,
-				      i_dirty_item);
-		inode = igrab(&ci->vfs_inode);
+	list_for_each_safe(p, n, &mdsc->cap_dirty) {
+		if (nci) {
+			ci = nci;
+			inode = ninode;
+			ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
+			dout("flush_dirty_caps inode %p (was next inode)\n",
+			     inode);
+		} else {
+			ci = list_entry(p, struct ceph_inode_info,
+					i_dirty_item);
+			inode = igrab(&ci->vfs_inode);
+			BUG_ON(!inode);
+			dout("flush_dirty_caps inode %p\n", inode);
+		}
+		if (n != &mdsc->cap_dirty) {
+			nci = list_entry(n, struct ceph_inode_info,
+					 i_dirty_item);
+			ninode = igrab(&nci->vfs_inode);
+			BUG_ON(!ninode);
+			nci->i_ceph_flags |= CEPH_I_NOFLUSH;
+			dout("flush_dirty_caps next inode %p, noflush\n",
+			     ninode);
+		} else {
+			nci = NULL;
+			ninode = NULL;
+		}
 		spin_unlock(&mdsc->cap_dirty_lock);
 		if (inode) {
 			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ff7aaa32736c..6a778f2c3f6e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -289,6 +289,7 @@ struct ceph_inode_xattrs_info {
 #define CEPH_I_COMPLETE  1  /* we have complete directory cached */
 #define CEPH_I_NODELAY   4  /* do not delay cap release */
 #define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */
 
 struct ceph_inode_info {
 	struct ceph_vino i_vino;   /* ceph ino + snap */
-- 
cgit v1.2.2


From 422d2cb8f9afadba1ecd3614f658b6daaaa480fb Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 26 Feb 2010 15:32:31 -0800
Subject: ceph: reset osd after relevant messages timed out

This simplifies the process of timing out messages. We
keep lru of current messages that are in flight. If a
timeout has passed, we reset the osd connection, so that
messages will be retransmitted.  This is a failsafe in case
we hit some sort of problem sending out message to the OSD.
Normally, we'll get notification via an updated osdmap if
there are problems.

If a request is older than the keepalive timeout, send a
keepalive to ensure we detect any breaks in the TCP connection.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 153 +++++++++++++++++++++++++++++++++------------------
 fs/ceph/osd_client.h |   6 +-
 fs/ceph/super.c      |   8 ++-
 fs/ceph/super.h      |   3 +
 4 files changed, 113 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index c4763bff97b4..dbe63db9762f 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -17,6 +17,8 @@
 #define OSD_OPREPLY_FRONT_LEN	512
 
 const static struct ceph_connection_operations osd_con_ops;
+static int __kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd);
 
 static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
 
@@ -339,6 +341,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
 	osd->o_con.ops = &osd_con_ops;
 	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
 
+	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	return osd;
 }
 
@@ -461,6 +464,16 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
 	return NULL;
 }
 
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+	schedule_delayed_work(&osdc->timeout_work,
+			osdc->client->mount_args->osd_keepalive_timeout * HZ);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+	cancel_delayed_work(&osdc->timeout_work);
+}
 
 /*
  * Register request, assign tid.  If this is the first request, set up
@@ -472,21 +485,16 @@ static void register_request(struct ceph_osd_client *osdc,
 	mutex_lock(&osdc->request_mutex);
 	req->r_tid = ++osdc->last_tid;
 	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+	INIT_LIST_HEAD(&req->r_req_lru_item);
 
 	dout("register_request %p tid %lld\n", req, req->r_tid);
 	__insert_request(osdc, req);
 	ceph_osdc_get_request(req);
 	osdc->num_requests++;
 
-	req->r_timeout_stamp =
-		jiffies + osdc->client->mount_args->osd_timeout*HZ;
-
 	if (osdc->num_requests == 1) {
-		osdc->timeout_tid = req->r_tid;
-		dout("  timeout on tid %llu at %lu\n", req->r_tid,
-		     req->r_timeout_stamp);
-		schedule_delayed_work(&osdc->timeout_work,
-		      round_jiffies_relative(req->r_timeout_stamp - jiffies));
+		dout(" first request, scheduling timeout\n");
+		__schedule_osd_timeout(osdc);
 	}
 	mutex_unlock(&osdc->request_mutex);
 }
@@ -513,21 +521,10 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 
 	ceph_osdc_put_request(req);
 
-	if (req->r_tid == osdc->timeout_tid) {
-		if (osdc->num_requests == 0) {
-			dout("no requests, canceling timeout\n");
-			osdc->timeout_tid = 0;
-			cancel_delayed_work(&osdc->timeout_work);
-		} else {
-			req = rb_entry(rb_first(&osdc->requests),
-				       struct ceph_osd_request, r_node);
-			osdc->timeout_tid = req->r_tid;
-			dout("rescheduled timeout on tid %llu at %lu\n",
-			     req->r_tid, req->r_timeout_stamp);
-			schedule_delayed_work(&osdc->timeout_work,
-			      round_jiffies_relative(req->r_timeout_stamp -
-						     jiffies));
-		}
+	list_del_init(&req->r_req_lru_item);
+	if (osdc->num_requests == 0) {
+		dout(" no requests, canceling timeout\n");
+		__cancel_osd_timeout(osdc);
 	}
 }
 
@@ -540,6 +537,7 @@ static void __cancel_request(struct ceph_osd_request *req)
 		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
 		req->r_sent = 0;
 	}
+	list_del_init(&req->r_req_lru_item);
 }
 
 /*
@@ -635,7 +633,8 @@ static int __send_request(struct ceph_osd_client *osdc,
 	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
 	reqhead->reassert_version = req->r_reassert_version;
 
-	req->r_timeout_stamp = jiffies+osdc->client->mount_args->osd_timeout*HZ;
+	req->r_sent_stamp = jiffies;
+	list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
 
 	ceph_msg_get(req->r_request); /* send consumes a ref */
 	ceph_con_send(&req->r_osd->o_con, req->r_request);
@@ -656,11 +655,14 @@ static void handle_timeout(struct work_struct *work)
 {
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client, timeout_work.work);
-	struct ceph_osd_request *req;
+	struct ceph_osd_request *req, *last_req = NULL;
 	struct ceph_osd *osd;
 	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
-	unsigned long next_timeout = timeout + jiffies;
+	unsigned long keepalive =
+		osdc->client->mount_args->osd_keepalive_timeout * HZ;
+	unsigned long last_sent = 0;
 	struct rb_node *p;
+	struct list_head slow_osds;
 
 	dout("timeout\n");
 	down_read(&osdc->map_sem);
@@ -683,25 +685,56 @@ static void handle_timeout(struct work_struct *work)
 			continue;
 		}
 	}
-	for (p = rb_first(&osdc->osds); p; p = rb_next(p)) {
-		osd = rb_entry(p, struct ceph_osd, o_node);
-		if (list_empty(&osd->o_requests))
-			continue;
-		req = list_first_entry(&osd->o_requests,
-				       struct ceph_osd_request, r_osd_item);
-		if (time_before(jiffies, req->r_timeout_stamp))
-			continue;
 
-		dout(" tid %llu (at least) timed out on osd%d\n",
+	/*
+	 * reset osds that appear to be _really_ unresponsive.  this
+	 * is a failsafe measure.. we really shouldn't be getting to
+	 * this point if the system is working properly.  the monitors
+	 * should mark the osd as failed and we should find out about
+	 * it from an updated osd map.
+	 */
+	while (!list_empty(&osdc->req_lru)) {
+		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+				 r_req_lru_item);
+
+		if (time_before(jiffies, req->r_sent_stamp + timeout))
+			break;
+
+		BUG_ON(req == last_req && req->r_sent_stamp == last_sent);
+		last_req = req;
+		last_sent = req->r_sent_stamp;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+			   req->r_tid, osd->o_osd);
+		__kick_requests(osdc, osd);
+	}
+
+	/*
+	 * ping osds that are a bit slow.  this ensures that if there
+	 * is a break in the TCP connection we will notice, and reopen
+	 * a connection with that osd (from the fault callback).
+	 */
+	INIT_LIST_HEAD(&slow_osds);
+	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+		if (time_before(jiffies, req->r_sent_stamp + keepalive))
+			break;
+
+		osd = req->r_osd;
+		BUG_ON(!osd);
+		dout(" tid %llu is slow, will send keepalive on osd%d\n",
 		     req->r_tid, osd->o_osd);
-		req->r_timeout_stamp = next_timeout;
+		list_move_tail(&osd->o_keepalive_item, &slow_osds);
+	}
+	while (!list_empty(&slow_osds)) {
+		osd = list_entry(slow_osds.next, struct ceph_osd,
+				 o_keepalive_item);
+		list_del_init(&osd->o_keepalive_item);
 		ceph_con_keepalive(&osd->o_con);
 	}
 
-	if (osdc->timeout_tid)
-		schedule_delayed_work(&osdc->timeout_work,
-				      round_jiffies_relative(timeout));
-
+	__schedule_osd_timeout(osdc);
 	mutex_unlock(&osdc->request_mutex);
 
 	up_read(&osdc->map_sem);
@@ -819,18 +852,7 @@ bad:
 }
 
 
-/*
- * Resubmit osd requests whose osd or osd address has changed.  Request
- * a new osd map if osds are down, or we are otherwise unable to determine
- * how to direct a request.
- *
- * Close connections to down osds.
- *
- * If @who is specified, resubmit requests for that specific osd.
- *
- * Caller should hold map_sem for read and request_mutex.
- */
-static void kick_requests(struct ceph_osd_client *osdc,
+static int __kick_requests(struct ceph_osd_client *osdc,
 			  struct ceph_osd *kickosd)
 {
 	struct ceph_osd_request *req;
@@ -839,7 +861,6 @@ static void kick_requests(struct ceph_osd_client *osdc,
 	int err;
 
 	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
-	mutex_lock(&osdc->request_mutex);
 	if (kickosd) {
 		__reset_osd(osdc, kickosd);
 	} else {
@@ -900,14 +921,36 @@ kick:
 			req->r_resend = true;
 		}
 	}
+
+	return needmap;
+}
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed.  Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+			  struct ceph_osd *kickosd)
+{
+	int needmap;
+
+	mutex_lock(&osdc->request_mutex);
+	needmap = __kick_requests(osdc, kickosd);
 	mutex_unlock(&osdc->request_mutex);
 
 	if (needmap) {
 		dout("%d requests for down osds, need new map\n", needmap);
 		ceph_monc_request_next_osdmap(&osdc->client->monc);
 	}
-}
 
+}
 /*
  * Process updated osd map.
  *
@@ -1164,11 +1207,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	init_completion(&osdc->map_waiters);
 	osdc->last_requested_map = 0;
 	mutex_init(&osdc->request_mutex);
-	osdc->timeout_tid = 0;
 	osdc->last_tid = 0;
 	osdc->osds = RB_ROOT;
 	INIT_LIST_HEAD(&osdc->osd_lru);
 	osdc->requests = RB_ROOT;
+	INIT_LIST_HEAD(&osdc->req_lru);
 	osdc->num_requests = 0;
 	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
 	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index f256eba6fe7a..1b1a3ca43afc 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -36,12 +36,15 @@ struct ceph_osd {
 	void *o_authorizer_buf, *o_authorizer_reply_buf;
 	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
 	unsigned long lru_ttl;
+	int o_marked_for_keepalive;
+	struct list_head o_keepalive_item;
 };
 
 /* an in-flight request */
 struct ceph_osd_request {
 	u64             r_tid;              /* unique for this client */
 	struct rb_node  r_node;
+	struct list_head r_req_lru_item;
 	struct list_head r_osd_item;
 	struct ceph_osd *r_osd;
 	struct ceph_pg   r_pgid;
@@ -67,7 +70,7 @@ struct ceph_osd_request {
 
 	char              r_oid[40];          /* object name */
 	int               r_oid_len;
-	unsigned long     r_timeout_stamp;
+	unsigned long     r_sent_stamp;
 	bool              r_resend;           /* msg send failed, needs retry */
 
 	struct ceph_file_layout r_file_layout;
@@ -92,6 +95,7 @@ struct ceph_osd_client {
 	u64                    timeout_tid;   /* tid of timeout triggering rq */
 	u64                    last_tid;      /* tid of last request */
 	struct rb_root         requests;      /* pending requests */
+	struct list_head       req_lru;	      /* pending requests lru */
 	int                    num_requests;
 	struct delayed_work    timeout_work;
 	struct delayed_work    osds_timeout_work;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 74953be75f8f..4290a6e860b0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -292,6 +292,7 @@ enum {
 	Opt_wsize,
 	Opt_rsize,
 	Opt_osdtimeout,
+	Opt_osdkeepalivetimeout,
 	Opt_mount_timeout,
 	Opt_osd_idle_ttl,
 	Opt_caps_wanted_delay_min,
@@ -322,6 +323,7 @@ static match_table_t arg_tokens = {
 	{Opt_wsize, "wsize=%d"},
 	{Opt_rsize, "rsize=%d"},
 	{Opt_osdtimeout, "osdtimeout=%d"},
+	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
 	{Opt_mount_timeout, "mount_timeout=%d"},
 	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
 	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
@@ -367,7 +369,8 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 	/* start with defaults */
 	args->sb_flags = flags;
 	args->flags = CEPH_OPT_DEFAULT;
-	args->osd_timeout = 5;    /* seconds */
+	args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+	args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
 	args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
 	args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
@@ -468,6 +471,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
 		case Opt_osdtimeout:
 			args->osd_timeout = intval;
 			break;
+		case Opt_osdkeepalivetimeout:
+			args->osd_keepalive_timeout = intval;
+			break;
 		case Opt_mount_timeout:
 			args->mount_timeout = intval;
 			break;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6a778f2c3f6e..02c0ddcf3eaf 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,6 +62,7 @@ struct ceph_mount_args {
 	int max_readdir;      /* max readdir size */
 	int congestion_kb;      /* max readdir size */
 	int osd_timeout;
+	int osd_keepalive_timeout;
 	char *snapdir_name;   /* default ".snap" */
 	char *name;
 	char *secret;
@@ -72,6 +73,8 @@ struct ceph_mount_args {
  * defaults
  */
 #define CEPH_MOUNT_TIMEOUT_DEFAULT  60
+#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
+#define CEPH_OSD_KEEPALIVE_DEFAULT  5
 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
 #define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
 
-- 
cgit v1.2.2


From f1a3d57213fe264b4cf584e78bac36aaf9998729 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 18 Jan 2010 11:53:08 +1100
Subject: ceph: update for write_inode API change

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c  | 4 +++-
 fs/ceph/super.h | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8b89b9123252..db122bb357b8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
+#include <linux/writeback.h>
 
 #include "super.h"
 #include "decode.h"
@@ -1801,12 +1802,13 @@ int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
  * get by with fewer MDS messages if we wait for data writeback to
  * complete first.
  */
-int ceph_write_inode(struct inode *inode, int wait)
+int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	unsigned flush_tid;
 	int err = 0;
 	int dirty;
+	int wait = wbc->sync_mode == WB_SYNC_ALL;
 
 	dout("write_inode %p wait=%d\n", inode, wait);
 	if (wait) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 02c0ddcf3eaf..65d12036b670 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -11,6 +11,7 @@
 #include <linux/mempool.h>
 #include <linux/pagemap.h>
 #include <linux/wait.h>
+#include <linux/writeback.h>
 
 #include "types.h"
 #include "messenger.h"
@@ -811,7 +812,7 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
 extern void ceph_put_cap(struct ceph_cap *cap);
 
 extern void ceph_queue_caps_release(struct inode *inode);
-extern int ceph_write_inode(struct inode *inode, int unused);
+extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
 extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 				    struct ceph_mds_session *session);
-- 
cgit v1.2.2


From df2cf170c823ba779ca339e3ede347c87f4dc6a9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 12 Feb 2010 07:44:16 -0500
Subject: cifs: overhaul cifs_revalidate and rename to cifs_revalidate_dentry

cifs_revalidate is renamed to cifs_revalidate_dentry as a later patch
will add a by-filehandle variant.

Add a new "invalid_mapping" flag to the cifsInodeInfo that indicates
that the pagecache is considered invalid. Add a new routine to check
inode attributes whenever they're updated and set that flag if the inode
has changed on the server.

cifs_revalidate_dentry is then changed to just update the attrcache if
needed and then to zap the pagecache if it's not valid.

There are some other behavior changes in here as well. Open files are
now allowed to have their caches invalidated. I see no reason why we'd
want to keep stale data around just because a file is open. Also,
cifs_revalidate_cache uses the server_eof for revalidating the file
size since that should more closely match the size of the file on the
server.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c   |   3 +-
 fs/cifs/cifsfs.h   |   2 +-
 fs/cifs/cifsglob.h |   1 +
 fs/cifs/dir.c      |   2 +-
 fs/cifs/file.c     |   2 +-
 fs/cifs/inode.c    | 211 +++++++++++++++++++++++++----------------------------
 6 files changed, 104 insertions(+), 117 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..cf85a4165b01 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_inode->clientCanCacheRead = false;
 	cifs_inode->clientCanCacheAll = false;
 	cifs_inode->delete_pending = false;
+	cifs_inode->invalid_mapping = false;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->server_eof = 0;
 
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 		   setting the revalidate time to zero */
 		CIFS_I(file->f_path.dentry->d_inode)->time = 0;
 
-		retval = cifs_revalidate(file->f_path.dentry);
+		retval = cifs_revalidate_dentry(file->f_path.dentry);
 		if (retval < 0)
 			return (loff_t)retval;
 	}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f6..2af995ca8955 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,7 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
 extern int cifs_rmdir(struct inode *, struct dentry *);
 extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
 		       struct dentry *);
-extern int cifs_revalidate(struct dentry *);
+extern int cifs_revalidate_dentry(struct dentry *);
 extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int cifs_setattr(struct dentry *, struct iattr *);
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a1c817eb291a..63c89d1d70b5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -389,6 +389,7 @@ struct cifsInodeInfo {
 	bool clientCanCacheRead:1;	/* read oplock */
 	bool clientCanCacheAll:1;	/* read and writebehind oplock */
 	bool delete_pending:1;		/* DELETE_ON_CLOSE is set */
+	bool invalid_mapping:1;		/* pagecache is invalid */
 	u64  server_eof;		/* current file size on server */
 	u64  uniqueid;			/* server inode number */
 	struct inode vfs_inode;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -739,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 	int isValid = 1;
 
 	if (direntry->d_inode) {
-		if (cifs_revalidate(direntry))
+		if (cifs_revalidate_dentry(direntry))
 			return 0;
 	} else {
 		cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3d8f8a96f5a3..b90f8f2ca85c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1894,7 +1894,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	int rc, xid;
 
 	xid = GetXid();
-	rc = cifs_revalidate(dentry);
+	rc = cifs_revalidate_dentry(dentry);
 	if (rc) {
 		cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
 		FreeXid(xid);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8bdbc818164c..f050dba920cb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -77,6 +77,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
 	}
 }
 
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
+
+	if (inode->i_state & I_NEW) {
+		cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
+		return;
+	}
+
+	/* don't bother with revalidation if we have an oplock */
+	if (cifs_i->clientCanCacheRead) {
+		cFYI(1, ("%s: inode %llu is oplocked", __func__,
+			 cifs_i->uniqueid));
+		return;
+	}
+
+	 /* revalidate if mtime or size have changed */
+	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+	    cifs_i->server_eof == fattr->cf_eof) {
+		cFYI(1, ("%s: inode %llu is unchanged", __func__,
+			 cifs_i->uniqueid));
+		return;
+	}
+
+	cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
+		 cifs_i->uniqueid));
+	cifs_i->invalid_mapping = true;
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -85,6 +120,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	unsigned long oldtime = cifs_i->time;
 
+	cifs_revalidate_cache(inode, fattr);
+
 	inode->i_atime = fattr->cf_atime;
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
@@ -1389,135 +1426,83 @@ cifs_rename_exit:
 	return rc;
 }
 
-int cifs_revalidate(struct dentry *direntry)
+static bool
+cifs_inode_needs_reval(struct inode *inode)
 {
-	int xid;
-	int rc = 0, wbrc = 0;
-	char *full_path;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsInodeInfo *cifsInode;
-	loff_t local_size;
-	struct timespec local_mtime;
-	bool invalidate_inode = false;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
-	if (direntry->d_inode == NULL)
-		return -ENOENT;
+	if (cifs_i->clientCanCacheRead)
+		return false;
 
-	cifsInode = CIFS_I(direntry->d_inode);
+	if (!lookupCacheEnabled)
+		return true;
 
-	if (cifsInode == NULL)
-		return -ENOENT;
+	if (cifs_i->time == 0)
+		return true;
 
-	/* no sense revalidating inode info on file that no one can write */
-	if (CIFS_I(direntry->d_inode)->clientCanCacheRead)
-		return rc;
+	/* FIXME: the actimeo should be tunable */
+	if (time_after_eq(jiffies, cifs_i->time + HZ))
+		return true;
+
+	return false;
+}
+
+/* check invalid_mapping flag and zap the cache if it's set */
+static void
+cifs_invalidate_mapping(struct inode *inode)
+{
+	int rc;
+	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+	cifs_i->invalid_mapping = false;
+
+	/* write back any cached data */
+	if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
+		rc = filemap_write_and_wait(inode->i_mapping);
+		if (rc)
+			cifs_i->write_behind_rc = rc;
+	}
+	invalidate_remote_inode(inode);
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	int xid;
+	int rc = 0;
+	char *full_path = NULL;
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dentry->d_sb;
+
+	if (inode == NULL)
+		return -ENOENT;
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(direntry->d_sb);
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
 
 	/* can not safely grab the rename sem here if rename calls revalidate
 	   since that would deadlock */
-	full_path = build_path_from_dentry(direntry);
+	full_path = build_path_from_dentry(dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto check_inval;
 	}
-	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
-		 "jiffies %ld", full_path, direntry->d_inode,
-		 direntry->d_inode->i_count.counter, direntry,
-		 direntry->d_time, jiffies));
-
-	if (cifsInode->time == 0) {
-		/* was set to zero previously to force revalidate */
-	} else if (time_before(jiffies, cifsInode->time + HZ) &&
-		   lookupCacheEnabled) {
-		if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
-		    (direntry->d_inode->i_nlink == 1)) {
-			kfree(full_path);
-			FreeXid(xid);
-			return rc;
-		} else {
-			cFYI(1, ("Have to revalidate file due to hardlinks"));
-		}
-	}
-
-	/* save mtime and size */
-	local_mtime = direntry->d_inode->i_mtime;
-	local_size = direntry->d_inode->i_size;
 
-	if (cifs_sb->tcon->unix_ext) {
-		rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
-					      direntry->d_sb, xid);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	} else {
-		rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
-					 direntry->d_sb, xid, NULL);
-		if (rc) {
-			cFYI(1, ("error on getting revalidate info %d", rc));
-/*			if (rc != -ENOENT)
-				rc = 0; */	/* BB should we cache info on
-						   certain errors? */
-		}
-	}
-	/* should we remap certain errors, access denied?, to zero */
-
-	/* if not oplocked, we invalidate inode pages if mtime or file size
-	   had changed on server */
+	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
+		 "jiffies %ld", full_path, inode, inode->i_count.counter,
+		 dentry, dentry->d_time, jiffies));
 
-	if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) &&
-	    (local_size == direntry->d_inode->i_size)) {
-		cFYI(1, ("cifs_revalidate - inode unchanged"));
-	} else {
-		/* file may have changed on server */
-		if (cifsInode->clientCanCacheRead) {
-			/* no need to invalidate inode pages since we were the
-			   only ones who could have modified the file and the
-			   server copy is staler than ours */
-		} else {
-			invalidate_inode = true;
-		}
-	}
+	if (CIFS_SB(sb)->tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+	else
+		rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+					 xid, NULL);
 
-	/* can not grab this sem since kernel filesys locking documentation
-	   indicates i_mutex may be taken by the kernel on lookup and rename
-	   which could deadlock if we grab the i_mutex here as well */
-/*	mutex_lock(&direntry->d_inode->i_mutex);*/
-	/* need to write out dirty pages here  */
-	if (direntry->d_inode->i_mapping) {
-		/* do we need to lock inode until after invalidate completes
-		   below? */
-		wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
-		if (wbrc)
-			CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
-	}
-	if (invalidate_inode) {
-	/* shrink_dcache not necessary now that cifs dentry ops
-	are exported for negative dentries */
-/*		if (S_ISDIR(direntry->d_inode->i_mode))
-			shrink_dcache_parent(direntry); */
-		if (S_ISREG(direntry->d_inode->i_mode)) {
-			if (direntry->d_inode->i_mapping) {
-				wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
-				if (wbrc)
-					CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
-			}
-			/* may eventually have to do this for open files too */
-			if (list_empty(&(cifsInode->openFileList))) {
-				/* changed on server - flush read ahead pages */
-				cFYI(1, ("Invalidating read ahead data on "
-					 "closed file"));
-				invalidate_remote_inode(direntry->d_inode);
-			}
-		}
-	}
-/*	mutex_unlock(&direntry->d_inode->i_mutex); */
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
 
 	kfree(full_path);
 	FreeXid(xid);
@@ -1527,7 +1512,7 @@ int cifs_revalidate(struct dentry *direntry)
 int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	struct kstat *stat)
 {
-	int err = cifs_revalidate(dentry);
+	int err = cifs_revalidate_dentry(dentry);
 	if (!err) {
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blksize = CIFS_MAX_MSGSIZE;
-- 
cgit v1.2.2


From bcd5357f430363376565d07ca542127d6d36602c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 12 Feb 2010 07:44:16 -0500
Subject: cifs: add a CIFSSMBQFileInfo function

...to get inode attributes via filehandle instead of by path.

In some places, we need to revalidate an inode on an open filehandle,
but we can't necessarily guarantee that the dentry associated with it
will still be valid. When we have an open filehandle already, it makes
more sense to do a filehandle based operation anyway.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  2 ++
 fs/cifs/cifssmb.c   | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88e2bc44ac58..bf2bff14c24f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -142,6 +142,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
 			const __u16 search_handle);
 
+extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+			u16 netfid, FILE_ALL_INFO *pFindData);
 extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			FILE_ALL_INFO *findData,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 9d17df3e0768..4ed97825db3d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3230,8 +3230,72 @@ QInfRetry:
 	return rc;
 }
 
+int
+CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+		 u16 netfid, FILE_ALL_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+QFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, ("Send error in QPathInfo = %d", rc));
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
+		if (rc) /* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (pSMBr->ByteCount < 40)
+			rc = -EIO;	/* bad smb */
+		else if (pFindData) {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset, sizeof(FILE_ALL_INFO));
+		} else
+		    rc = -ENOMEM;
+	}
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto QFileInfoRetry;
 
+	return rc;
+}
 
 int
 CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
-- 
cgit v1.2.2


From c8634fd3115497ac311f57be9c12f993437745cf Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 12 Feb 2010 07:44:17 -0500
Subject: cifs: add a CIFSSMBUnixQFileInfo function

...to allow us to get unix attrs via filehandle.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  2 ++
 fs/cifs/cifssmb.c   | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index bf2bff14c24f..ce9199f0af9d 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -154,6 +154,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
 			FILE_ALL_INFO *findData,
 			const struct nls_table *nls_codepage, int remap);
 
+extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+			u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
 extern int CIFSSMBUnixQPathInfo(const int xid,
 			struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4ed97825db3d..903d53871da7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3398,6 +3398,75 @@ QPathInfoRetry:
 	return rc;
 }
 
+int
+CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+		 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
+{
+	struct smb_t2_qfi_req *pSMB = NULL;
+	struct smb_t2_qfi_rsp *pSMBr = NULL;
+	int rc = 0;
+	int bytes_returned;
+	__u16 params, byte_count;
+
+UnixQFileInfoRetry:
+	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+		      (void **) &pSMBr);
+	if (rc)
+		return rc;
+
+	params = 2 /* level */ + 2 /* fid */;
+	pSMB->t2.TotalDataCount = 0;
+	pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+	/* BB find exact max data count below from sess structure BB */
+	pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+	pSMB->t2.MaxSetupCount = 0;
+	pSMB->t2.Reserved = 0;
+	pSMB->t2.Flags = 0;
+	pSMB->t2.Timeout = 0;
+	pSMB->t2.Reserved2 = 0;
+	pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+					       Fid) - 4);
+	pSMB->t2.DataCount = 0;
+	pSMB->t2.DataOffset = 0;
+	pSMB->t2.SetupCount = 1;
+	pSMB->t2.Reserved3 = 0;
+	pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+	byte_count = params + 1 /* pad */ ;
+	pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+	pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
+	pSMB->Pad = 0;
+	pSMB->Fid = netfid;
+	pSMB->hdr.smb_buf_length += byte_count;
+
+	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+	if (rc) {
+		cFYI(1, ("Send error in QPathInfo = %d", rc));
+	} else {		/* decode response */
+		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+			cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
+				   "Unix Extensions can be disabled on mount "
+				   "by specifying the nosfu mount option."));
+			rc = -EIO;	/* bad smb */
+		} else {
+			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			memcpy((char *) pFindData,
+			       (char *) &pSMBr->hdr.Protocol +
+			       data_offset,
+			       sizeof(FILE_UNIX_BASIC_INFO));
+		}
+	}
+
+	cifs_buf_release(pSMB);
+	if (rc == -EAGAIN)
+		goto UnixQFileInfoRetry;
+
+	return rc;
+}
+
 int
 CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		     const unsigned char *searchName,
-- 
cgit v1.2.2


From 8bf8c376ab2eefaf0386f4e003e720e1434fa43d Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Wed, 3 Mar 2010 06:28:06 +0300
Subject: blkdev: fix merge_bvec_fn return value checks v2

merge_bvec_fn() returns bvec->bv_len on success. So we have to check
against this value. But in case of fs_optimization merge we compare
with wrong value. This patch must be included in
 b428cd6da7e6559aca69aa2e3a526037d3f20403
But accidentally i've forgot to add this in the initial patch.
To make things straight let's replace all such checks.
In fact this makes code easy to understand.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index dc17afd672e3..d89e0199dc0d 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -555,7 +555,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 					.bi_rw = bio->bi_rw,
 				};
 
-				if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+				if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
 					prev->bv_len -= len;
 					return 0;
 				}
@@ -608,7 +608,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 		 * merge_bvec_fn() returns number of bytes it can accept
 		 * at this offset
 		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
+		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
 			bvec->bv_page = NULL;
 			bvec->bv_len = 0;
 			bvec->bv_offset = 0;
-- 
cgit v1.2.2


From 7dd08a570dcf45d52155996fee688405635ee481 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 6 Mar 2010 15:02:22 +0300
Subject: nfs: fix unlikely memory leak

I'll admit that it's unlikely for the first allocation to fail and
the second one to succeed.  I won't be offended if you ignore this
patch.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index eda74c42d552..f9254fb0c9d0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5107,6 +5107,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
 	res = kzalloc(sizeof(*res), GFP_KERNEL);
 	if (!args || !res) {
 		kfree(args);
+		kfree(res);
 		nfs_put_client(clp);
 		return -ENOMEM;
 	}
-- 
cgit v1.2.2


From 49697ee79242d5f8ac88f1ebc62e583d16bcc687 Mon Sep 17 00:00:00 2001
From: Steve Dickson <SteveD@redhat.com>
Date: Tue, 13 Oct 2009 16:07:33 -0400
Subject: nfs4: Make the v4 callback service hidden

To avoid hangs in the svc_unregister(), on version 4 mounts
(and unmounts), when rpcbind is not running, make the nfs4 callback
program an 'hidden' service by setting the 'vs_hidden' flag in the
nfs4_callback_version structure.

Signed-off-by: Steve Dickson <steved@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_xdr.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index db30c0b398b5..a2b8b4df125d 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -782,6 +782,7 @@ struct svc_version nfs4_callback_version1 = {
 	.vs_proc = nfs4_callback_procedures1,
 	.vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
 	.vs_dispatch = NULL,
+	.vs_hidden = 1,
 };
 
 struct svc_version nfs4_callback_version4 = {
-- 
cgit v1.2.2


From e9edb1d8a345119c9baafa1b240eb1ec06a44662 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 3 Mar 2010 08:53:51 -0500
Subject: GFS2: do not select QUOTA

gfs2 only needs the quotactl code, not the generic quota implementation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 4dcddf83326f..a47b43107112 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -8,7 +8,6 @@ config GFS2_FS
 	select FS_POSIX_ACL
 	select CRC32
 	select SLOW_WORK
-	select QUOTA
 	select QUOTACTL
 	help
 	  A cluster filesystem.
-- 
cgit v1.2.2


From abab095d1fd25986b910d3c46289d8fa3582cdc5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 12 Feb 2010 07:44:18 -0500
Subject: cifs: add cifs_revalidate_file

...to allow updating inode attributes on an existing inode by
filehandle. Change mmap and llseek codepaths to use that
instead of cifs_revalidate_dentry since they have a filehandle
readily available.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c    |  2 +-
 fs/cifs/cifsfs.h    |  1 +
 fs/cifs/cifsproto.h |  2 ++
 fs/cifs/file.c      |  3 +-
 fs/cifs/inode.c     | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 91 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index cf85a4165b01..5183bc2a1916 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -639,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 		   setting the revalidate time to zero */
 		CIFS_I(file->f_path.dentry->d_inode)->time = 0;
 
-		retval = cifs_revalidate_dentry(file->f_path.dentry);
+		retval = cifs_revalidate_file(file);
 		if (retval < 0)
 			return (loff_t)retval;
 	}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2af995ca8955..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,6 +61,7 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
 extern int cifs_rmdir(struct inode *, struct dentry *);
 extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
 		       struct dentry *);
+extern int cifs_revalidate_file(struct file *filp);
 extern int cifs_revalidate_dentry(struct dentry *);
 extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int cifs_setattr(struct dentry *, struct iattr *);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ce9199f0af9d..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
 extern struct inode *cifs_iget(struct super_block *sb,
 			       struct cifs_fattr *fattr);
 
+extern int cifs_get_file_info(struct file *filp);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
 			struct super_block *sb, int xid, const __u16 *pfid);
+extern int cifs_get_file_info_unix(struct file *filp);
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b90f8f2ca85c..1389f6ecef9e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1890,11 +1890,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 
 int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	int rc, xid;
 
 	xid = GetXid();
-	rc = cifs_revalidate_dentry(dentry);
+	rc = cifs_revalidate_file(file);
 	if (rc) {
 		cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
 		FreeXid(xid);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f050dba920cb..0d034a84bb8d 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -268,6 +268,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 	fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
 }
 
+int cifs_get_file_info_unix(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_UNIX_BASIC_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (!rc) {
+		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	}
+
+	cifs_fattr_to_inode(inode, &fattr);
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 			     const unsigned char *full_path,
 			     struct super_block *sb, int xid)
@@ -469,6 +494,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 	fattr->cf_gid = cifs_sb->mnt_gid;
 }
 
+int cifs_get_file_info(struct file *filp)
+{
+	int rc;
+	int xid;
+	FILE_ALL_INFO find_data;
+	struct cifs_fattr fattr;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsTconInfo *tcon = cifs_sb->tcon;
+	struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+	xid = GetXid();
+	rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
+	if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+		/*
+		 * FIXME: legacy server -- fall back to path-based call?
+ 		 * for now, just skip revalidating and mark inode for
+ 		 * immediate reval.
+ 		 */
+		rc = 0;
+		CIFS_I(inode)->time = 0;
+		goto cgfi_exit;
+	} else if (rc == -EREMOTE) {
+		cifs_create_dfs_fattr(&fattr, inode->i_sb);
+		rc = 0;
+	} else if (rc)
+		goto cgfi_exit;
+
+	/*
+	 * don't bother with SFU junk here -- just mark inode as needing
+	 * revalidation.
+	 */
+	cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+	cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+	FreeXid(xid);
+	return rc;
+}
+
 int cifs_get_inode_info(struct inode **pinode,
 	const unsigned char *full_path, FILE_ALL_INFO *pfindData,
 	struct super_block *sb, int xid, const __u16 *pfid)
@@ -1465,6 +1531,26 @@ cifs_invalidate_mapping(struct inode *inode)
 	invalidate_remote_inode(inode);
 }
 
+int cifs_revalidate_file(struct file *filp)
+{
+	int rc = 0;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+	if (!cifs_inode_needs_reval(inode))
+		goto check_inval;
+
+	if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+		rc = cifs_get_file_info_unix(filp);
+	else
+		rc = cifs_get_file_info(filp);
+
+check_inval:
+	if (CIFS_I(inode)->invalid_mapping)
+		cifs_invalidate_mapping(inode);
+
+	return rc;
+}
+
 /* revalidate a dentry's inode attributes */
 int cifs_revalidate_dentry(struct dentry *dentry)
 {
-- 
cgit v1.2.2


From ff215713eb33c56301cf6bfec0143ddc7f22c138 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 9 Mar 2010 20:30:42 +0000
Subject: [CIFS] checkpatch cleanup

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c  | 4 ++--
 fs/cifs/inode.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1389f6ecef9e..ca2ba7a0193c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -219,8 +219,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 		cFYI(1, ("inode unchanged on server"));
 	} else {
 		if (file->f_path.dentry->d_inode->i_mapping) {
-		/* BB no need to lock inode until after invalidate
-		   since namei code should already have it locked? */
+			/* BB no need to lock inode until after invalidate
+			since namei code should already have it locked? */
 			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
 			if (rc != 0)
 				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0d034a84bb8d..723daaccbd0e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -510,9 +510,9 @@ int cifs_get_file_info(struct file *filp)
 	if (rc == -EOPNOTSUPP || rc == -EINVAL) {
 		/*
 		 * FIXME: legacy server -- fall back to path-based call?
- 		 * for now, just skip revalidating and mark inode for
- 		 * immediate reval.
- 		 */
+		 * for now, just skip revalidating and mark inode for
+		 * immediate reval.
+		 */
 		rc = 0;
 		CIFS_I(inode)->time = 0;
 		goto cgfi_exit;
-- 
cgit v1.2.2


From b4d2314bb88b07e5a04e6c75b442a1dfcd60e340 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 10 Mar 2010 15:21:44 -0500
Subject: NFSv4: Don't ignore the NFS_INO_REVAL_FORCED flag in
 nfs_revalidate_inode()

If the NFS_INO_REVAL_FORCED flag is set, that means that we don't yet have
an up to date attribute cache. Even if we hold a delegation, we must
put a GETATTR on the wire.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/delegation.h | 6 ++++++
 fs/nfs/dir.c        | 2 +-
 fs/nfs/inode.c      | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e1..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
 }
 #endif
 
+static inline int nfs_have_delegated_attributes(struct inode *inode)
+{
+	return nfs_have_delegation(inode, FMODE_READ) &&
+		!(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
+}
+
 #endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a1f6b4438fb1..c6f2750648f4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1789,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 	cache = nfs_access_search_rbtree(inode, cred);
 	if (cache == NULL)
 		goto out;
-	if (!nfs_have_delegation(inode, FMODE_READ) &&
+	if (!nfs_have_delegated_attributes(inode) &&
 	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
 		goto out_stale;
 	res->jiffies = cache->jiffies;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 657201acda84..e358df75a6ad 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -729,7 +729,7 @@ int nfs_attribute_timeout(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (nfs_have_delegation(inode, FMODE_READ))
+	if (nfs_have_delegated_attributes(inode))
 		return 0;
 	return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
-- 
cgit v1.2.2


From 2e95e3f668c85276ce699993596d3b52b0fcf4c5 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Wed, 10 Mar 2010 18:10:19 -0600
Subject: GFS2: Allow the number of committed revokes to temporarily be
 negative

GFS2 tracks the number of revokes and unrevokes that are part of committed
transactions via sd_log_commited_revoke. It is possible for one process to add
revokes during its transaction, while another process unrevokes them during its
transaction. If the second process finishes its transaction first,
sd_log_commited_revoke will be decremented by the number of unrevokes that the
second process did, without first being incremented by the number of revokes
the first process did. This is fine, since all started transactions must be
completed before the journal can be flushed.  However, sd_log_commited_revoke
is an unsigned integer, and log_refund() causes an assertion failure if it
would go negative at the end of a transaction.  This patch makes
sd_log_commited_revoke a signed integer and allows it to go negative.
__gfs2_log_flush() still checks that it mataches the actual number of revokes.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 2 +-
 fs/gfs2/log.c    | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b8025e51cabf..3aac46f6853e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -616,7 +616,7 @@ struct gfs2_sbd {
 	unsigned int sd_log_blks_reserved;
 	unsigned int sd_log_commited_buf;
 	unsigned int sd_log_commited_databuf;
-	unsigned int sd_log_commited_revoke;
+	int sd_log_commited_revoke;
 
 	unsigned int sd_log_num_buf;
 	unsigned int sd_log_num_revoke;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4511b08fc451..e5bf4b59d46e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -417,7 +417,7 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp)
 	databufhdrs_needed = (sdp->sd_log_commited_databuf +
 			      (dbuf_limit - 1)) / dbuf_limit;
 
-	if (sdp->sd_log_commited_revoke)
+	if (sdp->sd_log_commited_revoke > 0)
 		revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
 					  sizeof(u64));
 
@@ -790,7 +790,6 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
 			     (((int)sdp->sd_log_commited_databuf) >= 0));
 	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
-	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
 	reserved = calc_reserved(sdp);
 	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
 	unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
-- 
cgit v1.2.2


From bb6fbc4548b9ae7ebbd06ef72f00229df259d217 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 11 Mar 2010 09:19:35 -0500
Subject: NFS: Avoid a deadlock in nfs_release_page

J.R. Okajima reports the following deadlock:

INFO: task kswapd0:305 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kswapd0       D 0000000000000001     0   305      2 0x00000000
 ffff88001f21d4f0 0000000000000046 ffff88001fdea680 ffff88001f21c000
 ffff88001f21dfd8 ffff88001f21c000 ffff88001f21dfd8 ffff88001f21dfd8
 ffff88001fdea040 0000000000014c00 0000000000000001 ffff88001fdea040
Call Trace:
 [<ffffffff8146155d>] io_schedule+0x4d/0x70
 [<ffffffff810d2be5>] sync_page+0x65/0xa0
 [<ffffffff81461b12>] __wait_on_bit_lock+0x52/0xb0
 [<ffffffff810d2b80>] ? sync_page+0x0/0xa0
 [<ffffffff810d2b64>] __lock_page+0x64/0x70
 [<ffffffff81070ce0>] ? wake_bit_function+0x0/0x40
 [<ffffffff810df1d4>] truncate_inode_pages_range+0x344/0x4a0
 [<ffffffff810df340>] truncate_inode_pages+0x10/0x20
 [<ffffffff8112cbfe>] generic_delete_inode+0x15e/0x190
 [<ffffffff8112cc8d>] generic_drop_inode+0x5d/0x80
 [<ffffffff8112bb88>] iput+0x78/0x80
 [<ffffffff811bc908>] nfs_dentry_iput+0x38/0x50
 [<ffffffff811285f4>] dentry_iput+0x84/0x110
 [<ffffffff811286ae>] d_kill+0x2e/0x60
 [<ffffffff8112912a>] dput+0x7a/0x170
 [<ffffffff8111e925>] path_put+0x15/0x40
 [<ffffffff811c3a44>] __put_nfs_open_context+0xa4/0xb0
 [<ffffffff811cb5d0>] ? nfs_free_request+0x0/0x50
 [<ffffffff811c3b0b>] put_nfs_open_context+0xb/0x10
 [<ffffffff811cb5f9>] nfs_free_request+0x29/0x50
 [<ffffffff81234b7e>] kref_put+0x8e/0xe0
 [<ffffffff811cb594>] nfs_release_request+0x14/0x20
 [<ffffffff811cf769>] nfs_find_and_lock_request+0x89/0xa0
 [<ffffffff811d1180>] nfs_wb_page+0x80/0x110
 [<ffffffff811c0770>] nfs_release_page+0x70/0x90
 [<ffffffff810d18ee>] try_to_release_page+0x5e/0x80
 [<ffffffff810e1178>] shrink_page_list+0x638/0x860
 [<ffffffff810e19de>] shrink_zone+0x63e/0xc40

We can fix this by making the call to put_nfs_open_context() happen when we
actually remove the write request from the inode (which is done by the
nfsiod thread in this case).

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/pagelist.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a12c45b65dd4..29d9d36cd5f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
  */
 int nfs_set_page_tag_locked(struct nfs_page *req)
 {
-	struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
-
 	if (!nfs_lock_request_dontget(req))
 		return 0;
 	if (req->wb_page != NULL)
-		radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
+		radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 	return 1;
 }
 
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
  */
 void nfs_clear_page_tag_locked(struct nfs_page *req)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
-
 	if (req->wb_page != NULL) {
+		struct inode *inode = req->wb_context->path.dentry->d_inode;
+		struct nfs_inode *nfsi = NFS_I(inode);
+
 		spin_lock(&inode->i_lock);
 		radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
 		nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
  * nfs_clear_request - Free up all resources allocated to the request
  * @req:
  *
- * Release page resources associated with a write request after it
- * has completed.
+ * Release page and open context resources associated with a read/write
+ * request after it has completed.
  */
 void nfs_clear_request(struct nfs_page *req)
 {
 	struct page *page = req->wb_page;
+	struct nfs_open_context *ctx = req->wb_context;
+
 	if (page != NULL) {
 		page_cache_release(page);
 		req->wb_page = NULL;
 	}
+	if (ctx != NULL) {
+		put_nfs_open_context(ctx);
+		req->wb_context = NULL;
+	}
 }
 
 
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
 {
 	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
 
-	/* Release struct file or cached credential */
+	/* Release struct file and open context */
 	nfs_clear_request(req);
-	put_nfs_open_context(req->wb_context);
 	nfs_page_free(req);
 }
 
-- 
cgit v1.2.2


From 720e7749279bde0d08684b1bb4e7a2eedeec6394 Mon Sep 17 00:00:00 2001
From: Sachin Prabhu <sprabhu@redhat.com>
Date: Thu, 11 Mar 2010 12:24:45 -0500
Subject: GFS2: Skip check for mandatory locks when unlocking

gfs2_lock() will skip locks on file which have mode set to 02666. This is a problem in cases where the mode of the file is changed after a process has obtained a lock on the file. Such a lock will be skipped and will result in a BUG in locks_remove_flock().

gfs2_lock() should skip the check for mandatory locks when unlocking a file.

Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index a6abbae8a278..e6dd2aec6f82 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -640,7 +640,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(&ip->i_inode))
+	if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (cmd == F_CANCELLK) {
-- 
cgit v1.2.2


From f11c9c5c259cb2c3d698548dc3936f773ab1f5b9 Mon Sep 17 00:00:00 2001
From: Edward Shishkin <edward.shishkin@gmail.com>
Date: Thu, 11 Mar 2010 14:09:47 -0800
Subject: vfs: improve writeback_inodes_wb()

Do not pin/unpin superblock for every inode in writeback_inodes_wb(), pin
it for the whole group of inodes which belong to the same superblock and
call writeback_sb_inodes() handler for them.

Signed-off-by: Edward Shishkin <edward.shishkin@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 133 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 73 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d594acb..6841effa47ca 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -553,108 +553,85 @@ select_queue:
 	return ret;
 }
 
-static void unpin_sb_for_writeback(struct super_block **psb)
+static void unpin_sb_for_writeback(struct super_block *sb)
 {
-	struct super_block *sb = *psb;
-
-	if (sb) {
-		up_read(&sb->s_umount);
-		put_super(sb);
-		*psb = NULL;
-	}
+	up_read(&sb->s_umount);
+	put_super(sb);
 }
 
+enum sb_pin_state {
+	SB_PINNED,
+	SB_NOT_PINNED,
+	SB_PIN_FAILED
+};
+
 /*
  * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
  * go away while we are writing inodes from it.
- *
- * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
- * 1 if we failed.
  */
-static int pin_sb_for_writeback(struct writeback_control *wbc,
-				struct inode *inode, struct super_block **psb)
+static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
+					      struct super_block *sb)
 {
-	struct super_block *sb = inode->i_sb;
-
-	/*
-	 * If this sb is already pinned, nothing more to do. If not and
-	 * *psb is non-NULL, unpin the old one first
-	 */
-	if (sb == *psb)
-		return 0;
-	else if (*psb)
-		unpin_sb_for_writeback(psb);
-
 	/*
 	 * Caller must already hold the ref for this
 	 */
 	if (wbc->sync_mode == WB_SYNC_ALL) {
 		WARN_ON(!rwsem_is_locked(&sb->s_umount));
-		return 0;
+		return SB_NOT_PINNED;
 	}
-
 	spin_lock(&sb_lock);
 	sb->s_count++;
 	if (down_read_trylock(&sb->s_umount)) {
 		if (sb->s_root) {
 			spin_unlock(&sb_lock);
-			goto pinned;
+			return SB_PINNED;
 		}
 		/*
 		 * umounted, drop rwsem again and fall through to failure
 		 */
 		up_read(&sb->s_umount);
 	}
-
 	sb->s_count--;
 	spin_unlock(&sb_lock);
-	return 1;
-pinned:
-	*psb = sb;
-	return 0;
+	return SB_PIN_FAILED;
 }
 
-static void writeback_inodes_wb(struct bdi_writeback *wb,
-				struct writeback_control *wbc)
+/*
+ * Write a portion of b_io inodes which belong to @sb.
+ * If @wbc->sb != NULL, then find and write all such
+ * inodes. Otherwise write only ones which go sequentially
+ * in reverse order.
+ * Return 1, if the caller writeback routine should be
+ * interrupted. Otherwise return 0.
+ */
+static int writeback_sb_inodes(struct super_block *sb,
+			       struct bdi_writeback *wb,
+			       struct writeback_control *wbc)
 {
-	struct super_block *sb = wbc->sb, *pin_sb = NULL;
-	const unsigned long start = jiffies;	/* livelock avoidance */
-
-	spin_lock(&inode_lock);
-
-	if (!wbc->for_kupdate || list_empty(&wb->b_io))
-		queue_io(wb, wbc->older_than_this);
-
 	while (!list_empty(&wb->b_io)) {
-		struct inode *inode = list_entry(wb->b_io.prev,
-						struct inode, i_list);
 		long pages_skipped;
-
-		/*
-		 * super block given and doesn't match, skip this inode
-		 */
-		if (sb && sb != inode->i_sb) {
+		struct inode *inode = list_entry(wb->b_io.prev,
+						 struct inode, i_list);
+		if (wbc->sb && sb != inode->i_sb) {
+			/* super block given and doesn't
+			   match, skip this inode */
 			redirty_tail(inode);
 			continue;
 		}
-
+		if (sb != inode->i_sb)
+			/* finish with this superblock */
+			return 0;
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
 		}
-
 		/*
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
 		 */
-		if (inode_dirtied_after(inode, start))
-			break;
-
-		if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
-			requeue_io(inode);
-			continue;
-		}
+		if (inode_dirtied_after(inode, wbc->wb_start))
+			return 1;
 
 		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
 		__iget(inode);
@@ -673,14 +650,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		spin_lock(&inode_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
-			break;
+			return 1;
 		}
 		if (!list_empty(&wb->b_more_io))
 			wbc->more_io = 1;
 	}
+	/* b_io is empty */
+	return 1;
+}
+
+static void writeback_inodes_wb(struct bdi_writeback *wb,
+				struct writeback_control *wbc)
+{
+	int ret = 0;
 
-	unpin_sb_for_writeback(&pin_sb);
+	wbc->wb_start = jiffies; /* livelock avoidance */
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+
+	while (!list_empty(&wb->b_io)) {
+		struct inode *inode = list_entry(wb->b_io.prev,
+						 struct inode, i_list);
+		struct super_block *sb = inode->i_sb;
+		enum sb_pin_state state;
+
+		if (wbc->sb && sb != wbc->sb) {
+			/* super block given and doesn't
+			   match, skip this inode */
+			redirty_tail(inode);
+			continue;
+		}
+		state = pin_sb_for_writeback(wbc, sb);
+
+		if (state == SB_PIN_FAILED) {
+			requeue_io(inode);
+			continue;
+		}
+		ret = writeback_sb_inodes(sb, wb, wbc);
 
+		if (state == SB_PINNED)
+			unpin_sb_for_writeback(sb);
+		if (ret)
+			break;
+	}
 	spin_unlock(&inode_lock);
 	/* Leave any unwritten inodes on b_io */
 }
-- 
cgit v1.2.2


From fae4528b2341f2ab0c86c191e24d9cdd93624c60 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 6 Mar 2010 04:44:16 +0000
Subject: fs/9p: re-init the wstat in readdir loop

This ensure that on failure when we free the stat buf we don't end up
freeing an already freed pointer in the earlier loop

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_dir.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 6580aa449541..d8a3afe4ff72 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -76,6 +76,15 @@ static inline int dt_type(struct p9_wstat *mistat)
 	return rettype;
 }
 
+static void p9stat_init(struct p9_wstat *stbuf)
+{
+	stbuf->name  = NULL;
+	stbuf->uid   = NULL;
+	stbuf->gid   = NULL;
+	stbuf->muid  = NULL;
+	stbuf->extension = NULL;
+}
+
 /**
  * v9fs_dir_readdir - read a directory
  * @filp: opened file structure
@@ -131,8 +140,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			rdir->head = 0;
 			rdir->tail = err;
 		}
-
 		while (rdir->head < rdir->tail) {
+			p9stat_init(&st);
 			err = p9stat_read(rdir->buf + rdir->head,
 						buflen - rdir->head, &st,
 						fid->clnt->proto_version);
-- 
cgit v1.2.2


From 45bc21edb52fa71dbb1324c6f573aa880e95519d Mon Sep 17 00:00:00 2001
From: Sripathi Kodi <sripathik@in.ibm.com>
Date: Mon, 8 Mar 2010 17:33:04 +0000
Subject: 9p: Change the name of new protocol from 9p2010.L to 9p2000.L

This patch changes the name of the new 9P protocol from 9p2010.L to
9p2000.u. This is because we learnt that the name 9p2010 is already
being used by others.

Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 79000bf62491..6b801d1ddf4b 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -24,7 +24,7 @@
 /**
  * enum p9_session_flags - option flags for each 9P session
  * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions
- * @V9FS_PROTO_2010L: whether or not to use 9P2010.l extensions
+ * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
  * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
  * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
  * @V9FS_ACCESS_ANY: use a single attach for all users
@@ -34,7 +34,7 @@
  */
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
-	V9FS_PROTO_2010L	= 0x02,
+	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
 	V9FS_ACCESS_ANY		= 0x0C,
@@ -130,5 +130,5 @@ static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
 
 static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 {
-	return v9ses->flags & V9FS_PROTO_2010L;
+	return v9ses->flags & V9FS_PROTO_2000L;
 }
-- 
cgit v1.2.2


From fc0f296126433e61600539325975b6c30681c07e Mon Sep 17 00:00:00 2001
From: jvrao <jvrao@linux.vnet.ibm.com>
Date: Mon, 8 Mar 2010 22:07:02 +0000
Subject: 9p: Fixes a simple bug enabling writes beyond 2GB.

Fixes a simple bug so that large files beyond 2GB can be created.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 36122683fae8..019f1cd8750b 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -215,7 +215,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_fid *fid;
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	int origin = *offset;
+	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
-- 
cgit v1.2.2


From f78233dd44a110c574fe760ad6f9c1e8741a0d00 Mon Sep 17 00:00:00 2001
From: Sachin Prabhu <sprabhu@redhat.com>
Date: Sat, 13 Mar 2010 09:03:55 -0600
Subject: 9p: Skip check for mandatory locks when unlocking

While investigating a bug, I came across a possible bug in v9fs. The
problem is similar to the one reported for NFS by ASANO Masahiro in
http://lkml.org/lkml/2005/12/21/334.

v9fs_file_lock() will skip locks on file which has mode set to 02666.
This is a problem in cases where the mode of the file is changed after
a process has obtained a lock on the file. Such a lock will be skipped
during unlock and the machine will end up with a BUG in
locks_remove_flock().

v9fs_file_lock() should skip the check for mandatory locks when
unlocking a file.

Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 019f1cd8750b..df52d488d2a6 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -114,7 +114,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
-- 
cgit v1.2.2


From 6c477d44a7dad43a2783b4dea7f4ca3882d77126 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 14 Mar 2010 02:01:51 +0900
Subject: nilfs2: fix discrepancy in use of static specifier

Two segbuf functions, nilfs_segbuf_write and nilfs_segbuf_wait, are
declared with the static storage class specifier, but their
implementations are not.

This fixes the discrepancy.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segbuf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index ab56fe44e377..68b08f21c648 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -470,8 +470,8 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf,
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
-		       struct the_nilfs *nilfs)
+static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+			      struct the_nilfs *nilfs)
 {
 	struct nilfs_write_info wi;
 	struct buffer_head *bh;
@@ -514,7 +514,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
  *
  * %-EIO - I/O error
  */
-int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
+static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
 {
 	int err = 0;
 
-- 
cgit v1.2.2


From 9ccf56c13831c5fe0edecd8c1184c9a6fe805d23 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 14 Mar 2010 03:01:03 +0900
Subject: nilfs2: fix function name typos in docbook comments

Fixes the following typos in docbook comments:

 nilfs_detroy_transaction_cache -> nilfs_destroy_transaction_cache
 nilfs_secgtor_start_timer -> nilfs_segctor_start_timer

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index ada2f1b947a3..3265bfc57e42 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -141,7 +141,7 @@ int nilfs_init_transaction_cache(void)
 }
 
 /**
- * nilfs_detroy_transaction_cache - destroy the cache for transaction info
+ * nilfs_destroy_transaction_cache - destroy the cache for transaction info
  *
  * nilfs_destroy_transaction_cache() frees the slab cache for the struct
  * nilfs_transaction_info.
@@ -2214,7 +2214,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 }
 
 /**
- * nilfs_secgtor_start_timer - set timer of background write
+ * nilfs_segctor_start_timer - set timer of background write
  * @sci: nilfs_sc_info
  *
  * If the timer has already been set, it ignores the new request.
-- 
cgit v1.2.2


From 1621562b6ac55aa69239895d90276577547a5f62 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 14 Mar 2010 03:17:45 +0900
Subject: nilfs2: fix typo "cout" -> "count" in error message

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 0092840492ee..85c89dfc71f0 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -396,7 +396,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
 		/* next page is past the blocks we've got */
 		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
 			nilfs_error(dir->i_sb, __func__,
-			       "dir %lu size %lld exceeds block cout %llu",
+			       "dir %lu size %lld exceeds block count %llu",
 			       dir->i_ino, dir->i_size,
 			       (unsigned long long)dir->i_blocks);
 			goto out;
-- 
cgit v1.2.2


From 7a65004bbaa6a21a0438aac9c64814e46084dc3f Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 14 Mar 2010 03:32:40 +0900
Subject: nilfs2: fix various typos in comments

This fixes various typos I found in comments of nilfs2.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/alloc.h   | 2 +-
 fs/nilfs2/dat.c     | 2 +-
 fs/nilfs2/gcinode.c | 4 ++--
 fs/nilfs2/page.c    | 4 ++--
 fs/nilfs2/segbuf.c  | 2 +-
 fs/nilfs2/segment.c | 4 ++--
 fs/nilfs2/segment.h | 2 +-
 fs/nilfs2/sufile.c  | 2 +-
 fs/nilfs2/super.c   | 2 +-
 9 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index f4543ac4f560..5cccf874d692 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -42,7 +42,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
 				   const struct buffer_head *, void *);
 
 /**
- * nilfs_palloc_req - persistent alloctor request and reply
+ * nilfs_palloc_req - persistent allocator request and reply
  * @pr_entry_nr: entry number (vblocknr or inode number)
  * @pr_desc_bh: buffer head of the buffer containing block group descriptors
  * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9d1e5de91afb..013146755683 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -288,7 +288,7 @@ int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
  * @vblocknrs and @nitems.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
- * nagative error codes is returned.
+ * negative error codes is returned.
  *
  * %-EIO - I/O error.
  *
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e16a6664dfa2..8880a9e281e7 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -28,10 +28,10 @@
  * gcinodes), and this file provides lookup function of the dummy
  * inodes and their buffer read function.
  *
- * Since NILFS2 keeps up multiple checkpoints/snapshots accross GC, it
+ * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
  * has to treat blocks that belong to a same file but have different
  * checkpoint numbers.  To avoid interference among generations, dummy
- * inodes are managed separatly from actual inodes, and their lookup
+ * inodes are managed separately from actual inodes, and their lookup
  * function (nilfs_gc_iget) is designed to be specified with a
  * checkpoint number argument as well as an inode number.
  *
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index a2692bbc7b50..fc246dba112a 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -292,7 +292,7 @@ void nilfs_free_private_page(struct page *page)
  * @src: source page
  * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
  *
- * This fuction is for both data pages and btnode pages.  The dirty flag
+ * This function is for both data pages and btnode pages.  The dirty flag
  * should be treated by caller.  The page must not be under i/o.
  * Both src and dst page must be locked
  */
@@ -388,7 +388,7 @@ repeat:
 }
 
 /**
- * nilfs_copy_back_pages -- copy back pages to orignal cache from shadow cache
+ * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
  * @dmap: destination page cache
  * @smap: source page cache
  *
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 68b08f21c648..e3f67c6ce1a7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -174,7 +174,7 @@ int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
 }
 
 /*
- * Setup segument summary
+ * Setup segment summary
  */
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
 {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3265bfc57e42..69576a95e13f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -201,7 +201,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
  * This function allocates a nilfs_transaction_info struct to keep context
  * information on it.  It is initialized and hooked onto the current task in
  * the outermost call.  If a pre-allocated struct is given to @ti, it is used
- * instead; othewise a new struct is assigned from a slab.
+ * instead; otherwise a new struct is assigned from a slab.
  *
  * When @vacancy_check flag is set, this function will check the amount of
  * free space, and will wait for the GC to reclaim disk space if low capacity.
@@ -2854,7 +2854,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
  * @sbi: nilfs_sb_info
  *
  * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
- * initilizes it, and starts the segment constructor.
+ * initializes it, and starts the segment constructor.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
  * negative error code is returned.
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 3155e0c7f415..2a794569dd14 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -30,7 +30,7 @@
 #include "sb.h"
 
 /**
- * struct nilfs_recovery_info - Recovery infomation
+ * struct nilfs_recovery_info - Recovery information
  * @ri_need_recovery: Recovery status
  * @ri_super_root: Block number of the last super root
  * @ri_ri_cno: Number of the last checkpoint
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index b6c36d0cc331..3c6cc6005c2e 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  *
  * Written by Koji Sato <koji@osrg.net>.
- * Rivised by Ryusuke Konishi <ryusuke@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
  */
 
 #include <linux/kernel.h>
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 92579cc4c935..896b463cc3ce 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -436,7 +436,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	/*
 	 * Compute the overhead
 	 *
-	 * When distributing meta data blocks outside semgent structure,
+	 * When distributing meta data blocks outside segment structure,
 	 * We must count them as the overhead.
 	 */
 	overhead = 0;
-- 
cgit v1.2.2


From 55480a06e9ee8d05d0e580bf46611df489653c76 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 14 Mar 2010 03:55:56 +0900
Subject: nilfs2: remove spaces before tabs

This kills the following checkpatch warnings:

 WARNING: please, no space before tabs
 #74: FILE: segment.h:74:
 +^Iunsigned ^I^Iflags;$

 WARNING: please, no space before tabs
 #35: FILE: segbuf.c:35:
 +^Iint ^I^I^Istart, end; /* The region to be submitted */$

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segbuf.c  | 2 +-
 fs/nilfs2/segment.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index e3f67c6ce1a7..636eaafd6ea2 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -32,7 +32,7 @@
 struct nilfs_write_info {
 	struct the_nilfs       *nilfs;
 	struct bio	       *bio;
-	int 			start, end; /* The region to be submitted */
+	int			start, end; /* The region to be submitted */
 	int			rest_blocks;
 	int			max_pages;
 	int			nr_vecs;
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 2a794569dd14..82dfd6a686b9 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -71,7 +71,7 @@ struct nilfs_recovery_info {
  */
 struct nilfs_cstage {
 	int			scnt;
-	unsigned 		flags;
+	unsigned		flags;
 	struct nilfs_inode_info *dirty_file_ptr;
 	struct nilfs_inode_info *gc_inode_ptr;
 };
-- 
cgit v1.2.2


From c91cea11dfec65968ff9d1b4239c1eab63bf72fd Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 14 Mar 2010 04:01:27 +0900
Subject: nilfs2: remove whitespaces before quoted newlines

This kills the following checkpatch warnings:

 WARNING: unnecessary whitespace before a quoted newline
 #869: FILE: super.c:869:
 +     	           "remount to a different snapshot. \n",

 WARNING: unnecessary whitespace before a quoted newline
 #389: FILE: the_nilfs.c:389:
 +     	    printk(KERN_ERR "NILFS: too short segment. \n");

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/super.c     | 2 +-
 fs/nilfs2/the_nilfs.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 896b463cc3ce..0cdbc5e7655a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -866,7 +866,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	if ((*flags & MS_RDONLY) &&
 	    sbi->s_snapshot_cno != old_opts.snapshot_cno) {
 		printk(KERN_WARNING "NILFS (device %s): couldn't "
-		       "remount to a different snapshot. \n",
+		       "remount to a different snapshot.\n",
 		       sb->s_id);
 		err = -EINVAL;
 		goto restore_opts;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 92733d5651d2..33871f7e4f01 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -386,7 +386,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
 
 	nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
 	if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
-		printk(KERN_ERR "NILFS: too short segment. \n");
+		printk(KERN_ERR "NILFS: too short segment.\n");
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.2


From d330a5befb88875a9b3d2db62f9b74dadf660b13 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 14 Mar 2010 18:17:54 -0400
Subject: ext4: Fix estimate of # of blocks needed to write indirect-mapped
 files

http://bugzilla.kernel.org/show_bug.cgi?id=15420

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986120f30066..11119e07233b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1035,7 +1035,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 					      sector_t lblock)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+	sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
 	int blk_bits;
 
 	if (lblock < EXT4_NDIR_BLOCKS)
@@ -1050,7 +1050,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 	}
 	ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
 	ei->i_da_metadata_calc_len = 1;
-	blk_bits = roundup_pow_of_two(lblock + 1);
+	blk_bits = order_base_2(lblock);
 	return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 
-- 
cgit v1.2.2


From 98d377a0894e6bcca44eafd4d2eee74e8af4db83 Mon Sep 17 00:00:00 2001
From: TARUISI Hiroaki <taruishi.hiroak@jp.fujitsu.com>
Date: Wed, 18 Nov 2009 05:42:14 +0000
Subject: Btrfs: add a function to lookup a directory path by following
 backrefs

This will be used by the inode lookup ioctl.

Signed-off-by: TARUISI Hiroaki <taruishi.hiroak@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..ac2a28f4fa1a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,6 +48,7 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
+#include "ctree.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -743,6 +744,97 @@ out:
 	return ret;
 }
 
+/*
+  Search INODE_REFs to identify path name of 'dirid' directory
+  in a 'tree_id' tree. and sets path name to 'name'.
+*/
+static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
+				u64 tree_id, u64 dirid, char *name)
+{
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	char *name_stack, *ptr;
+	int ret = -1;
+	int slot;
+	int len;
+	int total_len = 0;
+	struct btrfs_inode_ref *iref;
+	struct extent_buffer *l;
+	struct btrfs_path *path;
+
+	if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
+		name[0]='\0';
+		return 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	name_stack = kzalloc(BTRFS_PATH_NAME_MAX+1, GFP_NOFS);
+	if (!name_stack) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	ptr = &name_stack[BTRFS_PATH_NAME_MAX];
+
+	key.objectid = tree_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(info, &key);
+	if (IS_ERR(root)) {
+		printk(KERN_ERR "could not find root %llu\n", tree_id);
+		return -ENOENT;
+	}
+
+	key.objectid = dirid;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = 0;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (ret > 0 && (key.objectid != dirid ||
+					key.type != BTRFS_INODE_REF_KEY))
+			goto out;
+
+		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
+		len = btrfs_inode_ref_name_len(l, iref);
+		ptr -= len + 1;
+		total_len += len + 1;
+		if (ptr < name_stack)
+			goto out;
+
+		*(ptr + len) = '/';
+		read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
+
+		if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
+			break;
+
+		btrfs_release_path(root, path);
+		key.objectid = key.offset;
+		key.offset = 0;
+		dirid = key.objectid;
+
+	}
+	if (ptr < name_stack)
+		goto out;
+	strncpy(name, ptr, total_len);
+	name[total_len]='\0';
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	kfree(name_stack);
+	return ret;
+}
+
 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 					     void __user *arg)
 {
-- 
cgit v1.2.2


From ac8e9819d71f907a0532b01b22c26b56bbbcbd21 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 28 Feb 2010 15:39:26 -0500
Subject: Btrfs: add search and inode lookup ioctls

The search ioctl is a generic tool for doing btree searches from
userland applications.  The first user of the search ioctl is a
subvolume listing feature, but we'll also use it to find new
files in a subvolume.

The search ioctl allows you to specify min and max keys to search for,
along with min and max transid.  It returns the items along with a
header that includes the item key.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 249 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/ioctl.h |  66 +++++++++++++++
 2 files changed, 299 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ac2a28f4fa1a..c6044733198d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -744,16 +744,206 @@ out:
 	return ret;
 }
 
+static noinline int key_in_sk(struct btrfs_key *key,
+			      struct btrfs_ioctl_search_key *sk)
+{
+	if (key->objectid < sk->min_objectid)
+		return 0;
+	if (key->offset < sk->min_offset)
+		return 0;
+	if (key->type < sk->min_type)
+		return 0;
+	if (key->objectid > sk->max_objectid)
+		return 0;
+	if (key->type > sk->max_type)
+		return 0;
+	if (key->offset > sk->max_offset)
+		return 0;
+	return 1;
+}
+
+static noinline int copy_to_sk(struct btrfs_root *root,
+			       struct btrfs_path *path,
+			       struct btrfs_key *key,
+			       struct btrfs_ioctl_search_key *sk,
+			       char *buf,
+			       unsigned long *sk_offset,
+			       int *num_found)
+{
+	u64 found_transid;
+	struct extent_buffer *leaf;
+	struct btrfs_ioctl_search_header sh;
+	unsigned long item_off;
+	unsigned long item_len;
+	int nritems;
+	int i;
+	int slot;
+	int found = 0;
+	int ret = 0;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	nritems = btrfs_header_nritems(leaf);
+
+	if (btrfs_header_generation(leaf) > sk->max_transid) {
+		i = nritems;
+		goto advance_key;
+	}
+	found_transid = btrfs_header_generation(leaf);
+
+	for (i = slot; i < nritems; i++) {
+		item_off = btrfs_item_ptr_offset(leaf, i);
+		item_len = btrfs_item_size_nr(leaf, i);
+
+		if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+			item_len = 0;
+
+		if (sizeof(sh) + item_len + *sk_offset >
+		    BTRFS_SEARCH_ARGS_BUFSIZE) {
+			ret = 1;
+			goto overflow;
+		}
+
+		btrfs_item_key_to_cpu(leaf, key, i);
+		if (!key_in_sk(key, sk))
+			continue;
+
+		sh.objectid = key->objectid;
+		sh.offset = key->offset;
+		sh.type = key->type;
+		sh.len = item_len;
+		sh.transid = found_transid;
+
+		/* copy search result header */
+		memcpy(buf + *sk_offset, &sh, sizeof(sh));
+		*sk_offset += sizeof(sh);
+
+		if (item_len) {
+			char *p = buf + *sk_offset;
+			/* copy the item */
+			read_extent_buffer(leaf, p,
+					   item_off, item_len);
+			*sk_offset += item_len;
+			found++;
+		}
+
+		if (*num_found >= sk->nr_items)
+			break;
+	}
+advance_key:
+	if (key->offset < (u64)-1)
+		key->offset++;
+	else if (key->type < (u64)-1)
+		key->type++;
+	else if (key->objectid < (u64)-1)
+		key->objectid++;
+	ret = 0;
+overflow:
+	*num_found += found;
+	return ret;
+}
+
+static noinline int search_ioctl(struct inode *inode,
+				 struct btrfs_ioctl_search_args *args)
+{
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct btrfs_key max_key;
+	struct btrfs_path *path;
+	struct btrfs_ioctl_search_key *sk = &args->key;
+	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
+	int ret;
+	int num_found = 0;
+	unsigned long sk_offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	if (sk->tree_id == 0) {
+		/* search the root of the inode that was passed */
+		root = BTRFS_I(inode)->root;
+	} else {
+		key.objectid = sk->tree_id;
+		key.type = BTRFS_ROOT_ITEM_KEY;
+		key.offset = (u64)-1;
+		root = btrfs_read_fs_root_no_name(info, &key);
+		if (IS_ERR(root)) {
+			printk(KERN_ERR "could not find root %llu\n",
+			       sk->tree_id);
+			btrfs_free_path(path);
+			return -ENOENT;
+		}
+	}
+
+	key.objectid = sk->min_objectid;
+	key.type = sk->min_type;
+	key.offset = sk->min_offset;
+
+	max_key.objectid = sk->max_objectid;
+	max_key.type = sk->max_type;
+	max_key.offset = sk->max_offset;
+
+	path->keep_locks = 1;
+
+	while(1) {
+		ret = btrfs_search_forward(root, &key, &max_key, path, 0,
+					   sk->min_transid);
+		if (ret != 0) {
+			if (ret > 0)
+				ret = 0;
+			goto err;
+		}
+		ret = copy_to_sk(root, path, &key, sk, args->buf,
+				 &sk_offset, &num_found);
+		btrfs_release_path(root, path);
+		if (ret || num_found >= sk->nr_items)
+			break;
+
+	}
+	ret = 0;
+err:
+	sk->nr_items = num_found;
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search(struct file *file,
+					   void __user *argp)
+{
+	 struct btrfs_ioctl_search_args *args;
+	 struct inode *inode;
+	 int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	if (copy_from_user(args, argp, sizeof(*args))) {
+		kfree(args);
+		return -EFAULT;
+	}
+	inode = fdentry(file)->d_inode;
+	ret = search_ioctl(inode, args);
+	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+		ret = -EFAULT;
+	kfree(args);
+	return ret;
+}
+
 /*
-  Search INODE_REFs to identify path name of 'dirid' directory
-  in a 'tree_id' tree. and sets path name to 'name'.
-*/
+ * Search INODE_REFs to identify path name of 'dirid' directory
+ * in a 'tree_id' tree. and sets path name to 'name'.
+ */
 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 				u64 tree_id, u64 dirid, char *name)
 {
 	struct btrfs_root *root;
 	struct btrfs_key key;
-	char *name_stack, *ptr;
+	char *ptr;
 	int ret = -1;
 	int slot;
 	int len;
@@ -771,13 +961,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	if (!path)
 		return -ENOMEM;
 
-	name_stack = kzalloc(BTRFS_PATH_NAME_MAX+1, GFP_NOFS);
-	if (!name_stack) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
-
-	ptr = &name_stack[BTRFS_PATH_NAME_MAX];
+	ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
 
 	key.objectid = tree_id;
 	key.type = BTRFS_ROOT_ITEM_KEY;
@@ -802,14 +986,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 		btrfs_item_key_to_cpu(l, &key, slot);
 
 		if (ret > 0 && (key.objectid != dirid ||
-					key.type != BTRFS_INODE_REF_KEY))
+				key.type != BTRFS_INODE_REF_KEY)) {
+			ret = -ENOENT;
 			goto out;
+		}
 
 		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
 		len = btrfs_inode_ref_name_len(l, iref);
 		ptr -= len + 1;
 		total_len += len + 1;
-		if (ptr < name_stack)
+		if (ptr < name)
 			goto out;
 
 		*(ptr + len) = '/';
@@ -824,14 +1010,41 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 		dirid = key.objectid;
 
 	}
-	if (ptr < name_stack)
+	if (ptr < name)
 		goto out;
-	strncpy(name, ptr, total_len);
+	memcpy(name, ptr, total_len);
 	name[total_len]='\0';
 	ret = 0;
 out:
 	btrfs_free_path(path);
-	kfree(name_stack);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+					   void __user *argp)
+{
+	 struct btrfs_ioctl_ino_lookup_args *args;
+	 struct inode *inode;
+	 int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (copy_from_user(args, argp, sizeof(*args))) {
+		kfree(args);
+		return -EFAULT;
+	}
+	inode = fdentry(file)->d_inode;
+
+	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+					args->treeid, args->objectid,
+					args->name);
+
+	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+		ret = -EFAULT;
+
+	kfree(args);
 	return ret;
 }
 
@@ -1430,6 +1643,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_trans_start(file);
 	case BTRFS_IOC_TRANS_END:
 		return btrfs_ioctl_trans_end(file);
+	case BTRFS_IOC_TREE_SEARCH:
+		return btrfs_ioctl_tree_search(file, argp);
+	case BTRFS_IOC_INO_LOOKUP:
+		return btrfs_ioctl_ino_lookup(file, argp);
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..79c07b104f91 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,6 +30,68 @@ struct btrfs_ioctl_vol_args {
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
+#define BTRFS_INO_LOOKUP_PATH_MAX 4080
+struct btrfs_ioctl_ino_lookup_args {
+	__u64 treeid;
+	__u64 objectid;
+	char name[BTRFS_INO_LOOKUP_PATH_MAX];
+};
+
+struct btrfs_ioctl_search_key {
+	/* which root are we searching.  0 is the tree of tree roots */
+	__u64 tree_id;
+
+	/* keys returned will be >= min and <= max */
+	__u64 min_objectid;
+	__u64 max_objectid;
+
+	/* keys returned will be >= min and <= max */
+	__u64 min_offset;
+	__u64 max_offset;
+
+	/* max and min transids to search for */
+	__u64 min_transid;
+	__u64 max_transid;
+
+	/* keys returned will be >= min and <= max */
+	__u32 min_type;
+	__u32 max_type;
+
+	/*
+	 * how many items did userland ask for, and how many are we
+	 * returning
+	 */
+	__u32 nr_items;
+
+	/* align to 64 bits */
+	__u32 unused;
+
+	/* some extra for later */
+	__u64 unused1;
+	__u64 unused2;
+	__u64 unused3;
+	__u64 unused4;
+};
+
+struct btrfs_ioctl_search_header {
+	__u64 transid;
+	__u64 objectid;
+	__u64 offset;
+	__u32 type;
+	__u32 len;
+};
+
+#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
+/*
+ * the buf is an array of search headers where
+ * each header is followed by the actual item
+ * the type field is expanded to 32 bits for alignment
+ */
+struct btrfs_ioctl_search_args {
+	struct btrfs_ioctl_search_key key;
+	char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
+};
+
 struct btrfs_ioctl_clone_range_args {
   __s64 src_fd;
   __u64 src_offset, src_length;
@@ -67,4 +129,8 @@ struct btrfs_ioctl_clone_range_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
 				struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
+				   struct btrfs_ioctl_search_args)
+#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
+				   struct btrfs_ioctl_ino_lookup_args)
 #endif
-- 
cgit v1.2.2


From 12534832cb7b0abc7369298246e8b7af03b863ca Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 17 Dec 2009 21:32:27 +0000
Subject: Btrfs: make set/get functions for the super compat_ro flags use
 compat_ro

Our set/get functions for compat_ro_flags actually look at compat_flags.  This
will mess any attempt to use compat flags up.  The fix is obvious.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2aa8ec6a0981..abbce4d90c1b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1842,7 +1842,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
-			 compat_flags, 64);
+			 compat_ro_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
-- 
cgit v1.2.2


From 73f73415caddbc01d9f10c03e0a677d5b3d11569 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 4 Dec 2009 17:38:27 +0000
Subject: Btrfs: change how we mount subvolumes

This work is in preperation for being able to set a different root as the
default mounting root.

There is currently a problem with how we mount subvolumes.  We cannot currently
mount a subvolume of a subvolume, you can only mount subvolumes/snapshots of the
default subvolume.  So say you take a snapshot of the default subvolume and call
it snap1, and then take a snapshot of snap1 and call it snap2, so now you have

/
/snap1
/snap1/snap2

as your available volumes.  Currently you can only mount / and /snap1,
you cannot mount /snap1/snap2.  To fix this problem instead of passing
subvolid=<name> you must pass in subvolid=<treeid>, where <treeid> is
the tree id that gets spit out via the subvolume listing you get from
the subvolume listing patches (btrfs filesystem list).  This allows us
to mount /, /snap1 and /snap1/snap2 as the root volume.

In addition to the above, we also now read the default dir item in the
tree root to get the root key that it points to.  For now this just
points at what has always been the default subvolme, but later on I plan
to change it to point at whatever root you want to be the new default
root, so you can just set the default mount and not have to mount with
-o subvolid=<treeid>.  I tested this out with the above scenario and it
worked perfectly.  Thanks,

mount -o subvol operates inside the selected subvolid.  For example:

mount -o subvol=snap1,subvolid=256 /dev/xxx /mnt

/mnt will have the snap1 directory for the subvolume with id
256.

mount -o subvol=snap /dev/xxx /mnt

/mnt will be the snap directory of whatever the default subvolume
is.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      |   2 +-
 fs/btrfs/export.c     |   4 +-
 fs/btrfs/inode.c      |  10 +--
 fs/btrfs/relocation.c |   2 +-
 fs/btrfs/super.c      | 172 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/tree-log.c   |   2 +-
 6 files changed, 158 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index abbce4d90c1b..07d956977a07 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2335,7 +2335,7 @@ int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *was_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &btrfs_dentry_operations;
 	return dentry;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4deb280f8969..7d10d1ccb0fe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2153,7 +2153,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 		if (IS_ERR(inode))
 			break;
 
@@ -3687,7 +3687,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
  * Returns in *is_new if the inode was read from disk
  */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root)
+			 struct btrfs_root *root, int *new)
 {
 	struct inode *inode;
 
@@ -3702,6 +3702,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 		inode_tree_add(inode);
 		unlock_new_inode(inode);
+		if (new)
+			*new = 1;
 	}
 
 	return inode;
@@ -3754,7 +3756,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		return NULL;
 
 	if (location.type == BTRFS_INODE_ITEM_KEY) {
-		inode = btrfs_iget(dir->i_sb, &location, root);
+		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
 		return inode;
 	}
 
@@ -3769,7 +3771,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 		else
 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
 	} else {
-		inode = btrfs_iget(dir->i_sb, &location, sub_root);
+		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
 	}
 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0109e5606bad..d52759daa53f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3487,7 +3487,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	key.objectid = objectid;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(root->fs_info->sb, &key, root);
+	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
 	BTRFS_I(inode)->index_cnt = group->key.objectid;
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f8b4521de907..f878337cee6f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -63,10 +63,10 @@ static void btrfs_put_super(struct super_block *sb)
 }
 
 enum {
-	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
-	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
-	Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
+	Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
+	Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start,
+	Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool,
+	Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
 	Opt_flushoncommit,
 	Opt_discard, Opt_err,
 };
@@ -74,6 +74,7 @@ enum {
 static match_table_t tokens = {
 	{Opt_degraded, "degraded"},
 	{Opt_subvol, "subvol=%s"},
+	{Opt_subvolid, "subvolid=%d"},
 	{Opt_device, "device=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
@@ -157,6 +158,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, DEGRADED);
 			break;
 		case Opt_subvol:
+		case Opt_subvolid:
 		case Opt_device:
 			/*
 			 * These are parsed by btrfs_parse_early_options
@@ -292,12 +294,13 @@ out:
  * only when we need to allocate a new super block.
  */
 static int btrfs_parse_early_options(const char *options, fmode_t flags,
-		void *holder, char **subvol_name,
+		void *holder, char **subvol_name, u64 *subvol_objectid,
 		struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *opts, *p;
 	int error = 0;
+	int intarg;
 
 	if (!options)
 		goto out;
@@ -320,6 +323,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		case Opt_subvol:
 			*subvol_name = match_strdup(&args[0]);
 			break;
+		case Opt_subvolid:
+			intarg = 0;
+			match_int(&args[0], &intarg);
+			if (intarg)
+				*subvol_objectid = intarg;
+			break;
 		case Opt_device:
 			error = btrfs_scan_one_device(match_strdup(&args[0]),
 					flags, holder, fs_devices);
@@ -347,6 +356,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	return error;
 }
 
+static struct dentry *get_default_root(struct super_block *sb,
+				       u64 subvol_objectid)
+{
+	struct btrfs_root *root = sb->s_fs_info;
+	struct btrfs_root *new_root;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	struct btrfs_key location;
+	struct inode *inode;
+	struct dentry *dentry;
+	u64 dir_id;
+	int new = 0;
+
+	/*
+	 * We have a specific subvol we want to mount, just setup location and
+	 * go look up the root.
+	 */
+	if (subvol_objectid) {
+		location.objectid = subvol_objectid;
+		location.type = BTRFS_ROOT_ITEM_KEY;
+		location.offset = (u64)-1;
+		goto find_root;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
+	path->leave_spinning = 1;
+
+	/*
+	 * Find the "default" dir item which points to the root item that we
+	 * will mount by default if we haven't been given a specific subvolume
+	 * to mount.
+	 */
+	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
+	if (!di) {
+		/*
+		 * Ok the default dir item isn't there.  This is weird since
+		 * it's always been there, but don't freak out, just try and
+		 * mount to root most subvolume.
+		 */
+		btrfs_free_path(path);
+		dir_id = BTRFS_FIRST_FREE_OBJECTID;
+		new_root = root->fs_info->fs_root;
+		goto setup_root;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+	btrfs_free_path(path);
+
+find_root:
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	if (IS_ERR(new_root))
+		return ERR_PTR(PTR_ERR(new_root));
+
+	if (btrfs_root_refs(&new_root->root_item) == 0)
+		return ERR_PTR(-ENOENT);
+
+	dir_id = btrfs_root_dirid(&new_root->root_item);
+setup_root:
+	location.objectid = dir_id;
+	location.type = BTRFS_INODE_ITEM_KEY;
+	location.offset = 0;
+
+	inode = btrfs_iget(sb, &location, new_root, &new);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * If we're just mounting the root most subvol put the inode and return
+	 * a reference to the dentry.  We will have already gotten a reference
+	 * to the inode in btrfs_fill_super so we're good to go.
+	 */
+	if (!new && sb->s_root->d_inode == inode) {
+		iput(inode);
+		return dget(sb->s_root);
+	}
+
+	if (new) {
+		const struct qstr name = { .name = "/", .len = 1 };
+
+		/*
+		 * New inode, we need to make the dentry a sibling of s_root so
+		 * everything gets cleaned up properly on unmount.
+		 */
+		dentry = d_alloc(sb->s_root, &name);
+		if (!dentry) {
+			iput(inode);
+			return ERR_PTR(-ENOMEM);
+		}
+		d_splice_alias(inode, dentry);
+	} else {
+		/*
+		 * We found the inode in cache, just find a dentry for it and
+		 * put the reference to the inode we just got.
+		 */
+		dentry = d_find_alias(inode);
+		iput(inode);
+	}
+
+	return dentry;
+}
+
 static int btrfs_fill_super(struct super_block *sb,
 			    struct btrfs_fs_devices *fs_devices,
 			    void *data, int silent)
@@ -380,7 +493,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
+	inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail_close;
@@ -392,12 +505,6 @@ static int btrfs_fill_super(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail_close;
 	}
-#if 0
-	/* this does the super kobj at the same time */
-	err = btrfs_sysfs_add_super(tree_root->fs_info);
-	if (err)
-		goto fail_close;
-#endif
 
 	sb->s_root = root_dentry;
 
@@ -489,19 +596,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
 static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	char *subvol_name = NULL;
 	struct block_device *bdev = NULL;
 	struct super_block *s;
 	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
 	fmode_t mode = FMODE_READ;
+	char *subvol_name = NULL;
+	u64 subvol_objectid = 0;
 	int error = 0;
+	int found = 0;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
 	error = btrfs_parse_early_options(data, mode, fs_type,
-					  &subvol_name, &fs_devices);
+					  &subvol_name, &subvol_objectid,
+					  &fs_devices);
 	if (error)
 		return error;
 
@@ -530,6 +640,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			goto error_close_devices;
 		}
 
+		found = 1;
 		btrfs_close_devices(fs_devices);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -547,25 +658,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	if (!strcmp(subvol_name, "."))
-		root = dget(s->s_root);
-	else {
-		mutex_lock(&s->s_root->d_inode->i_mutex);
-		root = lookup_one_len(subvol_name, s->s_root,
+	root = get_default_root(s, subvol_objectid);
+	if (IS_ERR(root)) {
+		error = PTR_ERR(root);
+		deactivate_locked_super(s);
+		goto error;
+	}
+	/* if they gave us a subvolume name bind mount into that */
+	if (strcmp(subvol_name, ".")) {
+		struct dentry *new_root;
+		mutex_lock(&root->d_inode->i_mutex);
+		new_root = lookup_one_len(subvol_name, root,
 				      strlen(subvol_name));
-		mutex_unlock(&s->s_root->d_inode->i_mutex);
+		mutex_unlock(&root->d_inode->i_mutex);
 
-		if (IS_ERR(root)) {
+		if (IS_ERR(new_root)) {
 			deactivate_locked_super(s);
-			error = PTR_ERR(root);
-			goto error_free_subvol_name;
+			error = PTR_ERR(new_root);
+			dput(root);
+			goto error_close_devices;
 		}
-		if (!root->d_inode) {
+		if (!new_root->d_inode) {
 			dput(root);
+			dput(new_root);
 			deactivate_locked_super(s);
 			error = -ENXIO;
-			goto error_free_subvol_name;
+			goto error_close_devices;
 		}
+		dput(root);
+		root = new_root;
 	}
 
 	mnt->mnt_sb = s;
@@ -580,6 +701,7 @@ error_close_devices:
 	btrfs_close_devices(fs_devices);
 error_free_subvol_name:
 	kfree(subvol_name);
+error:
 	return error;
 }
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..1255fcc8ade5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -445,7 +445,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
 	key.objectid = objectid;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(root->fs_info->sb, &key, root);
+	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		inode = NULL;
 	} else if (is_bad_inode(inode)) {
-- 
cgit v1.2.2


From 6ef5ed0d386be5c43ec66d6f2999919c0893558b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 11 Dec 2009 21:11:29 +0000
Subject: Btrfs: add ioctl and incompat flag to set the default mount subvol

This patch needs to go along with my previous patch.  This lets us set the
default dir item's location to whatever root we want to use as our default
mounting subvol.  With this we don't have to use mount -o subvol=<tree id>
anymore to mount a different subvol, we can just set the new one and it will
just magically work.  I've done some moderate testing with this, mostly just
switching the default mount around, mounting subvols and the default mount at
the same time and such, everything seems to work.  Thanks,

Older kernels would generally be able to still mount the filesystem with the
default subvolume set, but it would result in a different volume being mounted,
which could be an even more unpleasant suprise for users.  So if you set your
default subvolume, you can't go back to older kernels.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  4 ++-
 fs/btrfs/ioctl.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h |  2 ++
 3 files changed, 80 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 07d956977a07..1166b15e9bf6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -373,11 +373,13 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c6044733198d..7875a75315d0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1579,6 +1579,79 @@ out:
 	return ret;
 }
 
+static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_root *new_root;
+	struct btrfs_dir_item *di;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct btrfs_key location;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_super_block *disk_super;
+	u64 features;
+	u64 objectid = 0;
+	u64 dir_id;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(&objectid, argp, sizeof(objectid)))
+		return -EFAULT;
+
+	if (!objectid)
+		objectid = root->root_key.objectid;
+
+	location.objectid = objectid;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = (u64)-1;
+
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	if (IS_ERR(new_root))
+		return PTR_ERR(new_root);
+
+	if (btrfs_root_refs(&new_root->root_item) == 0)
+		return -ENOENT;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->leave_spinning = 1;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+	di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
+				   dir_id, "default", 7, 1);
+	if (!di) {
+		btrfs_free_path(path);
+		btrfs_end_transaction(trans, root);
+		printk(KERN_ERR "Umm, you don't have the default dir item, "
+		       "this isn't going to work\n");
+		return -ENOENT;
+	}
+
+	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
+	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_free_path(path);
+
+	disk_super = &root->fs_info->super_copy;
+	features = btrfs_super_incompat_flags(disk_super);
+	if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
+		features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
+		btrfs_set_super_incompat_flags(disk_super, features);
+	}
+	btrfs_end_transaction(trans, root);
+
+	return 0;
+}
+
 /*
  * there are many ways the trans_start and trans_end ioctls can lead
  * to deadlocks.  They should only be used by applications that
@@ -1625,6 +1698,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_snap_create(file, argp, 1);
 	case BTRFS_IOC_SNAP_DESTROY:
 		return btrfs_ioctl_snap_destroy(file, argp);
+	case BTRFS_IOC_DEFAULT_SUBVOL:
+		return btrfs_ioctl_default_subvol(file, argp);
 	case BTRFS_IOC_DEFRAG:
 		return btrfs_ioctl_defrag(file);
 	case BTRFS_IOC_RESIZE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 79c07b104f91..f1923e0260e3 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
  *
@@ -133,4 +134,5 @@ struct btrfs_ioctl_clone_range_args {
 				   struct btrfs_ioctl_search_args)
 #define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
 				   struct btrfs_ioctl_ino_lookup_args)
+#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
 #endif
-- 
cgit v1.2.2


From 4849f01d153be0f52b8191ee1be0ce492aa96811 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 14 Dec 2009 19:18:38 +0000
Subject: Btrfs: make subvolid=0 mount the original default root

Since theres not a good way to make sure the user sees the original default root
tree id, and not to mention it's 5 so is way different than any other volume,
just make subvol=0 mount the original default root.  This makes it a bit easier
for users to handle in the long run.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f878337cee6f..9771eb8694b6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -325,9 +325,15 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 			break;
 		case Opt_subvolid:
 			intarg = 0;
-			match_int(&args[0], &intarg);
-			if (intarg)
-				*subvol_objectid = intarg;
+			error = match_int(&args[0], &intarg);
+			if (!error) {
+				/* we want the original fs_tree */
+				if (!intarg)
+					*subvol_objectid =
+						BTRFS_FS_TREE_OBJECTID;
+				else
+					*subvol_objectid = intarg;
+			}
 			break;
 		case Opt_device:
 			error = btrfs_scan_one_device(match_strdup(&args[0]),
-- 
cgit v1.2.2


From 51684082b11c304829ea22193d4d96a5b1663b97 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 10 Mar 2010 15:33:32 -0500
Subject: Btrfs: run the backing dev more often in the submit_bio helper

The submit_bio helper thread can decide to loop back around to
service more bios.  This commit forces it to unplug first, which helps
reduce the latency seen by submitters.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..ace2e8d7bbcd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -325,16 +325,6 @@ loop_lock:
 		num_sync_run = 0;
 		blk_run_backing_dev(bdi, NULL);
 	}
-
-	cond_resched();
-	if (again)
-		goto loop;
-
-	spin_lock(&device->io_lock);
-	if (device->pending_bios.head || device->pending_sync_bios.head)
-		goto loop_lock;
-	spin_unlock(&device->io_lock);
-
 	/*
 	 * IO has already been through a long path to get here.  Checksumming,
 	 * async helper threads, perhaps compression.  We've done a pretty
@@ -346,6 +336,16 @@ loop_lock:
 	 * cared about found its way down here.
 	 */
 	blk_run_backing_dev(bdi, NULL);
+
+	cond_resched();
+	if (again)
+		goto loop;
+
+	spin_lock(&device->io_lock);
+	if (device->pending_bios.head || device->pending_sync_bios.head)
+		goto loop_lock;
+	spin_unlock(&device->io_lock);
+
 done:
 	return 0;
 }
-- 
cgit v1.2.2


From 940100a4a7b78b27e60a3e72340fb9b5397dcdb2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 10 Mar 2010 10:52:59 -0500
Subject: Btrfs: be more selective in the defrag ioctl

The btrfs defrag ioctl had some bugs around delalloc accounting, and it
wasn't properly skipping pages that were not in the mapping.

It wasn't properly clearing the page checked flag, which could make the
writeback code ignore the page forever while pinning it as dirty.

This commit fixes those problems and makes defrag a little smarter.  It
skips holes and it doesn't waste time defragging large extents.  If a
tiny extent comes before a very large extent, it will defrag both of
them to make sure the tiny extent ends up next to something big.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 140 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7875a75315d0..3a89cd77f307 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -475,6 +475,73 @@ out_unlock:
 	return error;
 }
 
+static int should_defrag_range(struct inode *inode, u64 start, u64 len,
+			       u64 *last_len, u64 *skip, u64 *defrag_end)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 1;
+
+	/*
+	 * make sure that once we start defragging and extent, we keep on
+	 * defragging it
+	 */
+	if (start < *defrag_end)
+		return 1;
+
+	*skip = 0;
+
+	/*
+	 * hopefully we have this extent in the tree already, try without
+	 * the full extent lock
+	 */
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		/* get the big lock and read metadata off disk */
+		lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+		unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+
+		if (!em)
+			return 0;
+	}
+
+	/* this will cover holes, and inline extents */
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE)
+		ret = 0;
+
+	/*
+	 * we hit a real extent, if it is big don't bother defragging it again
+	 */
+	if ((*last_len == 0 || *last_len >= 256 * 1024) &&
+	    em->len >= 256 * 1024)
+		ret = 0;
+
+	/*
+	 * last_len ends up being a counter of how many bytes we've defragged.
+	 * every time we choose not to defrag an extent, we reset *last_len
+	 * so that the next tiny extent will force a defrag.
+	 *
+	 * The end result of this is that tiny extents before a single big
+	 * extent will force at least part of that big extent to be defragged.
+	 */
+	if (ret) {
+		*last_len += len;
+		*defrag_end = extent_map_end(em);
+	} else {
+		*last_len = 0;
+		*skip = extent_map_end(em);
+		*defrag_end = 0;
+	}
+
+	free_extent_map(em);
+	return ret;
+}
+
 static int btrfs_defrag_file(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
@@ -487,37 +554,86 @@ static int btrfs_defrag_file(struct file *file)
 	unsigned long total_read = 0;
 	u64 page_start;
 	u64 page_end;
+	u64 last_len = 0;
+	u64 skip = 0;
+	u64 defrag_end = 0;
 	unsigned long i;
 	int ret;
 
-	ret = btrfs_check_data_free_space(root, inode, inode->i_size);
-	if (ret)
-		return -ENOSPC;
+	if (inode->i_size == 0)
+		return 0;
+
+	last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	i = 0;
+	while (i <= last_index) {
+		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+					PAGE_CACHE_SIZE, &last_len, &skip,
+					&defrag_end)) {
+			unsigned long next;
+			/*
+			 * the should_defrag function tells us how much to skip
+			 * bump our counter by the suggested amount
+			 */
+			next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+			i = max(i + 1, next);
+			continue;
+		}
 
-	mutex_lock(&inode->i_mutex);
-	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
-	for (i = 0; i <= last_index; i++) {
 		if (total_read % ra_pages == 0) {
 			btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
 				       min(last_index, i + ra_pages - 1));
 		}
 		total_read++;
+		mutex_lock(&inode->i_mutex);
+
+		ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+		if (ret) {
+			ret = -ENOSPC;
+			break;
+		}
+
+		ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       PAGE_CACHE_SIZE);
+			ret = -ENOSPC;
+			break;
+		}
 again:
+		if (inode->i_size == 0 ||
+		    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+			ret = 0;
+			goto err_reservations;
+		}
+
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page)
-			goto out_unlock;
+			goto err_reservations;
+
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
 			lock_page(page);
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
-				goto out_unlock;
+				goto err_reservations;
 			}
 		}
 
+		if (page->mapping != inode->i_mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
+
 		wait_on_page_writeback(page);
 
+		if (PageDirty(page)) {
+			btrfs_free_reserved_data_space(root, inode,
+						       PAGE_CACHE_SIZE);
+			goto loop_unlock;
+		}
+
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -538,18 +654,32 @@ again:
 		 * page if it is dirtied again later
 		 */
 		clear_page_dirty_for_io(page);
+		clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
+				  page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
+				  EXTENT_DO_ACCOUNTING, GFP_NOFS);
 
 		btrfs_set_extent_delalloc(inode, page_start, page_end);
+		ClearPageChecked(page);
 		set_page_dirty(page);
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+loop_unlock:
 		unlock_page(page);
 		page_cache_release(page);
+		mutex_unlock(&inode->i_mutex);
+
+		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+		i++;
 	}
 
-out_unlock:
-	mutex_unlock(&inode->i_mutex);
 	return 0;
+
+err_reservations:
+	mutex_unlock(&inode->i_mutex);
+	btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+	return ret;
 }
 
 static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
-- 
cgit v1.2.2


From 1e701a3292e25a6c4939cad9f24951dc6b6ad853 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Mar 2010 09:42:04 -0500
Subject: Btrfs: add new defrag-range ioctl.

The btrfs defrag ioctl was limited to doing the entire file.  This
commit adds a new interface that can defrag a specific range inside
the file.

It can also force compression on the file, allowing you to selectively
compress individual files after they were created, even when mount -o
compress isn't turned on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  5 +++
 fs/btrfs/ctree.h       |  1 -
 fs/btrfs/inode.c       | 11 +++++--
 fs/btrfs/ioctl.c       | 83 ++++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/ioctl.h       | 31 +++++++++++++++++++
 5 files changed, 117 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..7a4dee199832 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,11 @@ struct btrfs_inode {
 	unsigned ordered_data_close:1;
 	unsigned dummy_inode:1;
 
+	/*
+	 * always compress this one file
+	 */
+	unsigned force_compress:1;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1166b15e9bf6..3a36b1fb553a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1184,7 +1184,6 @@ struct btrfs_root {
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
 
-
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d10d1ccb0fe..3657925c2461 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -379,7 +379,8 @@ again:
 	 * change at any time if we discover bad compression ratios.
 	 */
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-	    btrfs_test_opt(root, COMPRESS)) {
+	    (btrfs_test_opt(root, COMPRESS) ||
+	     (BTRFS_I(inode)->force_compress))) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
@@ -483,8 +484,10 @@ again:
 		nr_pages_ret = 0;
 
 		/* flag the file so we don't compress in the future */
-		if (!btrfs_test_opt(root, FORCE_COMPRESS))
+		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+		    !(BTRFS_I(inode)->force_compress)) {
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		}
 	}
 	if (will_compress) {
 		*num_added += 1;
@@ -1211,7 +1214,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
-	else if (!btrfs_test_opt(root, COMPRESS))
+	else if (!btrfs_test_opt(root, COMPRESS) &&
+		 !(BTRFS_I(inode)->force_compress))
 		ret = cow_file_range(inode, locked_page, start, end,
 				      page_started, nr_written, 1);
 	else
@@ -3639,6 +3643,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->index_cnt = (u64)-1;
 	bi->last_unlink_trans = 0;
 	bi->ordered_data_close = 0;
+	bi->force_compress = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3a89cd77f307..d866b460c26e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -476,13 +476,18 @@ out_unlock:
 }
 
 static int should_defrag_range(struct inode *inode, u64 start, u64 len,
-			       u64 *last_len, u64 *skip, u64 *defrag_end)
+			       int thresh, u64 *last_len, u64 *skip,
+			       u64 *defrag_end)
 {
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 1;
 
+
+	if (thresh == 0)
+		thresh = 256 * 1024;
+
 	/*
 	 * make sure that once we start defragging and extent, we keep on
 	 * defragging it
@@ -517,8 +522,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	/*
 	 * we hit a real extent, if it is big don't bother defragging it again
 	 */
-	if ((*last_len == 0 || *last_len >= 256 * 1024) &&
-	    em->len >= 256 * 1024)
+	if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
 		ret = 0;
 
 	/*
@@ -542,7 +546,8 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	return ret;
 }
 
-static int btrfs_defrag_file(struct file *file)
+static int btrfs_defrag_file(struct file *file,
+			     struct btrfs_ioctl_defrag_range_args *range)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -563,11 +568,19 @@ static int btrfs_defrag_file(struct file *file)
 	if (inode->i_size == 0)
 		return 0;
 
-	last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
-	i = 0;
+	if (range->start + range->len > range->start) {
+		last_index = min_t(u64, inode->i_size - 1,
+			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+	} else {
+		last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	}
+
+	i = range->start >> PAGE_CACHE_SHIFT;
 	while (i <= last_index) {
 		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
-					PAGE_CACHE_SIZE, &last_len, &skip,
+					PAGE_CACHE_SIZE,
+					range->extent_thresh,
+					&last_len, &skip,
 					&defrag_end)) {
 			unsigned long next;
 			/*
@@ -585,6 +598,8 @@ static int btrfs_defrag_file(struct file *file)
 		}
 		total_read++;
 		mutex_lock(&inode->i_mutex);
+		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+			BTRFS_I(inode)->force_compress = 1;
 
 		ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
 		if (ret) {
@@ -673,6 +688,28 @@ loop_unlock:
 		i++;
 	}
 
+	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+		filemap_flush(inode->i_mapping);
+
+	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+		/* the filemap_flush will queue IO into the worker threads, but
+		 * we have to make sure the IO is actually started and that
+		 * ordered extents get created before we return
+		 */
+		atomic_inc(&root->fs_info->async_submit_draining);
+		while (atomic_read(&root->fs_info->nr_async_submits) ||
+		      atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
+		}
+		atomic_dec(&root->fs_info->async_submit_draining);
+
+		mutex_lock(&inode->i_mutex);
+		BTRFS_I(inode)->force_compress = 0;
+		mutex_unlock(&inode->i_mutex);
+	}
+
 	return 0;
 
 err_reservations:
@@ -1284,10 +1321,11 @@ out:
 	return err;
 }
 
-static int btrfs_ioctl_defrag(struct file *file)
+static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ioctl_defrag_range_args *range;
 	int ret;
 
 	ret = mnt_want_write(file->f_path.mnt);
@@ -1308,7 +1346,30 @@ static int btrfs_ioctl_defrag(struct file *file)
 			ret = -EINVAL;
 			goto out;
 		}
-		btrfs_defrag_file(file);
+
+		range = kzalloc(sizeof(*range), GFP_KERNEL);
+		if (!range) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		if (argp) {
+			if (copy_from_user(range, argp,
+					   sizeof(*range))) {
+				ret = -EFAULT;
+				kfree(range);
+			}
+			/* compression requires us to start the IO */
+			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+				range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
+				range->extent_thresh = (u32)-1;
+			}
+		} else {
+			/* the rest are all set to zero by kzalloc */
+			range->len = (u64)-1;
+		}
+		btrfs_defrag_file(file, range);
+		kfree(range);
 		break;
 	}
 out:
@@ -1831,7 +1892,9 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_DEFAULT_SUBVOL:
 		return btrfs_ioctl_default_subvol(file, argp);
 	case BTRFS_IOC_DEFRAG:
-		return btrfs_ioctl_defrag(file);
+		return btrfs_ioctl_defrag(file, NULL);
+	case BTRFS_IOC_DEFRAG_RANGE:
+		return btrfs_ioctl_defrag(file, argp);
 	case BTRFS_IOC_RESIZE:
 		return btrfs_ioctl_resize(root, argp);
 	case BTRFS_IOC_ADD_DEV:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index f1923e0260e3..2d64a65842f3 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -99,6 +99,35 @@ struct btrfs_ioctl_clone_range_args {
   __u64 dest_offset;
 };
 
+/* flags for the defrag range ioctl */
+#define BTRFS_DEFRAG_RANGE_COMPRESS 1
+#define BTRFS_DEFRAG_RANGE_START_IO 2
+
+struct btrfs_ioctl_defrag_range_args {
+	/* start of the defrag operation */
+	__u64 start;
+
+	/* number of bytes to defrag, use (u64)-1 to say all */
+	__u64 len;
+
+	/*
+	 * flags for the operation, which can include turning
+	 * on compression for this one defrag
+	 */
+	__u64 flags;
+
+	/*
+	 * any extent bigger than this will be considered
+	 * already defragged.  Use 0 to take the kernel default
+	 * Use 1 to say every single extent must be rewritten
+	 */
+	__u32 extent_thresh;
+
+	/* spare for later */
+	__u32 unused[5];
+};
+
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -130,6 +159,8 @@ struct btrfs_ioctl_clone_range_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
 				struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
+				struct btrfs_ioctl_defrag_range_args)
 #define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
 				   struct btrfs_ioctl_search_args)
 #define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
-- 
cgit v1.2.2


From 3a0524dc054791688177544fe510d2868ee20d9f Mon Sep 17 00:00:00 2001
From: TARUISI Hiroaki <taruishi.hiroak@jp.fujitsu.com>
Date: Tue, 9 Feb 2010 06:36:45 +0000
Subject: btrfs: Update existing btrfs_device for renaming device

When we scan devices in a multi-device filesystem, we memorize the original
name.  If the device gets a new name, later scans don't update the
in-kernel structures related to it, and we're not able to mount the
filesystem.

This patch updates device name during scaning.

Signed-off-by: TARUISI Hiroaki <taruishi.hiroak@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ace2e8d7bbcd..eb89e1317aca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -365,6 +365,7 @@ static noinline int device_list_add(const char *path,
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices;
 	u64 found_transid = btrfs_super_generation(disk_super);
+	char *name;
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
@@ -411,6 +412,12 @@ static noinline int device_list_add(const char *path,
 
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
+	} else if (strcmp(device->name, path)) {
+		name = kstrdup(path, GFP_NOFS);
+		if (!name)
+			return -ENOMEM;
+		kfree(device->name);
+		device->name = name;
 	}
 
 	if (found_transid > fs_devices->latest_trans) {
-- 
cgit v1.2.2


From bd4d10888990f7e3f8029205d27eb155202d6969 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 5 Mar 2010 21:59:21 +0000
Subject: Btrfs: make df be a little bit more understandable

The way we report df usage is way confusing for everybody, including some other
utilities (bacula for one).  So this patch makes df a little bit more
understandable.  First we make used actually count the total amount of used
space in all space info's.  This will give us a real view of how much disk space
is in use.  Second, for blocks available, only count data space.  This makes
things like bacula work because it says 0 when you can no longer write anymore
data to the disk.  I think this is a nice compromise, since you will end up with
something like the following

[root@alpha ~]# df -h
Filesystem            Size  Used Avail Use% Mounted on
/dev/mapper/VolGroup-lv_root
                      148G   30G  111G  21% /
/dev/sda1             194M  116M   68M  64% /boot
tmpfs                 985M   12K  985M   1% /dev/shm
/dev/mapper/VolGroup-LogVol02
                      145G  140G     0 100% /mnt/btrfs-test

Compare this with btrfsctl -i output

[root@alpha btrfs-progs-unstable]# ./btrfsctl -i /mnt/btrfs-test/
Metadata, DUP: total=4.62GB, used=2.46GB
System, DUP: total=8.00MB, used=24.00KB
Data: total=134.80GB, used=134.80GB
Metadata: total=8.00MB, used=0.00
System: total=4.00MB, used=0.00
operation complete

This way we show that there is no more data space to be used, but we have
another 5GB of space left for metadata.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9771eb8694b6..ff3dd55f294d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -752,14 +752,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct list_head *head = &root->fs_info->space_info;
+	struct btrfs_space_info *found;
+	u64 total_used = 0;
+	u64 data_used = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
 	__be32 *fsid = (__be32 *)root->fs_info->fsid;
 
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
+				    BTRFS_BLOCK_GROUP_RAID10|
+				    BTRFS_BLOCK_GROUP_RAID1)) {
+			total_used += found->bytes_used;
+			if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+				data_used += found->bytes_used;
+			else
+				data_used += found->total_bytes;
+		}
+
+		total_used += found->bytes_used;
+		if (found->flags & BTRFS_BLOCK_GROUP_DATA)
+			data_used += found->bytes_used;
+		else
+			data_used += found->total_bytes;
+	}
+	rcu_read_unlock();
+
 	buf->f_namelen = BTRFS_NAME_LEN;
 	buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
-	buf->f_bfree = buf->f_blocks -
-		(btrfs_super_bytes_used(disk_super) >> bits);
-	buf->f_bavail = buf->f_bfree;
+	buf->f_bfree = buf->f_blocks - (total_used >> bits);
+	buf->f_bavail = buf->f_blocks - (data_used >> bits);
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
 
-- 
cgit v1.2.2


From 0bdb1db297ab36865a63ee722d35ff0a1f0ae522 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 19 Feb 2010 14:13:50 -0800
Subject: Btrfs: flush data on snapshot creation

Flush any delalloc extents when we create a snapshot, so that recently
written file data is always included in the snapshot.

A later commit will add the ability to snapshot without the flush, but
most people expect flushing.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2a36e236a492..2d654c1c794d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -997,13 +997,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		mutex_unlock(&root->fs_info->trans_mutex);
 
-		if (flush_on_commit) {
+		if (flush_on_commit || snap_pending) {
 			btrfs_start_delalloc_inodes(root, 1);
 			ret = btrfs_wait_ordered_extents(root, 0, 1);
 			BUG_ON(ret);
-		} else if (snap_pending) {
-			ret = btrfs_wait_ordered_extents(root, 0, 1);
-			BUG_ON(ret);
 		}
 
 		/*
-- 
cgit v1.2.2


From 0be2e98173f8badd5ccc7c2e994891746ba1caf4 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 11 Feb 2010 08:06:58 +0000
Subject: btrfs: fix btrfs_mkdir goto for no free objectids

btrfs_mkdir() must jump to the place of ending transaction after
btrfs_find_free_objectid() failed. Or this transaction can't end.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3657925c2461..50ce8840a99b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4508,7 +4508,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	if (err) {
 		err = -ENOSPC;
-		goto out_unlock;
+		goto out_fail;
 	}
 
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
-- 
cgit v1.2.2


From 4125bf761cd0786e1163e024c7c809ce2cc625bc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 3 Feb 2010 18:18:45 +0000
Subject: Btrfs: finish read pages in the order they are submitted

The endio is done at reverse order of bio vectors.

That means for a sequential read, the page first submitted will finish
last in a bio. Considering we will do checksum (making cache hot) for
every page, this does introduce delay (and chance to squeeze cache used
soon) for pages submitted at the begining.

I don't observe obvious performance difference with below patch at my
simple test, but seems more natural to finish read in the order they are
submitted.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7073cbb1b2d4..355a973719a0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1750,7 +1750,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
@@ -1773,7 +1774,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		else
 			whole_page = 0;
 
-		if (--bvec >= bio->bi_io_vec)
+		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1819,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			}
 			check_page_locked(tree, page);
 		}
-	} while (bvec >= bio->bi_io_vec);
+	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
-- 
cgit v1.2.2


From 49958fd7dbb83cd4d65179d025940e01fe1fbacd Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 2 Feb 2010 21:48:28 +0000
Subject: Btrfs: change the ordered tree to use a spinlock instead of a mutex

The ordered tree used to need a mutex, but currently all we use it for is to
protect the rb_tree, and a spin_lock is just fine for that.  Using a spin_lock
instead makes dbench run a little faster, 58 mb/s instead of 51 mb/s, and have
less latency, 3445.138 ms instead of 3820.633 ms.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.c | 34 +++++++++++++++++-----------------
 fs/btrfs/ordered-data.h |  4 ++--
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..d56f732ba95e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -174,7 +174,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	if (!entry)
 		return -ENOMEM;
 
-	mutex_lock(&tree->mutex);
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
@@ -190,16 +189,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	INIT_LIST_HEAD(&entry->list);
 	INIT_LIST_HEAD(&entry->root_extent_list);
 
+	spin_lock(&tree->lock);
 	node = tree_insert(&tree->tree, file_offset,
 			   &entry->rb_node);
 	BUG_ON(node);
+	spin_unlock(&tree->lock);
 
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_add_tail(&entry->root_extent_list,
 		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
 	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 
-	mutex_unlock(&tree->mutex);
 	BUG_ON(node);
 	return 0;
 }
@@ -216,9 +216,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
 	struct btrfs_ordered_inode_tree *tree;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	list_add_tail(&sum->list, &entry->list);
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	return 0;
 }
 
@@ -240,7 +240,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 	int ret;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node) {
 		ret = 1;
@@ -264,7 +264,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 	else
 		ret = 1;
 out:
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	return ret == 0;
 }
 
@@ -291,7 +291,7 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 
 /*
  * remove an ordered extent from the tree.  No references are dropped
- * and you must wake_up entry->wait.  You must hold the tree mutex
+ * and you must wake_up entry->wait.  You must hold the tree lock
  * while you call this function.
  */
 static int __btrfs_remove_ordered_extent(struct inode *inode,
@@ -340,9 +340,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	int ret;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	ret = __btrfs_remove_ordered_extent(inode, entry);
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	wake_up(&entry->wait);
 
 	return ret;
@@ -567,7 +567,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 	struct btrfs_ordered_extent *entry = NULL;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node)
 		goto out;
@@ -578,7 +578,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 	if (entry)
 		atomic_inc(&entry->refs);
 out:
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	return entry;
 }
 
@@ -594,7 +594,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 	struct btrfs_ordered_extent *entry = NULL;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node)
 		goto out;
@@ -602,7 +602,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 	atomic_inc(&entry->refs);
 out:
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	return entry;
 }
 
@@ -629,7 +629,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 	else
 		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
 
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	disk_i_size = BTRFS_I(inode)->disk_i_size;
 
 	/* truncate file */
@@ -735,7 +735,7 @@ out:
 	 */
 	if (ordered)
 		__btrfs_remove_ordered_extent(inode, ordered);
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	if (ordered)
 		wake_up(&ordered->wait);
 	return ret;
@@ -762,7 +762,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 	if (!ordered)
 		return 1;
 
-	mutex_lock(&tree->mutex);
+	spin_lock(&tree->lock);
 	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
 		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +777,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 		}
 	}
 out:
-	mutex_unlock(&tree->mutex);
+	spin_unlock(&tree->lock);
 	btrfs_put_ordered_extent(ordered);
 	return ret;
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9116c6d0c5a9..bfbcebbb0adc 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
 
 /* one of these per inode */
 struct btrfs_ordered_inode_tree {
-	struct mutex mutex;
+	spinlock_t lock;
 	struct rb_root tree;
 	struct rb_node *last;
 };
@@ -128,7 +128,7 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
 static inline void
 btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 {
-	mutex_init(&t->mutex);
+	spin_lock_init(&t->lock);
 	t->tree = RB_ROOT;
 	t->last = NULL;
 }
-- 
cgit v1.2.2


From c2a128d28a2e78e159e17e8c9274d0a9d9492555 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 2 Feb 2010 21:19:11 +0000
Subject: Btrfs: cache extent state in find_delalloc_range

This patch makes us cache the extent state we find in find_delalloc_range since
we'll have to lock the extent later on in the function.  This will keep us from
re-searching for the rang when we try to lock the extent.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 355a973719a0..3c17c9eb0d98 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1171,7 +1171,8 @@ out:
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
-					u64 *start, u64 *end, u64 max_bytes)
+					u64 *start, u64 *end, u64 max_bytes,
+					struct extent_state **cached_state)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1203,8 +1204,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 				*end = state->end;
 			goto out;
 		}
-		if (!found)
+		if (!found) {
 			*start = state->start;
+			*cached_state = state;
+			atomic_inc(&state->refs);
+		}
 		found++;
 		*end = state->end;
 		cur_start = state->end + 1;
@@ -1336,10 +1340,11 @@ again:
 	delalloc_start = *start;
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
-				    max_bytes);
+				    max_bytes, &cached_state);
 	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
+		free_extent_state(cached_state);
 		return found;
 	}
 
-- 
cgit v1.2.2


From 5a1a3df1f6c86926cfe8657e6f9b4b4c2f467d60 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 2 Feb 2010 20:51:14 +0000
Subject: Btrfs: cache ordered extent when completing io

When finishing io we run btrfs_dec_test_ordered_pending, and then immediately
run btrfs_lookup_ordered_extent, but btrfs_dec_test_ordered_pending does that
already, so we're searching twice when we don't have to.  This patch lets us
pass a btrfs_ordered_extent in to btrfs_dec_test_ordered_pending so if we do
complete io on that ordered extent we can just use the one we found then instead
of having to do another btrfs_lookup_ordered_extent.  This made my fio job with
the other patch go from 24 mb/s to 29 mb/s.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c        | 5 ++---
 fs/btrfs/ordered-data.c | 7 ++++++-
 fs/btrfs/ordered-data.h | 3 ++-
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 50ce8840a99b..1824dda1d351 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1698,11 +1698,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	int compressed = 0;
 	int ret;
 
-	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+					     end - start + 1);
 	if (!ret)
 		return 0;
-
-	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d56f732ba95e..a8ffecd0b491 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -232,11 +232,12 @@ int btrfs_add_ordered_sum(struct inode *inode,
  * to make sure this function only returns 1 once for a given ordered extent.
  */
 int btrfs_dec_test_ordered_pending(struct inode *inode,
+				   struct btrfs_ordered_extent **cached,
 				   u64 file_offset, u64 io_size)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
-	struct btrfs_ordered_extent *entry;
+	struct btrfs_ordered_extent *entry = NULL;
 	int ret;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
@@ -264,6 +265,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 	else
 		ret = 1;
 out:
+	if (!ret && cached && entry) {
+		*cached = entry;
+		atomic_inc(&entry->refs);
+	}
 	spin_unlock(&tree->lock);
 	return ret == 0;
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index bfbcebbb0adc..c82f76a9f040 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
 int btrfs_remove_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry);
 int btrfs_dec_test_ordered_pending(struct inode *inode,
-				       u64 file_offset, u64 io_size);
+				   struct btrfs_ordered_extent **cached,
+				   u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			     u64 start, u64 len, u64 disk_len, int tyep);
 int btrfs_add_ordered_sum(struct inode *inode,
-- 
cgit v1.2.2


From 2ac55d41b5d6bf49e76bc85db5431240617e2f8f Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 3 Feb 2010 19:33:23 +0000
Subject: Btrfs: cache the extent state everywhere we possibly can V2

This patch just goes through and fixes everybody that does

lock_extent()
blah
unlock_extent()

to use

lock_extent_bits()
blah
unlock_extent_cached()

and pass around a extent_state so we only have to do the searches once per
function.  This gives me about a 3 mb/s boots on my random write test.  I have
not converted some things, like the relocation and ioctl's, since they aren't
heavily used and the relocation stuff is in the middle of being re-written.  I
also changed the clear_extent_bit() to only unset the cached state if we are
clearing EXTENT_LOCKED and related stuff, so we can do things like this

lock_extent_bits()
clear delalloc bits
unlock_extent_cached()

without losing our cached state.  I tested this thoroughly and turned on
LEAK_DEBUG to make sure we weren't leaking extent states, everything worked out
fine.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   3 +-
 fs/btrfs/disk-io.c     |  15 ++++---
 fs/btrfs/extent-tree.c |  11 +++--
 fs/btrfs/extent_io.c   |  61 +++++++++++++++++----------
 fs/btrfs/extent_io.h   |  10 +++--
 fs/btrfs/file.c        |  23 ++++++----
 fs/btrfs/inode.c       | 111 ++++++++++++++++++++++++++++++-------------------
 fs/btrfs/ioctl.c       |   2 +-
 fs/btrfs/relocation.c  |   2 +-
 9 files changed, 148 insertions(+), 90 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3a36b1fb553a..3f704a816137 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2311,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0427183e3e05..11d0ad30e203 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -263,13 +263,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 static int verify_parent_transid(struct extent_io_tree *io_tree,
 				 struct extent_buffer *eb, u64 parent_transid)
 {
+	struct extent_state *cached_state = NULL;
 	int ret;
 
 	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 		return 0;
 
-	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
-	if (extent_buffer_uptodate(io_tree, eb) &&
+	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+			 0, &cached_state, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
 	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
 		goto out;
@@ -282,10 +284,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		       (unsigned long long)btrfs_header_generation(eb));
 	}
 	ret = 1;
-	clear_extent_buffer_uptodate(io_tree, eb);
+	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
-	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -2497,7 +2499,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
 
-	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
+				     NULL);
 	if (!ret)
 		return ret;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 559f72489b3b..1727b26fb194 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6561,6 +6561,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct inode *inode = NULL;
 	struct btrfs_file_extent_item *fi;
+	struct extent_state *cached_state = NULL;
 	u64 num_bytes;
 	u64 skip_objectid = 0;
 	u32 nritems;
@@ -6589,12 +6590,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 		}
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 
-		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			    key.offset + num_bytes - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
+				 key.offset + num_bytes - 1, 0, &cached_state,
+				 GFP_NOFS);
 		btrfs_drop_extent_cache(inode, key.offset,
 					key.offset + num_bytes - 1, 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			      key.offset + num_bytes - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
+				     key.offset + num_bytes - 1, &cached_state,
+				     GFP_NOFS);
 		cond_resched();
 	}
 	iput(inode);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3c17c9eb0d98..c99121ac5d6b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -513,7 +513,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_end;
 	int err;
 	int set = 0;
+	int clear = 0;
 
+	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		clear = 1;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -524,14 +527,20 @@ again:
 	spin_lock(&tree->lock);
 	if (cached_state) {
 		cached = *cached_state;
-		*cached_state = NULL;
-		cached_state = NULL;
+
+		if (clear) {
+			*cached_state = NULL;
+			cached_state = NULL;
+		}
+
 		if (cached && cached->tree && cached->start == start) {
-			atomic_dec(&cached->refs);
+			if (clear)
+				atomic_dec(&cached->refs);
 			state = cached;
 			goto hit_next;
 		}
-		free_extent_state(cached);
+		if (clear)
+			free_extent_state(cached);
 	}
 	/*
 	 * this search will find the extents that end after
@@ -946,11 +955,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
+			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
 			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
-			      0, NULL, NULL, mask);
+			      0, NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +993,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-				 u64 end, gfp_t mask)
+				 u64 end, struct extent_state **cached_state,
+				 gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				NULL, mask);
+				cached_state, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1727,7 +1737,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 		}
 
 		if (!uptodate) {
-			clear_extent_uptodate(tree, start, end, GFP_NOFS);
+			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
@@ -2710,6 +2720,7 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset)
 {
+	struct extent_state *cached_state = NULL;
 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2718,12 +2729,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent(tree, start, end, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
 			 EXTENT_DO_ACCOUNTING,
-			 1, 1, NULL, GFP_NOFS);
+			 1, 1, &cached_state, GFP_NOFS);
 	return 0;
 }
 
@@ -2926,16 +2937,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 		get_extent_t *get_extent)
 {
 	struct inode *inode = mapping->host;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
 	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+			 0, &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, start, blksize, 0);
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
+			     start + blksize - 1, &cached_state, GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
@@ -2957,6 +2969,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u64 disko = 0;
 	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
 	int end = 0;
 	u64 em_start = 0, em_len = 0;
 	unsigned long emflags;
@@ -2965,8 +2978,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (len == 0)
 		return -EINVAL;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+			 &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, off, max - off, 0);
 	if (!em)
 		goto out;
@@ -3029,8 +3042,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-			GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -3270,7 +3283,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				struct extent_buffer *eb)
+				struct extent_buffer *eb,
+				struct extent_state **cached_state)
 {
 	unsigned long i;
 	struct page *page;
@@ -3280,7 +3294,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      GFP_NOFS);
+			      cached_state, GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
@@ -3340,7 +3354,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 }
 
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb)
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state)
 {
 	int ret = 0;
 	unsigned long num_pages;
@@ -3352,7 +3367,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1, NULL);
+			   EXTENT_UPTODATE, 1, cached_state);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..bbab4813646f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
 				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-		     gfp_t mask);
+			struct extent_state **cached_state, gfp_t mask);
 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				struct extent_buffer *eb);
+				struct extent_buffer *eb,
+				struct extent_state **cached_state);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb);
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **token, char **map,
 		      unsigned long *map_start,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a7fd9f3a750a..d146dde7efb6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+					NULL);
 	if (err)
 		return err;
 
@@ -753,6 +754,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
 {
+	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
@@ -781,16 +783,18 @@ again:
 	}
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree,
-			    start_pos, last_pos - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree,
+				 start_pos, last_pos - 1, 0, &cached_state,
+				 GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    last_pos - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset < last_pos) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      start_pos, last_pos - 1, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     start_pos, last_pos - 1,
+					     &cached_state, GFP_NOFS);
 			for (i = 0; i < num_pages; i++) {
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
@@ -802,12 +806,13 @@ again:
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
 
-		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING,
+				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
 				  GFP_NOFS);
-		unlock_extent(&BTRFS_I(inode)->io_tree,
-			      start_pos, last_pos - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+				     start_pos, last_pos - 1, &cached_state,
+				     GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
 		clear_page_dirty_for_io(pages[i]);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1824dda1d351..2a337a09c650 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -573,8 +573,8 @@ retry:
 			unsigned long nr_written = 0;
 
 			lock_extent(io_tree, async_extent->start,
-				    async_extent->start +
-				    async_extent->ram_size - 1, GFP_NOFS);
+					 async_extent->start +
+					 async_extent->ram_size - 1, GFP_NOFS);
 
 			/* allocate blocks */
 			ret = cow_file_range(inode, async_cow->locked_page,
@@ -1512,12 +1512,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state)
 {
 	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-				   GFP_NOFS);
+				   cached_state, GFP_NOFS);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -1530,6 +1531,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	struct page *page;
 	struct inode *inode;
 	u64 page_start;
@@ -1548,7 +1550,8 @@ again:
 	page_start = page_offset(page);
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+			 &cached_state, GFP_NOFS);
 
 	/* already ordered? We're done */
 	if (PagePrivate2(page))
@@ -1556,17 +1559,18 @@ again:
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
-			      page_end, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+				     page_end, &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		goto again;
 	}
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end);
+	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			     &cached_state, GFP_NOFS);
 out_page:
 	unlock_page(page);
 	page_cache_release(page);
@@ -1695,6 +1699,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_state *cached_state = NULL;
 	int compressed = 0;
 	int ret;
 
@@ -1716,9 +1721,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		goto out;
 	}
 
-	lock_extent(io_tree, ordered_extent->file_offset,
-		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+	lock_extent_bits(io_tree, ordered_extent->file_offset,
+			 ordered_extent->file_offset + ordered_extent->len - 1,
+			 0, &cached_state, GFP_NOFS);
 
 	trans = btrfs_join_transaction(root, 1);
 
@@ -1745,9 +1750,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				   ordered_extent->len);
 		BUG_ON(ret);
 	}
-	unlock_extent(io_tree, ordered_extent->file_offset,
-		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
@@ -3084,6 +3090,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3130,12 +3137,14 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3143,13 +3152,15 @@ again:
 		goto again;
 	}
 
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+			  0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		goto out_unlock;
 	}
 
@@ -3162,7 +3173,8 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+			     GFP_NOFS);
 
 out_unlock:
 	if (ret)
@@ -3180,6 +3192,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (inode->i_size + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3195,11 +3208,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		struct btrfs_ordered_extent *ordered;
 		btrfs_wait_ordered_range(inode, hole_start,
 					 block_end - hole_start);
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+				 &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
 		if (!ordered)
 			break;
-		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		unlock_extent_cached(io_tree, hole_start, block_end - 1,
+				     &cached_state, GFP_NOFS);
 		btrfs_put_ordered_extent(ordered);
 	}
 
@@ -3244,7 +3259,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 			break;
 	}
 
-	unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+			     GFP_NOFS);
 	return err;
 }
 
@@ -4985,6 +5001,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -5003,7 +5020,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
 					   page_offset(page));
 	if (ordered) {
@@ -5014,7 +5032,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-				 NULL, GFP_NOFS);
+				 &cached_state, GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -5024,11 +5042,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 						page_start, page_end);
 		}
 		btrfs_put_ordered_extent(ordered);
-		lock_extent(tree, page_start, page_end, GFP_NOFS);
+		cached_state = NULL;
+		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
+				 GFP_NOFS);
 	}
 	clear_extent_bit(tree, page_start, page_end,
 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
+		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -5061,6 +5081,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	unsigned long zero_start;
 	loff_t size;
@@ -5099,7 +5120,8 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	/*
@@ -5108,7 +5130,8 @@ again:
 	 */
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
@@ -5122,13 +5145,15 @@ again:
 	 * is probably a better way to do this, but for now keep consistent with
 	 * prepare_pages in the normal write path.
 	 */
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+			  0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
 		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
@@ -5154,7 +5179,7 @@ again:
 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
 	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5833,6 +5858,7 @@ stop_trans:
 static long btrfs_fallocate(struct inode *inode, int mode,
 			    loff_t offset, loff_t len)
 {
+	struct extent_state *cached_state = NULL;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
@@ -5871,16 +5897,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 		/* the extent lock is ordered inside the running
 		 * transaction
 		 */
-		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-			    GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+				 locked_end, 0, &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    alloc_end - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > alloc_start &&
 		    ordered->file_offset < alloc_end) {
 			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      alloc_start, locked_end, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     alloc_start, locked_end,
+					     &cached_state, GFP_NOFS);
 			/*
 			 * we can't wait on the range with the transaction
 			 * running or with the extent lock held
@@ -5922,8 +5949,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
 			break;
 		}
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			     &cached_state, GFP_NOFS);
 
 	btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
 				       alloc_end - alloc_start);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d866b460c26e..9aaba6e472d3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -673,7 +673,7 @@ again:
 				  page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
 				  EXTENT_DO_ACCOUNTING, GFP_NOFS);
 
-		btrfs_set_extent_delalloc(inode, page_start, page_end);
+		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
 		ClearPageChecked(page);
 		set_page_dirty(page);
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d52759daa53f..0b23942cbc0d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2659,7 +2659,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
 					EXTENT_BOUNDARY, GFP_NOFS);
 			nr++;
 		}
-		btrfs_set_extent_delalloc(inode, page_start, page_end);
+		btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
 
 		set_page_dirty(page);
 		dirty_page++;
-- 
cgit v1.2.2


From 1406e4327be3a533a2b18582f715ce2cfbcf6804 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 13 Jan 2010 18:19:06 +0000
Subject: Btrfs: add a "df" ioctl for btrfs

df is a very loaded question in btrfs.  This gives us a way to get the per-space
usage information so we can tell exactly what is in use where.  This will help
us figure out ENOSPC problems, and help users better understand where their disk
space is going.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h | 14 +++++++++++++-
 2 files changed, 58 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9aaba6e472d3..9213d39d36cc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1843,6 +1843,49 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	return 0;
 }
 
+long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_space_args space_args;
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_ioctl_space_info *dest;
+	struct btrfs_space_info *info;
+	int ret = 0;
+
+	if (copy_from_user(&space_args,
+			   (struct btrfs_ioctl_space_args __user *)arg,
+			   sizeof(space_args)))
+		return -EFAULT;
+
+	space_args.total_spaces = 0;
+	dest = (struct btrfs_ioctl_space_info *)
+		(arg + sizeof(struct btrfs_ioctl_space_args));
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
+		if (!space_args.space_slots) {
+			space_args.total_spaces++;
+			continue;
+		}
+		if (space_args.total_spaces >= space_args.space_slots)
+			break;
+		space.flags = info->flags;
+		space.total_bytes = info->total_bytes;
+		space.used_bytes = info->bytes_used;
+		if (copy_to_user(dest, &space, sizeof(space))) {
+			ret = -EFAULT;
+			break;
+		}
+		dest++;
+		space_args.total_spaces++;
+	}
+	rcu_read_unlock();
+
+	if (copy_to_user(arg, &space_args, sizeof(space_args)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
 /*
  * there are many ways the trans_start and trans_end ioctls can lead
  * to deadlocks.  They should only be used by applications that
@@ -1915,6 +1958,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_tree_search(file, argp);
 	case BTRFS_IOC_INO_LOOKUP:
 		return btrfs_ioctl_ino_lookup(file, argp);
+	case BTRFS_IOC_SPACE_INFO:
+		return btrfs_ioctl_space_info(root, argp);
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 2d64a65842f3..81eac0a59ea3 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -1,4 +1,3 @@
-
 /*
  * Copyright (C) 2007 Oracle.  All rights reserved.
  *
@@ -127,6 +126,17 @@ struct btrfs_ioctl_defrag_range_args {
 	__u32 unused[5];
 };
 
+struct btrfs_ioctl_space_info {
+	u64 flags;
+	u64 total_bytes;
+	u64 used_bytes;
+};
+
+struct btrfs_ioctl_space_args {
+	u64 space_slots;
+	u64 total_spaces;
+	struct btrfs_ioctl_space_info spaces[0];
+};
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
@@ -166,4 +176,6 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
 				   struct btrfs_ioctl_ino_lookup_args)
 #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
+#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
+				    struct btrfs_ioctl_space_args)
 #endif
-- 
cgit v1.2.2


From 91748467a5c5884e44ad5cf58630c0c28474f1f6 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Sun, 28 Feb 2010 10:59:11 +0000
Subject: btrfs: use memparse

Use memparse() instead of its own private implementation.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: linux-btrfs@vger.kernel.org
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  1 -
 fs/btrfs/ioctl.c |  2 +-
 fs/btrfs/super.c | 31 +++----------------------------
 3 files changed, 4 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3f704a816137..11115847d875 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2388,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* super.c */
-u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9213d39d36cc..363e209679b6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -776,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 			mod = 1;
 			sizestr++;
 		}
-		new_size = btrfs_parse_size(sizestr);
+		new_size = memparse(sizestr, NULL);
 		if (new_size == 0) {
 			ret = -EINVAL;
 			goto out_unlock;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ff3dd55f294d..9ac612e6ca60 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -96,31 +96,6 @@ static match_table_t tokens = {
 	{Opt_err, NULL},
 };
 
-u64 btrfs_parse_size(char *str)
-{
-	u64 res;
-	int mult = 1;
-	char *end;
-	char last;
-
-	res = simple_strtoul(str, &end, 10);
-
-	last = end[0];
-	if (isalpha(last)) {
-		last = tolower(last);
-		switch (last) {
-		case 'g':
-			mult *= 1024;
-		case 'm':
-			mult *= 1024;
-		case 'k':
-			mult *= 1024;
-		}
-		res = res * mult;
-	}
-	return res;
-}
-
 /*
  * Regular mount options parser.  Everything that is needed only when
  * reading in a new superblock is parsed here.
@@ -216,7 +191,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_max_extent:
 			num = match_strdup(&args[0]);
 			if (num) {
-				info->max_extent = btrfs_parse_size(num);
+				info->max_extent = memparse(num, NULL);
 				kfree(num);
 
 				info->max_extent = max_t(u64,
@@ -228,7 +203,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_max_inline:
 			num = match_strdup(&args[0]);
 			if (num) {
-				info->max_inline = btrfs_parse_size(num);
+				info->max_inline = memparse(num, NULL);
 				kfree(num);
 
 				if (info->max_inline) {
@@ -243,7 +218,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_alloc_start:
 			num = match_strdup(&args[0]);
 			if (num) {
-				info->alloc_start = btrfs_parse_size(num);
+				info->alloc_start = memparse(num, NULL);
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
-- 
cgit v1.2.2


From a343832f1a55c74791a8a37053fc02ad80640710 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 6 Jan 2010 11:48:18 +0000
Subject: btrfs: using btrfs_stack_device_id() get devid

We can use btrfs_stack_device_id() to get dev_item->devid

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eb89e1317aca..4053fc44d2cf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -599,7 +599,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			goto error_close;
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		devid = le64_to_cpu(disk_super->dev_item.devid);
+		devid = btrfs_stack_device_id(&disk_super->dev_item);
 		if (devid != device->devid)
 			goto error_brelse;
 
@@ -701,7 +701,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		goto error_close;
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
-	devid = le64_to_cpu(disk_super->dev_item.devid);
+	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
 		printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1194,7 +1194,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto error_close;
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		devid = le64_to_cpu(disk_super->dev_item.devid);
+		devid = btrfs_stack_device_id(&disk_super->dev_item);
 		dev_uuid = disk_super->dev_item.uuid;
 		device = btrfs_find_device(root, devid, dev_uuid,
 					   disk_super->fsid);
-- 
cgit v1.2.2


From 5ff7ba3a797a8ffd5299b8477df2dca3c3ebd34e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Mar 2010 10:21:30 -0400
Subject: Btrfs: don't look at bio flags after submit_bio

After callling submit_bio, the bio can be freed at any time.  The
btrfs submission thread helper was checking the bio flags too late,
which might not give the correct answer.

When CONFIG_DEBUG_PAGE_ALLOC is turned on, it can lead to oopsen.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4053fc44d2cf..9df8e3f1ccab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -256,13 +256,13 @@ loop_lock:
 			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
-		submit_bio(cur->bi_rw, cur);
-		num_run++;
-		batch_run++;
 
 		if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
 			num_sync_run++;
 
+		submit_bio(cur->bi_rw, cur);
+		num_run++;
+		batch_run++;
 		if (need_resched()) {
 			if (num_sync_run) {
 				blk_run_backing_dev(bdi, NULL);
-- 
cgit v1.2.2


From ef5780c018ed169a77b623f87c4ba52faa8ad0fe Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 15 Mar 2010 11:05:29 -0400
Subject: Btrfs: fix gfp flags masking in the compression code

GFP_FS must be masked out, NOFS can't be or'd in.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..28b92a7218ab 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -478,7 +478,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS);
 		if (!page)
 			break;
 
-- 
cgit v1.2.2


From 8212cf7583a5ba5d213d9c9180be808222a2813f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 15 Mar 2010 11:22:26 +0300
Subject: cifs: trivial white space

I fixed the indent level.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 903d53871da7..20959befc705 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -500,7 +500,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	} else if (pSMBr->hdr.WordCount == 13) {
 		cERROR(1, ("mount failed, cifs module not built "
 			  "with CIFS_WEAK_PW_HASH support"));
-			rc = -EOPNOTSUPP;
+		rc = -EOPNOTSUPP;
 #endif /* WEAK_PW_HASH */
 		goto neg_err_exit;
 	} else if (pSMBr->hdr.WordCount != 17) {
-- 
cgit v1.2.2


From cfbc0683af235106e7dabe92003870b82ad6f0ba Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 11 Mar 2010 11:20:17 +1100
Subject: NFS: ensure bdi_unregister is called on mount failure.

bdi_unregister is called by nfs_put_super which is only called by
generic_shutdown_super if ->s_root is not NULL.  So if we error out
in a circumstance where we called nfs_bdi_register (i.e. server !=
NULL) but have not set s_root, then we need to call bdi_unregister
explicitly in nfs_get_sb and various other *_get_sb() functions.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1afee4eea77..6baf9a393466 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2214,7 +2214,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2256,6 +2256,9 @@ out_err_nosb:
 error_splat_root:
 	dput(mntroot);
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2326,7 +2329,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2363,6 +2366,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
 	return error;
@@ -2578,7 +2584,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2616,6 +2622,9 @@ out_free:
 error_splat_root:
 	dput(mntroot);
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	goto out;
 }
@@ -2811,7 +2820,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2847,6 +2856,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
 	return error;
@@ -2893,7 +2905,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
 	} else {
 		error = nfs_bdi_register(server);
 		if (error)
-			goto error_splat_super;
+			goto error_splat_bdi;
 	}
 
 	if (!s->s_root) {
@@ -2929,6 +2941,9 @@ out_err_noserver:
 	return error;
 
 error_splat_super:
+	if (server && !s->s_root)
+		bdi_unregister(&server->backing_dev_info);
+error_splat_bdi:
 	deactivate_locked_super(s);
 	dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
 	return error;
-- 
cgit v1.2.2


From 854d2c3531e6d32e76b94ca5e096ea54c7497e40 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Mar 2010 00:02:25 +0000
Subject: Btrfs: fix search_ioctl key advance

key->type is u8, not u64.

fs/btrfs/ioctl.c: In function 'copy_to_sk':
fs/btrfs/ioctl.c:1024: warning: comparison is always true due to limited range of data type

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 363e209679b6..38a68863390a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1000,7 +1000,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 advance_key:
 	if (key->offset < (u64)-1)
 		key->offset++;
-	else if (key->type < (u64)-1)
+	else if (key->type < (u8)-1)
 		key->type++;
 	else if (key->objectid < (u64)-1)
 		key->objectid++;
-- 
cgit v1.2.2


From ce769a2904bf5a9110ef534a7702397e38e2b3e9 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Mar 2010 00:02:26 +0000
Subject: Btrfs: use __u64 types in ioctl.h

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 81eac0a59ea3..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -127,14 +127,14 @@ struct btrfs_ioctl_defrag_range_args {
 };
 
 struct btrfs_ioctl_space_info {
-	u64 flags;
-	u64 total_bytes;
-	u64 used_bytes;
+	__u64 flags;
+	__u64 total_bytes;
+	__u64 used_bytes;
 };
 
 struct btrfs_ioctl_space_args {
-	u64 space_slots;
-	u64 total_spaces;
+	__u64 space_slots;
+	__u64 total_spaces;
 	struct btrfs_ioctl_space_info spaces[0];
 };
 
-- 
cgit v1.2.2


From 7fde62bffb576d384ea49a3aed3403d5609ee5bc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 16 Mar 2010 15:40:10 -0400
Subject: Btrfs: buffer results in the space_info ioctl

The space_info ioctl was using copy_to_user inside rcu_read_lock.  This
commit changes things to copy into a buffer first and then dump the
result down to userland.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 57 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 38a68863390a..4329610b141b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1848,39 +1848,74 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 	struct btrfs_ioctl_space_args space_args;
 	struct btrfs_ioctl_space_info space;
 	struct btrfs_ioctl_space_info *dest;
+	struct btrfs_ioctl_space_info *dest_orig;
+	struct btrfs_ioctl_space_info *user_dest;
 	struct btrfs_space_info *info;
+	int alloc_size;
 	int ret = 0;
+	int slot_count = 0;
 
 	if (copy_from_user(&space_args,
 			   (struct btrfs_ioctl_space_args __user *)arg,
 			   sizeof(space_args)))
 		return -EFAULT;
 
+	/* first we count slots */
+	rcu_read_lock();
+	list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
+		slot_count++;
+	rcu_read_unlock();
+
+	/* space_slots == 0 means they are asking for a count */
+	if (space_args.space_slots == 0) {
+		space_args.total_spaces = slot_count;
+		goto out;
+	}
+	alloc_size = sizeof(*dest) * slot_count;
+	/* we generally have at most 6 or so space infos, one for each raid
+	 * level.  So, a whole page should be more than enough for everyone
+	 */
+	if (alloc_size > PAGE_CACHE_SIZE)
+		return -ENOMEM;
+
 	space_args.total_spaces = 0;
-	dest = (struct btrfs_ioctl_space_info *)
-		(arg + sizeof(struct btrfs_ioctl_space_args));
+	dest = kmalloc(alloc_size, GFP_NOFS);
+	if (!dest)
+		return -ENOMEM;
+	dest_orig = dest;
 
+	/* now we have a buffer to copy into */
 	rcu_read_lock();
 	list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
-		if (!space_args.space_slots) {
-			space_args.total_spaces++;
-			continue;
-		}
+		/* make sure we don't copy more than we allocated
+		 * in our buffer
+		 */
+		if (slot_count == 0)
+			break;
+		slot_count--;
+
+		/* make sure userland has enough room in their buffer */
 		if (space_args.total_spaces >= space_args.space_slots)
 			break;
+
 		space.flags = info->flags;
 		space.total_bytes = info->total_bytes;
 		space.used_bytes = info->bytes_used;
-		if (copy_to_user(dest, &space, sizeof(space))) {
-			ret = -EFAULT;
-			break;
-		}
+		memcpy(dest, &space, sizeof(space));
 		dest++;
 		space_args.total_spaces++;
 	}
 	rcu_read_unlock();
 
-	if (copy_to_user(arg, &space_args, sizeof(space_args)))
+	user_dest = (struct btrfs_ioctl_space_info *)
+		(arg + sizeof(struct btrfs_ioctl_space_args));
+
+	if (copy_to_user(user_dest, dest_orig, alloc_size))
+		ret = -EFAULT;
+
+	kfree(dest_orig);
+out:
+	if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
 		ret = -EFAULT;
 
 	return ret;
-- 
cgit v1.2.2


From cd9640a70d542ca026a812ac34733799da0a39c9 Mon Sep 17 00:00:00 2001
From: Alex Elder <aelder@sgi.com>
Date: Tue, 16 Mar 2010 18:55:54 +0000
Subject: xfs: remove old vmap cache

Re-apply a commit that had been reverted due to regressions
that have since been fixed.

    Original commit: d2859751cd0bf586941ffa7308635a293f943c17
    Author: Nick Piggin <npiggin@suse.de>
    Date: Tue, 6 Jan 2009 14:40:44 +1100

    XFS's vmap batching simply defers a number (up to 64) of vunmaps,
    and keeps track of them in a list. To purge the batch, it just goes
    through the list and calls vunamp on each one. This is pretty poor:
    a global TLB flush is generally still performed on each vunmap, with
    the most expensive parts of the operation being the broadcast IPIs
    and locking involved in the SMP callouts, and the locking involved
    in the vmap management -- none of these are avoided by just batching
    up the calls. I'm actually surprised it ever made much difference.
    (Now that the lazy vmap allocator is upstream, this description is
    not quite right, but the vunmap batching still doesn't seem to do
    much).

    Rip all this logic out of XFS completely. I will improve vmap
    performance and scalability directly in subsequent patch.

    Signed-off-by: Nick Piggin <npiggin@suse.de>
    Reviewed-by: Christoph Hellwig <hch@infradead.org>
    Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

The only change I made was to use the "new" xfs_buf_is_vmapped()
function in a place it had been open-coded in the original.

Modified-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 76 +---------------------------------------------
 1 file changed, 1 insertion(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6f76ba85f193..81f4ef27de3e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -167,75 +167,6 @@ test_page_region(
 	return (mask && (page_private(page) & mask) == mask);
 }
 
-/*
- *	Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
-	void		*vm_addr;
-	struct a_list	*next;
-} a_list_t;
-
-static a_list_t		*as_free_head;
-static int		as_list_len;
-static DEFINE_SPINLOCK(as_lock);
-
-/*
- *	Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-	void		*addr)
-{
-	a_list_t	*aentry;
-
-#ifdef CONFIG_XEN
-	/*
-	 * Xen needs to be able to make sure it can get an exclusive
-	 * RO mapping of pages it wants to turn into a pagetable.  If
-	 * a newly allocated page is also still being vmap()ed by xfs,
-	 * it will cause pagetable construction to fail.  This is a
-	 * quick workaround to always eagerly unmap pages so that Xen
-	 * is happy.
-	 */
-	vunmap(addr);
-	return;
-#endif
-
-	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
-	if (likely(aentry)) {
-		spin_lock(&as_lock);
-		aentry->next = as_free_head;
-		aentry->vm_addr = addr;
-		as_free_head = aentry;
-		as_list_len++;
-		spin_unlock(&as_lock);
-	} else {
-		vunmap(addr);
-	}
-}
-
-STATIC void
-purge_addresses(void)
-{
-	a_list_t	*aentry, *old;
-
-	if (as_free_head == NULL)
-		return;
-
-	spin_lock(&as_lock);
-	aentry = as_free_head;
-	as_free_head = NULL;
-	as_list_len = 0;
-	spin_unlock(&as_lock);
-
-	while ((old = aentry) != NULL) {
-		vunmap(aentry->vm_addr);
-		aentry = aentry->next;
-		kfree(old);
-	}
-}
-
 /*
  *	Internal xfs_buf_t object manipulation
  */
@@ -337,7 +268,7 @@ xfs_buf_free(
 		uint		i;
 
 		if (xfs_buf_is_vmapped(bp))
-			free_address(bp->b_addr - bp->b_offset);
+			vunmap(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -457,8 +388,6 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		if (as_list_len > 64)
-			purge_addresses();
 		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
 					VM_MAP, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
@@ -1955,9 +1884,6 @@ xfsbufd(
 			xfs_buf_iostrategy(bp);
 			count++;
 		}
-
-		if (as_list_len > 0)
-			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
-- 
cgit v1.2.2


From 8a262e573d30187b32b5534ec489446931239cc5 Mon Sep 17 00:00:00 2001
From: Alex Elder <aelder@sgi.com>
Date: Tue, 16 Mar 2010 18:55:56 +0000
Subject: xfs: use scalable vmap API

Re-apply a commit that had been reverted due to regressions
that have since been fixed.

    From 95f8e302c04c0b0c6de35ab399a5551605eeb006 Mon Sep 17 00:00:00 2001
    From: Nick Piggin <npiggin@suse.de>
    Date: Tue, 6 Jan 2009 14:43:09 +1100

    Implement XFS's large buffer support with the new vmap APIs. See the vmap
    rewrite (db64fe02) for some numbers. The biggest improvement that comes from
    using the new APIs is avoiding the global KVA allocation lock on every call.

    Signed-off-by: Nick Piggin <npiggin@suse.de>
    Reviewed-by: Christoph Hellwig <hch@infradead.org>
    Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

Only modifications here were a minor reformat, plus making the patch
apply given the new use of xfs_buf_is_vmapped().

Modified-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 81f4ef27de3e..bd111b7e1daa 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -268,7 +268,8 @@ xfs_buf_free(
 		uint		i;
 
 		if (xfs_buf_is_vmapped(bp))
-			vunmap(bp->b_addr - bp->b_offset);
+			vm_unmap_ram(bp->b_addr - bp->b_offset,
+					bp->b_page_count);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -388,8 +389,8 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
-					VM_MAP, PAGE_KERNEL);
+		bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+					-1, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
-- 
cgit v1.2.2


From e8c3753ce4cd6a805ebcfdb3aa6d30e6f4b8b3e0 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 15 Mar 2010 02:36:35 +0000
Subject: xfs: don't warn about page discards on shutdown

If we are doing a forced shutdown, we can get lots of noise about
delalloc pages being discarded. This is happens by design during a
forced shutdown, so don't spam the logs with these messages.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 9083357f9e44..99628508cb11 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -932,6 +932,9 @@ xfs_aops_discard_page(
 	if (!xfs_is_delayed_page(page, IOMAP_DELAY))
 		goto out_invalidate;
 
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		goto out_invalidate;
+
 	xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
 		"page discard on page %p, inode 0x%llx, offset %llu.",
 			page, ip->i_ino, offset);
@@ -964,8 +967,10 @@ xfs_aops_discard_page(
 
 		if (error) {
 			/* something screwed, just bail */
-			xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
-			"page discard failed delalloc mapping lookup.");
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+				"page discard failed delalloc mapping lookup.");
+			}
 			break;
 		}
 		if (!nimaps) {
@@ -991,8 +996,10 @@ xfs_aops_discard_page(
 		ASSERT(!flist.xbf_count && !flist.xbf_first);
 		if (error) {
 			/* something screwed, just bail */
-			xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
 			"page discard unable to remove delalloc mapping.");
+			}
 			break;
 		}
 next_buffer:
-- 
cgit v1.2.2


From ee860b6a650360c91f5d5f9a94262aad9be90015 Mon Sep 17 00:00:00 2001
From: Sachin Prabhu <sprabhu@redhat.com>
Date: Wed, 10 Mar 2010 10:28:40 -0500
Subject: [PATCH] Skip check for mandatory locks when unlocking

ocfs2_lock() will skip locks on file which has mode set to 02666. This
is a problem in cases where the mode of the file is changed after a
process has obtained a lock on the file.

ocfs2_lock() should skip the check for mandatory locks when unlocking a
file.

Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac6245175..b5cb3ede9408 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if (__mandatory_lock(inode))
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
-- 
cgit v1.2.2


From 78c37eb0d5e6a9727b12ea0f1821795ffaa66cfe Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 3 Mar 2010 11:26:27 +0800
Subject: ocfs2: Change bg_chain check for ocfs2_validate_gd_parent.

In ocfs2_validate_gd_parent, we check bg_chain against the
cl_next_free_rec of the dinode. Actually in resize, we have
the chance of bg_chain == cl_next_free_rec. So add some
additional condition check for it.

I also rename paramter "clean_error" to "resize", since the
old one is not clearly enough to indicate that we should only
meet with this case in resize.

btw, the correpsonding bug is
http://oss.oracle.com/bugzilla/show_bug.cgi?id=1230.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/suballoc.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e072..0016503d83ef 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -152,7 +152,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 
 #define do_error(fmt, ...)						\
 	do{								\
-		if (clean_error)					\
+		if (resize)					\
 			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
 		else							\
 			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
@@ -160,7 +160,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 
 static int ocfs2_validate_gd_self(struct super_block *sb,
 				  struct buffer_head *bh,
-				  int clean_error)
+				  int resize)
 {
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
@@ -211,7 +211,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
 static int ocfs2_validate_gd_parent(struct super_block *sb,
 				    struct ocfs2_dinode *di,
 				    struct buffer_head *bh,
-				    int clean_error)
+				    int resize)
 {
 	unsigned int max_bits;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +233,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
 		return -EINVAL;
 	}
 
-	if (le16_to_cpu(gd->bg_chain) >=
-	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
+	/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
+	if ((le16_to_cpu(gd->bg_chain) >
+	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+	    ((le16_to_cpu(gd->bg_chain) ==
+	     le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
 		do_error("Group descriptor #%llu has bad chain %u",
 			 (unsigned long long)bh->b_blocknr,
 			 le16_to_cpu(gd->bg_chain));
-- 
cgit v1.2.2


From 6527f8f848ec84b9daf1cb07601266126b8507ab Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 10 Mar 2010 09:56:52 +0800
Subject: ocfs2: Update i_blocks in reflink operations.

In reflink, we need to upate i_blocks for the target inode.

Reported-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/refcounttree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffda..29405f2ff616 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4075,6 +4075,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
 	OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
 	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
 	i_size_write(t_inode, size);
+	t_inode->i_blocks = s_inode->i_blocks;
 
 	di->i_xattr_inline_size = s_di->i_xattr_inline_size;
 	di->i_clusters = s_di->i_clusters;
-- 
cgit v1.2.2


From fcefd25ac89239cb57fa198f125a79ff85468c75 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Mon, 15 Mar 2010 15:39:00 -0700
Subject: ocfs2: set i_mode on disk during acl operations

ocfs2_set_acl() and ocfs2_init_acl() were setting i_mode on the in-memory
inode, but never setting it on the disk copy. Thus, acls were some times not
getting propagated between nodes. This patch fixes the issue by adding a
helper function ocfs2_acl_set_mode() which does this the right way.
ocfs2_set_acl() and ocfs2_init_acl() are then updated to call
ocfs2_acl_set_mode().

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/acl.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd0..8ccf0f8c9cc8 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -30,6 +30,8 @@
 #include "alloc.h"
 #include "dlmglue.h"
 #include "file.h"
+#include "inode.h"
+#include "journal.h"
 #include "ocfs2_fs.h"
 
 #include "xattr.h"
@@ -165,6 +167,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
+/*
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create it's own.
+ */
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+			      handle_t *handle, umode_t new_mode)
+{
+	int ret, commit_handle = 0;
+	struct ocfs2_dinode *di;
+
+	if (di_bh == NULL) {
+		ret = ocfs2_read_inode_block(inode, &di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else
+		get_bh(di_bh);
+
+	if (handle == NULL) {
+		handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+					   OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out_brelse;
+		}
+
+		commit_handle = 1;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	inode->i_mode = new_mode;
+	di->i_mode = cpu_to_le16(inode->i_mode);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	if (commit_handle)
+		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
+	brelse(di_bh);
+out:
+	return ret;
+}
+
 /*
  * Set the access or default ACL of an inode.
  */
@@ -193,9 +249,14 @@ static int ocfs2_set_acl(handle_t *handle,
 			if (ret < 0)
 				return ret;
 			else {
-				inode->i_mode = mode;
 				if (ret == 0)
 					acl = NULL;
+
+				ret = ocfs2_acl_set_mode(inode, di_bh,
+							 handle, mode);
+				if (ret)
+					return ret;
+
 			}
 		}
 		break;
@@ -283,6 +344,7 @@ int ocfs2_init_acl(handle_t *handle,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct posix_acl *acl = NULL;
 	int ret = 0;
+	mode_t mode;
 
 	if (!S_ISLNK(inode->i_mode)) {
 		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +353,17 @@ int ocfs2_init_acl(handle_t *handle,
 			if (IS_ERR(acl))
 				return PTR_ERR(acl);
 		}
-		if (!acl)
-			inode->i_mode &= ~current_umask();
+		if (!acl) {
+			mode = inode->i_mode & ~current_umask();
+			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+			if (ret) {
+				mlog_errno(ret);
+				goto cleanup;
+			}
+		}
 	}
 	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
 		struct posix_acl *clone;
-		mode_t mode;
 
 		if (S_ISDIR(inode->i_mode)) {
 			ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +380,7 @@ int ocfs2_init_acl(handle_t *handle,
 		mode = inode->i_mode;
 		ret = posix_acl_create_masq(clone, &mode);
 		if (ret >= 0) {
-			inode->i_mode = mode;
+			ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
 			if (ret > 0) {
 				ret = ocfs2_set_acl(handle, inode,
 						    di_bh, ACL_TYPE_ACCESS,
-- 
cgit v1.2.2


From bcc54e2a6d8e93ff83ec398511930b0a73e19151 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Mon, 15 Mar 2010 00:34:59 -0400
Subject: jffs2: fix up rb_root initializations to use RB_ROOT

jffs2 uses rb_node = NULL; to zero rb_root.

The problem with this is that 17d9ddc72fb8bba0d4f678 ("rbtree: Add
support for augmented rbtrees") in the linux-next tree adds a new field
to that struct which needs to be NULL as well.  This patch uses RB_ROOT
as the intializer so all of the relevant fields will be NULL'd.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Eric Paris <eparis@redhat.com>
Acked-by: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/readinode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b74..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
 			else BUG();
 		}
 	}
-	list->rb_node = NULL;
+	*list = RB_ROOT;
 }
 
 static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
-- 
cgit v1.2.2


From c4af96449e20f9245cf3d904098db508cdebcda8 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 15 Mar 2010 00:35:00 -0400
Subject: ntfs: use bitmap_weight

Use bitmap_weight() instead of doing hweight32() for each u32 element in
the page.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/super.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1cf39dfaee7a..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
 #include <linux/vfs.h>
 #include <linux/moduleparam.h>
 #include <linux/smp_lock.h>
+#include <linux/bitmap.h>
 
 #include "sysctl.h"
 #include "logfile.h"
@@ -2458,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
 static s64 get_nr_free_clusters(ntfs_volume *vol)
 {
 	s64 nr_free = vol->nr_clusters;
-	u32 *kaddr;
 	struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
 	struct page *page;
 	pgoff_t index, max_index;
@@ -2477,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 	ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
 			max_index, PAGE_CACHE_SIZE / 4);
 	for (index = 0; index < max_index; index++) {
-		unsigned int i;
+		unsigned long *kaddr;
+
 		/*
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
@@ -2490,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
-		kaddr = (u32*)kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page, KM_USER0);
 		/*
-		 * For each 4 bytes, subtract the number of set bits. If this
+		 * Subtract the number of set bits. If this
 		 * is the last page and it is partial we don't really care as
 		 * it just means we do a little extra work but it won't affect
 		 * the result as all out of range bytes are set to zero by
 		 * ntfs_readpage().
 		 */
-	  	for (i = 0; i < PAGE_CACHE_SIZE / 4; i++)
-			nr_free -= (s64)hweight32(kaddr[i]);
+		nr_free -= bitmap_weight(kaddr,
+					PAGE_CACHE_SIZE * BITS_PER_BYTE);
 		kunmap_atomic(kaddr, KM_USER0);
 		page_cache_release(page);
 	}
@@ -2538,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 		s64 nr_free, const pgoff_t max_index)
 {
-	u32 *kaddr;
 	struct address_space *mapping = vol->mftbmp_ino->i_mapping;
 	struct page *page;
 	pgoff_t index;
@@ -2548,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 	ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
 			"0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
 	for (index = 0; index < max_index; index++) {
-		unsigned int i;
+		unsigned long *kaddr;
+
 		/*
 		 * Read the page from page cache, getting it from backing store
 		 * if necessary, and increment the use count.
@@ -2561,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 			nr_free -= PAGE_CACHE_SIZE * 8;
 			continue;
 		}
-		kaddr = (u32*)kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page, KM_USER0);
 		/*
-		 * For each 4 bytes, subtract the number of set bits. If this
+		 * Subtract the number of set bits. If this
 		 * is the last page and it is partial we don't really care as
 		 * it just means we do a little extra work but it won't affect
 		 * the result as all out of range bytes are set to zero by
 		 * ntfs_readpage().
 		 */
-	  	for (i = 0; i < PAGE_CACHE_SIZE / 4; i++)
-			nr_free -= (s64)hweight32(kaddr[i]);
+		nr_free -= bitmap_weight(kaddr,
+					PAGE_CACHE_SIZE * BITS_PER_BYTE);
 		kunmap_atomic(kaddr, KM_USER0);
 		page_cache_release(page);
 	}
-- 
cgit v1.2.2


From abc6e1341bda974e2d0eddb75f57a20ac18e9b33 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 18 Mar 2010 12:10:08 -0400
Subject: Btrfs: fix key checks and advance in the search ioctl

The search ioctl was working well for finding tree roots, but using it for
generic searches requires a few changes to how the keys are advanced.
This treats the search control min fields for objectid, type and offset
more like a key, where we drop the offset to zero once we bump the type,
etc.

The downside of this is that we are changing the min_type and min_offset
fields during the search, and so the ioctl caller needs extra checks to make sure
the keys in the result are the ones it wanted.

This also changes key_in_sk to use btrfs_comp_cpu_keys, just to make
things more readable.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4329610b141b..291aa51ff420 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -914,17 +914,23 @@ out:
 static noinline int key_in_sk(struct btrfs_key *key,
 			      struct btrfs_ioctl_search_key *sk)
 {
-	if (key->objectid < sk->min_objectid)
-		return 0;
-	if (key->offset < sk->min_offset)
-		return 0;
-	if (key->type < sk->min_type)
-		return 0;
-	if (key->objectid > sk->max_objectid)
-		return 0;
-	if (key->type > sk->max_type)
+	struct btrfs_key test;
+	int ret;
+
+	test.objectid = sk->min_objectid;
+	test.type = sk->min_type;
+	test.offset = sk->min_offset;
+
+	ret = btrfs_comp_cpu_keys(key, &test);
+	if (ret < 0)
 		return 0;
-	if (key->offset > sk->max_offset)
+
+	test.objectid = sk->max_objectid;
+	test.type = sk->max_type;
+	test.offset = sk->max_offset;
+
+	ret = btrfs_comp_cpu_keys(key, &test);
+	if (ret > 0)
 		return 0;
 	return 1;
 }
@@ -998,13 +1004,18 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 			break;
 	}
 advance_key:
-	if (key->offset < (u64)-1)
+	ret = 0;
+	if (key->offset < (u64)-1 && key->offset < sk->max_offset)
 		key->offset++;
-	else if (key->type < (u8)-1)
+	else if (key->type < (u8)-1 && key->type < sk->max_type) {
+		key->offset = 0;
 		key->type++;
-	else if (key->objectid < (u64)-1)
+	} else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+		key->offset = 0;
+		key->type = 0;
 		key->objectid++;
-	ret = 0;
+	} else
+		ret = 1;
 overflow:
 	*num_found += found;
 	return ret;
-- 
cgit v1.2.2


From 90fdde147fd32d18a20be5b498d5f26e56cca8a3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 18 Mar 2010 12:14:54 -0400
Subject: Btrfs: return keys for large items to the search ioctl

The search ioctl was skipping large items entirely (ones that are too
big for the results buffer).  This changes things to at least copy
the item header so that we can send information about the item back to
userland.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 291aa51ff420..fd757f576956 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -997,8 +997,8 @@ static noinline int copy_to_sk(struct btrfs_root *root,
 			read_extent_buffer(leaf, p,
 					   item_off, item_len);
 			*sk_offset += item_len;
-			found++;
 		}
+		found++;
 
 		if (*num_found >= sk->nr_items)
 			break;
-- 
cgit v1.2.2


From 1b53ac4d1b75b23bdc2b54ace787b8f718a987ef Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 18 Mar 2010 12:17:05 -0400
Subject: Btrfs: allow treeid==0 in the inode lookup ioctl

When a root id of 0 is sent to the inode lookup ioctl, it will
use the root of the file we're ioctling and pass the root id
back to userland along with the results.

This allows userland to do searches based on that root later on.


Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fd757f576956..1e462de6556e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1215,6 +1215,9 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 	}
 	inode = fdentry(file)->d_inode;
 
+	if (args->treeid == 0)
+		args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+
 	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
 					args->treeid, args->objectid,
 					args->name);
-- 
cgit v1.2.2


From 8ad6fcab564c5bc956bdc3dfa440ab152b6e780f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 18 Mar 2010 12:23:10 -0400
Subject: Btrfs: fix the inode ref searches done by btrfs_search_path_in_tree

This is used by the inode lookup ioctl to follow all the backrefs up
to the subvol root.  But the search being done would sometimes land one
past the last item in the leaf instead of finding the backref.

This changes the search to look for the highest possible backref and hop
back one item.  It also fixes a leaked path on failure to find the root.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1e462de6556e..2845c6ceecd2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1147,12 +1147,13 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 	root = btrfs_read_fs_root_no_name(info, &key);
 	if (IS_ERR(root)) {
 		printk(KERN_ERR "could not find root %llu\n", tree_id);
-		return -ENOENT;
+		ret = -ENOENT;
+		goto out;
 	}
 
 	key.objectid = dirid;
 	key.type = BTRFS_INODE_REF_KEY;
-	key.offset = 0;
+	key.offset = (u64)-1;
 
 	while(1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1161,6 +1162,8 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 
 		l = path->nodes[0];
 		slot = path->slots[0];
+		if (ret > 0 && slot > 0)
+			slot--;
 		btrfs_item_key_to_cpu(l, &key, slot);
 
 		if (ret > 0 && (key.objectid != dirid ||
@@ -1184,7 +1187,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 
 		btrfs_release_path(root, path);
 		key.objectid = key.offset;
-		key.offset = 0;
+		key.offset = (u64)-1;
 		dirid = key.objectid;
 
 	}
-- 
cgit v1.2.2


From b22b63ebafb97b66d1054e69941ee049d790c6cf Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Thu, 11 Mar 2010 18:43:46 -0800
Subject: ocfs2: Always try for maximum bits with new local alloc windows

What we were doing before was to ask for the current window size as the
maximum allocation. This had the effect of limiting the amount of allocation
we could get for the local alloc during times when the window size was
shrunk due to fragmentation. In some cases, that could actually *increase*
fragmentation by artificially limiting the number of bits we can accept. So
while we still want to ask for a minimum number of bits equal to window
size, there is no reason why we should limit the number of bits the local
alloc should accept. Hence always allow the maximum number of local alloc
bits.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/localalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f511..171c691b42a0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -984,8 +984,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 	}
 
 retry_enospc:
-	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
-
+	(*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
 	if (status == -ENOSPC) {
 		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1060,7 @@ retry_enospc:
 		    OCFS2_LA_DISABLED)
 			goto bail;
 
+		ac->ac_bits_wanted = osb->local_alloc_default_bits;
 		status = ocfs2_claim_clusters(osb, handle, ac,
 					      osb->local_alloc_bits,
 					      &cluster_off,
-- 
cgit v1.2.2


From d812e575822a2b7ab1a7cadae2571505ec6ec2bd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 19 Mar 2010 13:55:17 -0400
Subject: NFS: Prevent another deadlock in nfs_release_page()

We should not attempt to free the page if __GFP_FS is not set. Otherwise we
can deadlock as per

  http://bugzilla.kernel.org/show_bug.cgi?id=15578

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e46..ae0d92736531 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -491,7 +491,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 {
 	dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
 
-	if (gfp & __GFP_WAIT)
+	/* Only do I/O if gfp is a superset of GFP_KERNEL */
+	if ((gfp & GFP_KERNEL) == GFP_KERNEL)
 		nfs_wb_page(page->mapping->host, page);
 	/* If PagePrivate() is set, then the page is not freeable */
 	if (PagePrivate(page))
-- 
cgit v1.2.2


From dfe4d3d6a6f707fff1dbfd4b8fce65e64a91b809 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 19 Mar 2010 15:04:23 +0800
Subject: ocfs2: Fix the update of name_offset when removing xattrs

When replacing a xattr's value, in some case we wipe its name/value
first and then re-add it. The wipe is done by
ocfs2_xa_block_wipe_namevalue() when the xattr is in the inode or
block. We currently adjust name_offset for all the entries which have
(offset < name_offset). This does not adjust the entrie we're replacing.
Since we are replacing the entry, we don't adjust the total entry count.
When we calculate a new namevalue location, we trust the entries
now-wrong offset in ocfs2_xa_get_free_start().  The solution is to
also adjust the name_offset for the replaced entry, allowing
ocfs2_xa_get_free_start() to calculate the new namevalue location
correctly.

The following script can trigger a kernel panic easily.

echo 'y'|mkfs.ocfs2 --fs-features=local,xattr -b 4K $DEVICE
mount -t ocfs2 $DEVICE $MNT_DIR
FILE=$MNT_DIR/$RANDOM
for((i=0;i<76;i++))
do
string_76="a$string_76"
done
string_78="aa$string_76"
string_82="aaaa$string_78"

touch $FILE
setfattr -n 'user.test1234567890' -v $string_76 $FILE
setfattr -n 'user.test1234567890' -v $string_78 $FILE
setfattr -n 'user.test1234567890' -v $string_82 $FILE

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d1..82c2a0b53eb4 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1622,7 +1622,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
 	/* Now tell xh->xh_entries about it */
 	for (i = 0; i < count; i++) {
 		offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
-		if (offset < namevalue_offset)
+		if (offset <= namevalue_offset)
 			le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
 				     namevalue_size);
 	}
-- 
cgit v1.2.2


From b23179681c90a55e2a2083e1dde9f727ecffb2b7 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 19 Mar 2010 15:04:24 +0800
Subject: ocfs2: Init meta_ac properly in ocfs2_create_empty_xattr_block.

You can't store a pointer that you haven't filled in yet and expect it
to work.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 82c2a0b53eb4..3e7773089b96 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6528,13 +6528,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
 					  int indexed)
 {
 	int ret;
-	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_xattr_set_ctxt ctxt = {
-		.meta_ac = meta_ac,
-	};
+	struct ocfs2_xattr_set_ctxt ctxt;
 
-	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+	memset(&ctxt, 0, sizeof(ctxt));
+	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -6556,7 +6554,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
 
 	ocfs2_commit_trans(osb, ctxt.handle);
 out:
-	ocfs2_free_alloc_context(meta_ac);
+	ocfs2_free_alloc_context(ctxt.meta_ac);
 	return ret;
 }
 
-- 
cgit v1.2.2


From 978097c907b58a2d085bbf7632bee1a5a7e6f6ba Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 8 Mar 2010 15:27:53 -0800
Subject: ceph: implemented caps should always be superset of issued caps

Added assertion, and cleared one case where the implemented caps were
not following the issued caps.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index db122bb357b8..57d9b44a8820 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2334,6 +2334,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 			   revoked_rdcache)
 			reply = 2;     /* send revoke ack in check_caps */
 		cap->issued = newcaps;
+		cap->implemented |= newcaps;
 	} else if (cap->issued == newcaps) {
 		dout("caps unchanged: %s -> %s\n",
 		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
@@ -2346,6 +2347,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 					      * pending revocation */
 		wake = 1;
 	}
+	BUG_ON(cap->issued & ~cap->implemented);
 
 	spin_unlock(&inode->i_lock);
 	if (writeback)
-- 
cgit v1.2.2


From 052bb34af3bf8ae2001b9f03d884ba0def3e427c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Mar 2010 12:52:26 -0800
Subject: ceph: add missing locking to protect i_snap_realm_item during split

All ci->i_snap_realm_item/realm->inodes_with_caps manipulation should be
protected by realm->inodes_with_caps_lock.  This bug would have only bit
us in a rare race with a realm split (during some snap creations).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index bf2a5f3846a4..8a43bc8675eb 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -818,7 +818,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			 * queued (again) by ceph_update_snap_trace()
 			 * below.  Queue it _now_, under the old context.
 			 */
+			spin_lock(&realm->inodes_with_caps_lock);
 			list_del_init(&ci->i_snap_realm_item);
+			spin_unlock(&realm->inodes_with_caps_lock);
 			spin_unlock(&inode->i_lock);
 
 			ceph_queue_cap_snap(ci,
-- 
cgit v1.2.2


From 8b218b8a4a65bf4e304ae8690cadb9100ef029c0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 9 Mar 2010 12:59:08 -0800
Subject: ceph: fix inode removal from snap realm when racing with migration

When an inode was dropped while being migrated between two MDSs,
i_cap_exporting_issued was non-zero such that issue caps were non-zero and
__ceph_is_any_caps(ci) was true.  This prevented the inode from being
removed from the snap realm, even as it was dropped from the cache.

Fix this by dropping any residual i_snap_realm ref in destroy_inode.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7abe1aed819b..aca82d55cc53 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -378,6 +378,22 @@ void ceph_destroy_inode(struct inode *inode)
 
 	ceph_queue_caps_release(inode);
 
+	/*
+	 * we may still have a snap_realm reference if there are stray
+	 * caps in i_cap_exporting_issued or i_snap_caps.
+	 */
+	if (ci->i_snap_realm) {
+		struct ceph_mds_client *mdsc =
+			&ceph_client(ci->vfs_inode.i_sb)->mdsc;
+		struct ceph_snap_realm *realm = ci->i_snap_realm;
+
+		dout(" dropping residual ref to snap realm %p\n", realm);
+		spin_lock(&realm->inodes_with_caps_lock);
+		list_del_init(&ci->i_snap_realm_item);
+		spin_unlock(&realm->inodes_with_caps_lock);
+		ceph_put_snap_realm(mdsc, realm);
+	}
+
 	kfree(ci->i_symlink);
 	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
 		frag = rb_entry(n, struct ceph_inode_frag, node);
-- 
cgit v1.2.2


From 63733a0fc55cca74b1911769633dc5dfd1a45907 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Mar 2010 15:47:22 -0700
Subject: ceph: fix authenticator timeout

We were failing to reconnect to services due to an old authenticator, even
though we had the new ticket, because we weren't properly retrying the
connect handshake, because we were calling an old/incorrect helper that
left in_base_pos incorrect.  The result was a failure to reconnect to the
OSD or MDS (with an authentication error) if the MDS restarted after the
service had been up a few hours (long enough for the original authenticator
to be invalid).  This was only a problem if the AUTH_X authentication was
enabled.

Now that the 'negotiate' and 'connect' stages are fully separated, we
should use the prepare_read_connect() helper instead, and remove the
obsolete one.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 781656a49bf8..203c4359b549 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -830,13 +830,6 @@ static void prepare_read_connect(struct ceph_connection *con)
 	con->in_base_pos = 0;
 }
 
-static void prepare_read_connect_retry(struct ceph_connection *con)
-{
-	dout("prepare_read_connect_retry %p\n", con);
-	con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
-		+ sizeof(con->peer_addr_for_me);
-}
-
 static void prepare_read_ack(struct ceph_connection *con)
 {
 	dout("prepare_read_ack %p\n", con);
@@ -1146,7 +1139,7 @@ static int process_connect(struct ceph_connection *con)
 		}
 		con->auth_retry = 1;
 		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect_retry(con);
+		prepare_read_connect(con);
 		break;
 
 	case CEPH_MSGR_TAG_RESETSESSION:
-- 
cgit v1.2.2


From 807c86e2ceba8febe79b289d50cd0d5e0b0af917 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Mar 2010 15:52:17 -0700
Subject: ceph: fix authenticator buffer size calculation

The buffer size was incorrectly calculated for the ceph_x_encrypt()
encapsulated ticket blob.  Use a helper (with correct arithmetic) and
BUG out if we were wrong.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth_x.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index f0318427b6da..96e7aaa77678 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -28,6 +28,12 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
 	return (ac->want_keys & xi->have_keys) == ac->want_keys;
 }
 
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+		sizeof(u32);
+}
+
 static int ceph_x_encrypt(struct ceph_crypto_key *secret,
 			  void *ibuf, int ilen, void *obuf, size_t olen)
 {
@@ -242,7 +248,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 				   struct ceph_x_ticket_handler *th,
 				   struct ceph_x_authorizer *au)
 {
-	int len;
+	int maxlen;
 	struct ceph_x_authorize_a *msg_a;
 	struct ceph_x_authorize_b msg_b;
 	void *p, *end;
@@ -253,15 +259,15 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	dout("build_authorizer for %s %p\n",
 	     ceph_entity_type_name(th->service), au);
 
-	len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
-		ticket_blob_len + 16;
-	dout("  need len %d\n", len);
-	if (au->buf && au->buf->alloc_len < len) {
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout("  need len %d\n", maxlen);
+	if (au->buf && au->buf->alloc_len < maxlen) {
 		ceph_buffer_put(au->buf);
 		au->buf = NULL;
 	}
 	if (!au->buf) {
-		au->buf = ceph_buffer_new(len, GFP_NOFS);
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
 		if (!au->buf)
 			return -ENOMEM;
 	}
@@ -296,6 +302,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
 	dout(" built authorizer nonce %llx len %d\n", au->nonce,
 	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
 	return 0;
 
 out_buf:
-- 
cgit v1.2.2


From 5b3dbb44ab40660a080d03585bd35f45b2890c49 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Mar 2010 15:57:04 -0700
Subject: ceph: release old ticket_blob buffer

Release the old ticket_blob buffer when we get an updated service ticket
from the monitor.  Previously these were getting leaked.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth_x.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 96e7aaa77678..33d3ad4dc456 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -156,6 +156,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		struct timespec validity;
 		struct ceph_crypto_key old_key;
 		void *tp, *tpend;
+		struct ceph_buffer *new_ticket_blob;
 
 		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
 
@@ -223,9 +224,12 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
 		struct_v = ceph_decode_8(&tp);
 		th->secret_id = ceph_decode_64(&tp);
-		ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
 		if (ret)
 			goto out;
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->ticket_blob = new_ticket_blob;
 		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
 		     type, ceph_entity_type_name(type), th->secret_id,
 		     (int)th->ticket_blob->vec.iov_len);
-- 
cgit v1.2.2


From 2d8428acaee5468d194d8a84de22a0797791cb33 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 22 Mar 2010 14:01:24 +0900
Subject: nilfs2: fix duplicate call to nilfs_segctor_cancel_freev

Andreas Beckmann gave me a report that nilfs logged the following
warnings when it got a disk full:

  nilfs_sufile_do_cancel_free: segment 0 must be clean
  nilfs_sufile_do_cancel_free: segment 1 must be clean

These arise from a duplicate call to nilfs_segctor_cancel_freev in an
error path of log writer.  This will fix the issue.

Reported-by: Andreas Beckmann <debian@abeckmann.de>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 69576a95e13f..b6221230f840 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1510,6 +1510,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
 			break;
 
+		nilfs_clear_logs(&sci->sc_segbufs);
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
@@ -1517,12 +1523,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 							NULL);
 			WARN_ON(err); /* do not happen */
 		}
-		nilfs_clear_logs(&sci->sc_segbufs);
-
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
-- 
cgit v1.2.2


From 556ae3bb32cabe483375b857dda1322384c57b65 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sun, 21 Mar 2010 12:10:36 -0400
Subject: NFS: don't try to decode GETATTR if DELEGRETURN returned error

The reply parsing code attempts to decode the GETATTR response even if
the DELEGRETURN portion of the compound returned an error. The GETATTR
response won't actually exist if that's the case and we're asking the
parser to read past the end of the response.

This bug is fairly benign. The parser catches this without reading past
the end of the response and decode_getfattr returns -EIO. Earlier
kernels however had decode_op_hdr using the READ_BUF macro, and this
bug would make this printk pop any time the client got an error from
a delegreturn:

kernel: decode_op_hdr: reply buffer overflowed in line XXXX

More recent kernels seem to have replaced this printk with a dprintk.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492cb..dd17713413a5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5552,6 +5552,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
 	if (status != 0)
 		goto out;
 	status = decode_delegreturn(&xdr);
+	if (status != 0)
+		goto out;
 	decode_getfattr(&xdr, res->fattr, res->server,
 			!RPC_IS_ASYNC(rqstp->rq_task));
 out:
-- 
cgit v1.2.2


From 99b437a9257cb6b267bf32adfb7675948dc6d485 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 22 Mar 2010 13:07:14 +0000
Subject: AFS: Potential null dereference

It seems clear from the surrounding code that xpermits is allowed to be
NULL here.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/security.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	if (!permits)
 		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+			count * sizeof(struct afs_permit));
 
 	_debug("key %x access %x",
 	       key_serial(key), vnode->status.caller_access);
-- 
cgit v1.2.2


From 0a990e7093566ceb07e38951e1a01686923d4f09 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Mar 2010 16:12:25 -0700
Subject: ceph: clean up service ticket decoding

Previously we would decode state directly into our current ticket_handler.
This is problematic if for some reason we fail to decode, because we end
up with half new state and half old state.

We are probably already in bad shape if we get an update we can't decode,
but we may as well be tidy anyway.  Decode into new_* temporaries and
update the ticket_handler only on success.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth_x.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 33d3ad4dc456..8d8a84964763 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -156,7 +156,11 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		struct timespec validity;
 		struct ceph_crypto_key old_key;
 		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_crypto_key new_session_key;
 		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;
 
 		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
 
@@ -189,16 +193,16 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 			goto bad;
 
 		memcpy(&old_key, &th->session_key, sizeof(old_key));
-		ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
 		if (ret)
 			goto out;
 
-		ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
-		ceph_decode_timespec(&validity, &th->validity);
-		th->expires = get_seconds() + validity.tv_sec;
-		th->renew_after = th->expires - (validity.tv_sec / 4);
-		dout(" expires=%lu renew_after=%lu\n", th->expires,
-		     th->renew_after);
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);
 
 		/* ticket blob for service */
 		ceph_decode_8_safe(&p, end, is_enc, bad);
@@ -223,13 +227,21 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		dout(" ticket blob is %d bytes\n", dlen);
 		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
 		struct_v = ceph_decode_8(&tp);
-		th->secret_id = ceph_decode_64(&tp);
+		new_secret_id = ceph_decode_64(&tp);
 		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
 		if (ret)
 			goto out;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
 		if (th->ticket_blob)
 			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
 		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
 		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
 		     type, ceph_entity_type_name(type), th->secret_id,
 		     (int)th->ticket_blob->vec.iov_len);
-- 
cgit v1.2.2


From 12eadc190038e68b5884a4aa313b6ab89ba60f5e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Mar 2010 22:20:39 -0700
Subject: ceph: fix null pointer deref of r_osd in debug output

This causes an oops when debug output is enabled and we kick
an osd request with no current r_osd (sometime after an osd
failure).  Check the pointer before dereferencing.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index dbe63db9762f..221038253e61 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -913,7 +913,7 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 kick:
 		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
-		     req->r_osd->o_osd);
+		     req->r_osd ? req->r_osd->o_osd : -1);
 		req->r_flags |= CEPH_OSD_FLAG_RETRY;
 		err = __send_request(osdc, req);
 		if (err) {
-- 
cgit v1.2.2


From 4ea0043a29c82ca52ca54728d837314563bec574 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Mar 2010 10:36:40 -0700
Subject: ceph: drop unnecessary WARN_ON in caps migration

If we don't have the exported cap it's because we already released it. No
need to WARN.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 57d9b44a8820..726c8d445995 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2550,9 +2550,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			ci->i_cap_exporting_issued = cap->issued;
 		}
 		__ceph_remove_cap(cap);
-	} else {
-		WARN_ON(!cap);
 	}
+	/* else, we already released it */
 
 	spin_unlock(&inode->i_lock);
 }
-- 
cgit v1.2.2


From cdc2ce056a3620139056b60ad7f6d355ad13f445 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Mar 2010 13:39:28 -0700
Subject: ceph: fix session locking in handle_caps, ceph_check_caps

Passing a session pointer to ceph_check_caps() used to mean it would leave
the session mutex locked.  That wasn't always possible if it wasn't passed
CHECK_CAPS_AUTHONLY.   If could unlock the passed session and lock a
differet session mutex, which was clearly wrong, and also emitted a
warning when it a racing CPU retook it and we did an unlock from the wrong
context.

This was only a problem when there was more than one MDS.

First, make ceph_check_caps unconditionally drop the session mutex, so that
it is free to lock other sessions as needed.  Then adjust the one caller
that passes in a session (handle_cap_grant) accordingly.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 726c8d445995..782848632e81 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1407,6 +1407,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
  */
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
+	__releases(session->s_mutex)
 {
 	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
 	struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1414,7 +1415,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_cap *cap;
 	int file_wanted, used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
-	int drop_session_lock = session ? 0 : 1;
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
 			   to avoid an infinite loop on retry */
@@ -1639,7 +1639,7 @@ ack:
 	if (queue_invalidate)
 		ceph_queue_invalidate(inode);
 
-	if (session && drop_session_lock)
+	if (session)
 		mutex_unlock(&session->s_mutex);
 	if (took_snap_rwsem)
 		up_read(&mdsc->snap_rwsem);
@@ -2688,14 +2688,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
 		r = handle_cap_grant(inode, h, session, cap, msg->middle);
-		if (r == 1)
+		if (r == 1) {
 			ceph_check_caps(ceph_inode(inode),
 					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
 					session);
-		else if (r == 2)
+			session = NULL;
+		} else if (r == 2) {
 			ceph_check_caps(ceph_inode(inode),
 					CHECK_CAPS_NODELAY,
 					session);
+			session = NULL;
+		}
 		break;
 
 	case CEPH_CAP_OP_FLUSH_ACK:
@@ -2713,7 +2716,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	}
 
 done:
-	mutex_unlock(&session->s_mutex);
+	if (session)
+		mutex_unlock(&session->s_mutex);
 
 	if (check_caps)
 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
-- 
cgit v1.2.2


From 15637c8b1251c38694c32214eba69b72a30e9d9b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Mar 2010 13:42:00 -0700
Subject: ceph: clean up handle_cap_grant, handle_caps wrt session mutex

Drop session mutex unconditionally in handle_cap_grant, and do the
check_caps from the handle_cap_grant helper.  This avoids using a magic
return value.

Also avoid using a flag variable in the IMPORT case and call
check_caps at the appropriate point.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 57 +++++++++++++++++++++++++--------------------------------
 1 file changed, 25 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 782848632e81..d9e860ff9f3a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2195,18 +2195,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
  * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
  * actually be a revocation if it specifies a smaller cap set.)
  *
- * caller holds s_mutex.
+ * caller holds s_mutex and i_lock, we drop both.
+ *
  * return value:
  *  0 - ok
  *  1 - check_caps on auth cap only (writeback)
  *  2 - check_caps (ack revoke)
  */
-static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
-			    struct ceph_mds_session *session,
-			    struct ceph_cap *cap,
-			    struct ceph_buffer *xattr_buf)
+static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+			     struct ceph_mds_session *session,
+			     struct ceph_cap *cap,
+			     struct ceph_buffer *xattr_buf)
 	__releases(inode->i_lock)
-
+	__releases(session->s_mutex)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
@@ -2216,7 +2217,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	u64 size = le64_to_cpu(grant->size);
 	u64 max_size = le64_to_cpu(grant->max_size);
 	struct timespec mtime, atime, ctime;
-	int reply = 0;
+	int check_caps = 0;
 	int wake = 0;
 	int writeback = 0;
 	int revoked_rdcache = 0;
@@ -2329,10 +2330,10 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
 			writeback = 1; /* will delay ack */
 		else if (dirty & ~newcaps)
-			reply = 1;     /* initiate writeback in check_caps */
+			check_caps = 1;  /* initiate writeback in check_caps */
 		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
 			   revoked_rdcache)
-			reply = 2;     /* send revoke ack in check_caps */
+			check_caps = 2;     /* send revoke ack in check_caps */
 		cap->issued = newcaps;
 		cap->implemented |= newcaps;
 	} else if (cap->issued == newcaps) {
@@ -2361,7 +2362,14 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		ceph_queue_invalidate(inode);
 	if (wake)
 		wake_up(&ci->i_cap_wq);
-	return reply;
+
+	if (check_caps == 1)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+				session);
+	else if (check_caps == 2)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+	else
+		mutex_unlock(&session->s_mutex);
 }
 
 /*
@@ -2622,9 +2630,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u64 cap_id;
 	u64 size, max_size;
 	u64 tid;
-	int check_caps = 0;
 	void *snaptrace;
-	int r;
 
 	dout("handle_caps from mds%d\n", mds);
 
@@ -2669,8 +2675,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
 				  snaptrace, le32_to_cpu(h->snap_trace_len));
-		check_caps = 1; /* we may have sent a RELEASE to the old auth */
-		goto done;
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+				session);
+		goto done_unlocked;
 	}
 
 	/* the rest require a cap */
@@ -2687,19 +2694,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	switch (op) {
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
-		r = handle_cap_grant(inode, h, session, cap, msg->middle);
-		if (r == 1) {
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
-					session);
-			session = NULL;
-		} else if (r == 2) {
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY,
-					session);
-			session = NULL;
-		}
-		break;
+		handle_cap_grant(inode, h, session, cap, msg->middle);
+		goto done_unlocked;
 
 	case CEPH_CAP_OP_FLUSH_ACK:
 		handle_cap_flush_ack(inode, tid, h, session, cap);
@@ -2716,11 +2712,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	}
 
 done:
-	if (session)
-		mutex_unlock(&session->s_mutex);
-
-	if (check_caps)
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+	mutex_unlock(&session->s_mutex);
+done_unlocked:
 	if (inode)
 		iput(inode);
 	return;
-- 
cgit v1.2.2


From 916623da10e270c7e9e802a7ddfe1ec8f890982d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Mar 2010 15:01:07 -0700
Subject: ceph: only release unused caps with mds requests

We were releasing used caps (e.g. FILE_CACHE) from encode_inode_release
with MDS requests (e.g. setattr).  We don't carry refs on most caps, so
this code worked most of the time, but for setattr (utimes) we try to
drop Fscr.

This causes cap state to get slightly out of sync with reality, and may
result in subsequent mds revoke messages getting ignored.

Fix by only releasing unused caps.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9e860ff9f3a..7d0a0d0adc18 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2836,11 +2836,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 	struct ceph_cap *cap;
 	struct ceph_mds_request_release *rel = *p;
 	int ret = 0;
-
-	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
-	     mds, ceph_cap_string(drop), ceph_cap_string(unless));
+	int used = 0;
 
 	spin_lock(&inode->i_lock);
+	used = __ceph_caps_used(ci);
+
+	dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
+	     mds, ceph_cap_string(used), ceph_cap_string(drop),
+	     ceph_cap_string(unless));
+
+	/* only drop unused caps */
+	drop &= ~used;
+
 	cap = __get_cap_for_mds(ci, mds);
 	if (cap && __cap_is_valid(cap)) {
 		if (force ||
-- 
cgit v1.2.2


From 80fc7314a7e26e8d2e4ba5b3d8cc2d4aeb750015 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 16 Mar 2010 15:28:54 -0700
Subject: ceph: fix mds sync() race with completing requests

The wait_unsafe_requests() helper dropped the mdsc mutex to wait
for each request to complete, and then examined r_node to get the
next request after retaking the lock.  But the request completion
removes the request from the tree, so r_node was always undefined
at this point.  Since it's a small race, it usually led to a
valid request, but not always.  The result was an occasional
crash in rb_next() while dereferencing node->rb_left.

Fix this by clearing the rb_node when removing the request from
the request tree, and not walking off into the weeds when we
are done waiting for a request.  Since the request we waited on
will _always_ be out of the request tree, take a ref on the next
request, in the hopes that it won't be.  But if it is, it's ok:
we can start over from the beginning (and traverse over older read
requests again).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a2600101ec22..5ec864198368 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -529,6 +529,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 {
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 	rb_erase(&req->r_node, &mdsc->request_tree);
+	RB_CLEAR_NODE(&req->r_node);
 	ceph_mdsc_put_request(req);
 
 	if (req->r_unsafe_dir) {
@@ -2682,29 +2683,41 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
  */
 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
 {
-	struct ceph_mds_request *req = NULL;
+	struct ceph_mds_request *req = NULL, *nextreq;
 	struct rb_node *n;
 
 	mutex_lock(&mdsc->mutex);
 	dout("wait_unsafe_requests want %lld\n", want_tid);
+restart:
 	req = __get_oldest_req(mdsc);
 	while (req && req->r_tid <= want_tid) {
+		/* find next request */
+		n = rb_next(&req->r_node);
+		if (n)
+			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
+		else
+			nextreq = NULL;
 		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
 			/* write op */
 			ceph_mdsc_get_request(req);
+			if (nextreq)
+				ceph_mdsc_get_request(nextreq);
 			mutex_unlock(&mdsc->mutex);
 			dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
 			     req->r_tid, want_tid);
 			wait_for_completion(&req->r_safe_completion);
 			mutex_lock(&mdsc->mutex);
-			n = rb_next(&req->r_node);
 			ceph_mdsc_put_request(req);
-		} else {
-			n = rb_next(&req->r_node);
+			if (!nextreq)
+				break;  /* next dne before, so we're done! */
+			if (RB_EMPTY_NODE(&nextreq->r_node)) {
+				/* next request was removed from tree */
+				ceph_mdsc_put_request(nextreq);
+				goto restart;
+			}
+			ceph_mdsc_put_request(nextreq);  /* won't go away */
 		}
-		if (!n)
-			break;
-		req = rb_entry(n, struct ceph_mds_request, r_node);
+		req = nextreq;
 	}
 	mutex_unlock(&mdsc->mutex);
 	dout("wait_unsafe_requests done\n");
-- 
cgit v1.2.2


From efd7576b2392cc5a0934352936d793e8884c46bf Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 17 Mar 2010 10:05:28 -0700
Subject: ceph: fix pg pool decoding from incremental osdmap update

The incremental map decoding of pg pool updates wasn't skipping
the snaps and removed_snaps vectors.  This caused osd requests
to stall when pool snapshots were created or fs snapshots were
deleted.  Use a common helper for full and incremental map
decoders that decodes pools properly.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index b83f2692b835..d82fe87c2a6e 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -480,6 +480,14 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 	return NULL;
 }
 
+void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+{
+	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+	calc_pg_masks(pi);
+	*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+}
+
 /*
  * decode a full map.
  */
@@ -526,12 +534,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 				   ev, CEPH_PG_POOL_VERSION);
 			goto bad;
 		}
-		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+		__decode_pool(p, pi);
 		__insert_pg_pool(&map->pg_pools, pi);
-		calc_pg_masks(pi);
-		*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
-		*p += le32_to_cpu(pi->v.num_removed_snap_intervals)
-			* sizeof(u64) * 2;
 	}
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
 
@@ -714,8 +718,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			pi->id = pool;
 			__insert_pg_pool(&map->pg_pools, pi);
 		}
-		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-		calc_pg_masks(pi);
+		__decode_pool(p, pi);
 	}
 
 	/* old_pool */
-- 
cgit v1.2.2


From e4cb4cb8a03adde1aa4b874623c50b9a5b56e635 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 18 Mar 2010 13:43:09 -0700
Subject: ceph: prevent dup stale messages to console for restarting mds

Prevent duplicate 'mds0 caps stale' message from spamming the console every
few seconds while the MDS restarts.  Set s_renew_requested earlier, so that
we only print the message once, even if we don't send an actual request.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5ec864198368..5cbf46abfee3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -863,6 +863,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 	if (time_after_eq(jiffies, session->s_cap_ttl) &&
 	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
 		pr_info("mds%d caps stale\n", session->s_mds);
+	session->s_renew_requested = jiffies;
 
 	/* do not try to renew caps until a recovering mds has reconnected
 	 * with its clients. */
@@ -875,7 +876,6 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 
 	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
 		ceph_mds_state_name(state));
-	session->s_renew_requested = jiffies;
 	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
 				 ++session->s_renew_seq);
 	if (IS_ERR(msg))
-- 
cgit v1.2.2


From 3c3f2e32effd4c6acc3a9434bd7eecb0af653d89 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 18 Mar 2010 15:20:53 -0700
Subject: ceph: fix connection fault con_work reentrancy problem

The messenger fault was clearing the BUSY bit, for reasons unclear.  This
made it possible for the con->ops->fault function to reopen the connection,
and requeue work in the workqueue--even though the current thread was
already in con_work.

This avoids a problem where the client busy loops with connection failures
on an unreachable OSD, but doesn't address the root cause of that problem.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 203c4359b549..983285540945 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1836,8 +1836,6 @@ static void ceph_fault(struct ceph_connection *con)
 		goto out;
 	}
 
-	clear_bit(BUSY, &con->state);  /* to avoid an improbable race */
-
 	mutex_lock(&con->mutex);
 	if (test_bit(CLOSED, &con->state))
 		goto out_unlock;
-- 
cgit v1.2.2


From 3dd72fc0e6dc49c79fa9e7cd7c654deac7ccaa29 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 22 Mar 2010 14:42:30 -0700
Subject: ceph: rename r_sent_stamp r_stamp

Make variable name slightly more generic, since it will (soon)
reflect either the time the request was sent OR the time it was
last determined to be still retrying.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 12 ++++++------
 fs/ceph/osd_client.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 221038253e61..04359217ea6c 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -633,7 +633,7 @@ static int __send_request(struct ceph_osd_client *osdc,
 	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
 	reqhead->reassert_version = req->r_reassert_version;
 
-	req->r_sent_stamp = jiffies;
+	req->r_stamp = jiffies;
 	list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
 
 	ceph_msg_get(req->r_request); /* send consumes a ref */
@@ -660,7 +660,7 @@ static void handle_timeout(struct work_struct *work)
 	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
 	unsigned long keepalive =
 		osdc->client->mount_args->osd_keepalive_timeout * HZ;
-	unsigned long last_sent = 0;
+	unsigned long last_stamp = 0;
 	struct rb_node *p;
 	struct list_head slow_osds;
 
@@ -697,12 +697,12 @@ static void handle_timeout(struct work_struct *work)
 		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
 				 r_req_lru_item);
 
-		if (time_before(jiffies, req->r_sent_stamp + timeout))
+		if (time_before(jiffies, req->r_stamp + timeout))
 			break;
 
-		BUG_ON(req == last_req && req->r_sent_stamp == last_sent);
+		BUG_ON(req == last_req && req->r_stamp == last_stamp);
 		last_req = req;
-		last_sent = req->r_sent_stamp;
+		last_stamp = req->r_stamp;
 
 		osd = req->r_osd;
 		BUG_ON(!osd);
@@ -718,7 +718,7 @@ static void handle_timeout(struct work_struct *work)
 	 */
 	INIT_LIST_HEAD(&slow_osds);
 	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies, req->r_sent_stamp + keepalive))
+		if (time_before(jiffies, req->r_stamp + keepalive))
 			break;
 
 		osd = req->r_osd;
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 1b1a3ca43afc..b0759911e7c3 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -70,7 +70,7 @@ struct ceph_osd_request {
 
 	char              r_oid[40];          /* object name */
 	int               r_oid_len;
-	unsigned long     r_sent_stamp;
+	unsigned long     r_stamp;            /* send OR check time */
 	bool              r_resend;           /* msg send failed, needs retry */
 
 	struct ceph_file_layout r_file_layout;
-- 
cgit v1.2.2


From 87b315a5b5cec5d7086494b203577602f5befc8c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 22 Mar 2010 14:51:18 -0700
Subject: ceph: avoid reopening osd connections when address hasn't changed

We get a fault callback on _every_ tcp connection fault.  Normally, we
want to reopen the connection when that happens.  If the address we have
is bad, however, and connection attempts always result in a connection
refused or similar error, explicitly closing and reopening the msgr
connection just prevents the messenger's backoff logic from kicking in.
The result can be a console full of

[ 3974.417106] ceph: osd11 10.3.14.138:6800 connection failed
[ 3974.423295] ceph: osd11 10.3.14.138:6800 connection failed
[ 3974.429709] ceph: osd11 10.3.14.138:6800 connection failed

Instead, if we get a fault, and have outstanding requests, but the osd
address hasn't changed and the connection never successfully connected in
the first place, do nothing to the osd connection.  The messenger layer
will back off and retry periodically, because we never connected and thus
the lossy bit is not set.

Instead, touch each request's r_stamp so that handle_timeout can tell the
request is still alive and kicking.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c  |  8 ++++++++
 fs/ceph/messenger.h  |  1 +
 fs/ceph/osd_client.c | 15 ++++++++++++++-
 3 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 983285540945..a32f0f896d9f 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -365,6 +365,14 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
 	queue_con(con);
 }
 
+/*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+	return con->connect_seq > 0;
+}
+
 /*
  * generic get/put
  */
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 4caaa5911110..a343dae73cdc 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -223,6 +223,7 @@ extern void ceph_con_init(struct ceph_messenger *msgr,
 			  struct ceph_connection *con);
 extern void ceph_con_open(struct ceph_connection *con,
 			  struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
 extern void ceph_con_close(struct ceph_connection *con);
 extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
 extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 04359217ea6c..c7b4dedaace6 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -413,11 +413,22 @@ static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
  */
 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
+	struct ceph_osd_request *req;
 	int ret = 0;
 
 	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
 	if (list_empty(&osd->o_requests)) {
 		__remove_osd(osdc, osd);
+	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+			  &osd->o_con.peer_addr,
+			  sizeof(osd->o_con.peer_addr)) == 0 &&
+		   !ceph_con_opened(&osd->o_con)) {
+		dout(" osd addr hasn't changed and connection never opened,"
+		     " letting msgr retry");
+		/* touch each r_stamp for handle_timeout()'s benfit */
+		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+			req->r_stamp = jiffies;
+		ret = -EAGAIN;
 	} else {
 		ceph_con_close(&osd->o_con);
 		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
@@ -862,7 +873,9 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
 	if (kickosd) {
-		__reset_osd(osdc, kickosd);
+		err = __reset_osd(osdc, kickosd);
+		if (err == -EAGAIN)
+			return 1;
 	} else {
 		for (p = rb_first(&osdc->osds); p; p = n) {
 			struct ceph_osd *osd =
-- 
cgit v1.2.2


From ec4318bcb4c59d8b8bf7037c9f444a9887ccb265 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 19 Mar 2010 13:24:39 -0700
Subject: ceph: fix snap rebuild condition

We were rebuilding the snap context when it was not necessary
(i.e. when the realm seq hadn't changed _and_ the parent seq
was still older), which caused page snapc pointers to not match
the realm's snapc pointer (even though the snap context itself
was identical).  This confused begin_write and put it into an
endless loop.

The correct logic is: rebuild snapc if _my_ realm seq changed, or
if my parent realm's seq is newer than mine (and thus mine needs
to be rebuilt too).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 8a43bc8675eb..df04e210a055 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -314,9 +314,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	   because we rebuild_snap_realms() works _downward_ in
 	   hierarchy after each update.) */
 	if (realm->cached_context &&
-	    realm->cached_context->seq <= realm->seq &&
+	    realm->cached_context->seq == realm->seq &&
 	    (!parent ||
-	     realm->cached_context->seq <= parent->cached_context->seq)) {
+	     realm->cached_context->seq >= parent->cached_context->seq)) {
 		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
 		     " (unchanged)\n",
 		     realm->ino, realm, realm->cached_context,
-- 
cgit v1.2.2


From 8f883c24de33ba929c95e018ac0ba66e4f46734b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 19 Mar 2010 13:27:53 -0700
Subject: ceph: make write_begin wait propagate ERESTARTSYS

Currently, if the wait_event_interruptible is interrupted, we
return EAGAIN unconditionally and loop, such that we aren't, in
fact, interruptible.  So, propagate ERESTARTSYS if we get it.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 23bb0ceabe31..ce8ef6107727 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -919,6 +919,10 @@ static int context_is_writeable_or_written(struct inode *inode,
 /*
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
+ *
+ * called with page locked.
+ * return success with page locked,
+ * or any failure (incl -EAGAIN) with page unlocked.
  */
 static int ceph_update_writeable_page(struct file *file,
 			    loff_t pos, unsigned len,
@@ -961,9 +965,11 @@ retry_locked:
 			snapc = ceph_get_snap_context((void *)page->private);
 			unlock_page(page);
 			ceph_queue_writeback(inode);
-			wait_event_interruptible(ci->i_cap_wq,
+			r = wait_event_interruptible(ci->i_cap_wq,
 			       context_is_writeable_or_written(inode, snapc));
 			ceph_put_snap_context(snapc);
+			if (r == -ERESTARTSYS)
+				return r;
 			return -EAGAIN;
 		}
 
@@ -1035,7 +1041,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 	int r;
 
 	do {
-		/* get a page*/
+		/* get a page */
 		page = grab_cache_page_write_begin(mapping, index, 0);
 		if (!page)
 			return -ENOMEM;
-- 
cgit v1.2.2


From 9c423956b8a495f0c048143abc5da955a70eac97 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 20 Mar 2010 20:43:28 -0700
Subject: ceph: propagate mds session allocation failures to caller

Return error to original caller if register_session() fails.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5cbf46abfee3..b6b5348055fc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1567,8 +1567,13 @@ static int __do_request(struct ceph_mds_client *mdsc,
 
 	/* get, open session */
 	session = __ceph_lookup_mds_session(mdsc, mds);
-	if (!session)
+	if (!session) {
 		session = register_session(mdsc, mds);
+		if (IS_ERR(session)) {
+			err = PTR_ERR(session);
+			goto finish;
+		}
+	}
 	dout("do_request mds%d session %p state %s\n", mds, session,
 	     session_state_name(session->s_state));
 	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
-- 
cgit v1.2.2


From 4736b009b880b7c19bea36327a71032a6dbee402 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 20 Mar 2010 15:30:16 +0300
Subject: ceph: handle kmalloc() failure

Return ERR_PTR(-ENOMEM) if kmalloc() fails.  We handle allocation
failures the same way later in the function.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b6b5348055fc..ad0fbc3128d3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -328,6 +328,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	struct ceph_mds_session *s;
 
 	s = kzalloc(sizeof(*s), GFP_NOFS);
+	if (!s)
+		return ERR_PTR(-ENOMEM);
 	s->s_mdsc = mdsc;
 	s->s_mds = mds;
 	s->s_state = CEPH_MDS_SESSION_NEW;
-- 
cgit v1.2.2


From d96d60498ff748c5a88c72ec5d1cc4ba9a583e7e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 20 Mar 2010 20:50:58 -0700
Subject: ceph: fix session check on mds reply

Fix a broken check that a reply came back from the same MDS we sent the
request to.  I don't think a case that actually triggers this would ever
come up in practice, but it's clearly wrong and easy to fix.

Reported-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ad0fbc3128d3..5268d404963c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1778,7 +1778,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	dout("handle_reply %p\n", req);
 
 	/* correct session? */
-	if (!req->r_session && req->r_session != session) {
+	if (req->r_session != session) {
 		pr_err("mdsc_handle_reply got %llu on session mds%d"
 		       " not mds%d\n", tid, session->s_mds,
 		       req->r_session ? req->r_session->s_mds : -1);
-- 
cgit v1.2.2


From 393f66209669ad23f4f6d4191234c1df4367df3c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 10 Mar 2010 12:03:32 -0800
Subject: ceph: fix possible double-free of mds request reference

Clear pointer to mds request after dropping the reference to
ensure we don't drop it again, as there is at least one error
path through this function that does not reset fi->last_readdir
to a new value.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 5107384ee029..8a9116e15b70 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -288,8 +288,10 @@ more:
 			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
 		/* discard old result, if any */
-		if (fi->last_readdir)
+		if (fi->last_readdir) {
 			ceph_mdsc_put_request(fi->last_readdir);
+			fi->last_readdir = NULL;
+		}
 
 		/* requery frag tree, as the frag topology may have changed */
 		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-- 
cgit v1.2.2


From 110d735a0ae69bdd11af9acb6ea3b979137eb118 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 22 Mar 2010 21:36:06 +0900
Subject: nilfs2: fix hang-up of cleaner after log writer returned with error

According to the report from Andreas Beckmann (Message-ID:
<4BA54677.3090902@abeckmann.de>), nilfs in 2.6.33 kernel got stuck
after a disk full error.

This turned out to be a regression by log writer updates merged at
kernel 2.6.33.  nilfs_segctor_abort_construction, which is a cleanup
function for erroneous cases, was skipping writeback completion for
some logs.

This fixes the bug and would resolve the hang issue.

Reported-by: Andreas Beckmann <debian@abeckmann.de>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: stable <stable@kernel.org>                     [2.6.33.x]
---
 fs/nilfs2/segment.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index b6221230f840..c161d89061b5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1897,8 +1897,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
 
 	list_splice_tail_init(&sci->sc_write_logs, &logs);
 	ret = nilfs_wait_on_logs(&logs);
-	if (ret)
-		nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+	nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
 
 	list_splice_tail_init(&sci->sc_segbufs, &logs);
 	nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
-- 
cgit v1.2.2


From d067633b4483f3c7d971d8f889f35340a8635bb5 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 22 Mar 2010 19:33:43 +0900
Subject: nilfs2: fix imperfect completion wait in nilfs_wait_on_logs

nilfs_wait_on_logs has a potential to slip out before completion of
all bio requests when it met an error.  This synchronization fault may
cause unexpected results, for instance, violative access to freed
segment buffers from an end-bio callback routine.

This fixes the issue by ensuring that nilfs_wait_on_logs waits all
given logs.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segbuf.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 636eaafd6ea2..6129a431aa34 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -323,14 +323,14 @@ int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
 int nilfs_wait_on_logs(struct list_head *logs)
 {
 	struct nilfs_segment_buffer *segbuf;
-	int err;
+	int err, ret = 0;
 
 	list_for_each_entry(segbuf, logs, sb_list) {
 		err = nilfs_segbuf_wait(segbuf);
-		if (err)
-			return err;
+		if (err && !ret)
+			ret = err;
 	}
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v1.2.2


From 157f1071354db1aed885816094888e0e257c9d0a Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 11 Feb 2010 07:10:38 -0600
Subject: eCryptfs: Fix metadata in xattr feature regression

Fixes regression in 8faece5f906725c10e7a1f6caf84452abadbdc7b

When using the ecryptfs_xattr_metadata mount option, eCryptfs stores the
metadata (normally stored at the front of the file) in the user.ecryptfs
xattr.  This causes ecryptfs_crypt_stat.num_header_bytes_at_front to be
0, since there is no header data at the front of the file.  This results
in too much memory being requested and ENOMEM being returned from
ecryptfs_write_metadata().

This patch fixes the problem by using the num_header_bytes_at_front
variable for specifying the max size of the metadata, despite whether it
is stored in the header or xattr.

Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c          |  7 ++++---
 fs/ecryptfs/ecryptfs_kernel.h |  8 ++++++++
 fs/ecryptfs/inode.c           |  2 +-
 fs/ecryptfs/mmap.c            | 19 +++++--------------
 4 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..c907f6f49351 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -381,8 +381,8 @@ out:
 static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
 					     struct ecryptfs_crypt_stat *crypt_stat)
 {
-	(*offset) = (crypt_stat->num_header_bytes_at_front
-		     + (crypt_stat->extent_size * extent_num));
+	(*offset) = ecryptfs_lower_header_size(crypt_stat)
+		    + (crypt_stat->extent_size * extent_num);
 }
 
 /**
@@ -834,7 +834,8 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
 	set_extent_mask_and_shift(crypt_stat);
 	crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		crypt_stat->num_header_bytes_at_front = 0;
+		crypt_stat->num_header_bytes_at_front =
+					ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 	else {
 		if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
 			crypt_stat->num_header_bytes_at_front =
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..8456f70606ad 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -464,6 +464,14 @@ struct ecryptfs_daemon {
 
 extern struct mutex ecryptfs_daemon_hash_mux;
 
+static inline size_t
+ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
+		return 0;
+	return crypt_stat->num_header_bytes_at_front;
+}
+
 static inline struct ecryptfs_file_info *
 ecryptfs_file_to_private(struct file *file)
 {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..1a739531b0dd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -768,7 +768,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
 {
 	loff_t lower_size;
 
-	lower_size = crypt_stat->num_header_bytes_at_front;
+	lower_size = ecryptfs_lower_header_size(crypt_stat);
 	if (upper_size != 0) {
 		loff_t num_extents;
 
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..5a30e01547f1 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -97,19 +97,6 @@ out:
  *                        (big-endian)
  *     Octet  26:         Begin RFC 2440 authentication token packet set
  */
-static void set_header_info(char *page_virt,
-			    struct ecryptfs_crypt_stat *crypt_stat)
-{
-	size_t written;
-	size_t save_num_header_bytes_at_front =
-		crypt_stat->num_header_bytes_at_front;
-
-	crypt_stat->num_header_bytes_at_front =
-		ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
-	ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
-	crypt_stat->num_header_bytes_at_front =
-		save_num_header_bytes_at_front;
-}
 
 /**
  * ecryptfs_copy_up_encrypted_with_header
@@ -146,9 +133,13 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 			memset(page_virt, 0, PAGE_CACHE_SIZE);
 			/* TODO: Support more than one header extent */
 			if (view_extent_num == 0) {
+				size_t written;
+
 				rc = ecryptfs_read_xattr_region(
 					page_virt, page->mapping->host);
-				set_header_info(page_virt, crypt_stat);
+				ecryptfs_write_header_metadata(page_virt + 20,
+							       crypt_stat,
+							       &written);
 			}
 			kunmap_atomic(page_virt, KM_USER0);
 			flush_dcache_page(page);
-- 
cgit v1.2.2


From fa3ef1cb4e6e9958a9bfaa977c107c515907f102 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 11 Feb 2010 05:09:14 -0600
Subject: eCryptfs: Rename ecryptfs_crypt_stat.num_header_bytes_at_front

This patch renames the num_header_bytes_at_front variable to
metadata_size since it now contains the max size of the metadata.

Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c          | 24 ++++++++++--------------
 fs/ecryptfs/ecryptfs_kernel.h |  4 ++--
 fs/ecryptfs/inode.c           |  2 +-
 fs/ecryptfs/mmap.c            |  5 ++---
 4 files changed, 15 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c907f6f49351..391f558eb4d0 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -834,14 +834,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
 	set_extent_mask_and_shift(crypt_stat);
 	crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		crypt_stat->num_header_bytes_at_front =
-					ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+		crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 	else {
 		if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
-			crypt_stat->num_header_bytes_at_front =
+			crypt_stat->metadata_size =
 				ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 		else
-			crypt_stat->num_header_bytes_at_front =	PAGE_CACHE_SIZE;
+			crypt_stat->metadata_size = PAGE_CACHE_SIZE;
 	}
 }
 
@@ -1238,8 +1237,7 @@ ecryptfs_write_header_metadata(char *virt,
 
 	header_extent_size = (u32)crypt_stat->extent_size;
 	num_header_extents_at_front =
-		(u16)(crypt_stat->num_header_bytes_at_front
-		      / crypt_stat->extent_size);
+		(u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
 	put_unaligned_be32(header_extent_size, virt);
 	virt += 4;
 	put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1382,7 +1380,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		rc = -EINVAL;
 		goto out;
 	}
-	virt_len = crypt_stat->num_header_bytes_at_front;
+	virt_len = crypt_stat->metadata_size;
 	order = get_order(virt_len);
 	/* Released in this function */
 	virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1428,16 +1426,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
 	header_extent_size = get_unaligned_be32(virt);
 	virt += sizeof(__be32);
 	num_header_extents_at_front = get_unaligned_be16(virt);
-	crypt_stat->num_header_bytes_at_front =
-		(((size_t)num_header_extents_at_front
-		  * (size_t)header_extent_size));
+	crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
+				     * (size_t)header_extent_size));
 	(*bytes_read) = (sizeof(__be32) + sizeof(__be16));
 	if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
-	    && (crypt_stat->num_header_bytes_at_front
+	    && (crypt_stat->metadata_size
 		< ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
 		rc = -EINVAL;
 		printk(KERN_WARNING "Invalid header size: [%zd]\n",
-		       crypt_stat->num_header_bytes_at_front);
+		       crypt_stat->metadata_size);
 	}
 	return rc;
 }
@@ -1452,8 +1449,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
  */
 static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
 {
-	crypt_stat->num_header_bytes_at_front =
-		ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+	crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 }
 
 /**
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 8456f70606ad..d031efd7666b 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -273,7 +273,7 @@ struct ecryptfs_crypt_stat {
 	u32 flags;
 	unsigned int file_version;
 	size_t iv_bytes;
-	size_t num_header_bytes_at_front;
+	size_t metadata_size;
 	size_t extent_size; /* Data extent size; default is 4096 */
 	size_t key_size;
 	size_t extent_shift;
@@ -469,7 +469,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
 {
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
 		return 0;
-	return crypt_stat->num_header_bytes_at_front;
+	return crypt_stat->metadata_size;
 }
 
 static inline struct ecryptfs_file_info *
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1a739531b0dd..a50efb18701c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -335,7 +335,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		ecryptfs_dentry->d_sb)->mount_crypt_stat;
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
 		if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-			file_size = (crypt_stat->num_header_bytes_at_front
+			file_size = (crypt_stat->metadata_size
 				     + i_size_read(lower_dentry->d_inode));
 		else
 			file_size = i_size_read(lower_dentry->d_inode);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 5a30e01547f1..270f42ae7c0d 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -122,8 +122,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 					   * num_extents_per_page)
 					  + extent_num_in_page);
 		size_t num_header_extents_at_front =
-			(crypt_stat->num_header_bytes_at_front
-			 / crypt_stat->extent_size);
+			(crypt_stat->metadata_size / crypt_stat->extent_size);
 
 		if (view_extent_num < num_header_extents_at_front) {
 			/* This is a header extent */
@@ -152,7 +151,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 			/* This is an encrypted data extent */
 			loff_t lower_offset =
 				((view_extent_num * crypt_stat->extent_size)
-				 - crypt_stat->num_header_bytes_at_front);
+				 - crypt_stat->metadata_size);
 
 			rc = ecryptfs_read_lower_page_segment(
 				page, (lower_offset >> PAGE_CACHE_SHIFT),
-- 
cgit v1.2.2


From 1984c23f9e0cdb432d90a85ecf88b424d36878fc Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Wed, 10 Feb 2010 23:17:44 -0600
Subject: eCryptfs: Clear buffer before reading in metadata xattr

We initially read in the first PAGE_CACHE_SIZE of a file to if the
eCryptfs header marker can be found.  If it isn't found and
ecryptfs_xattr_metadata was given as a mount option, then the
user.ecryptfs xattr is read into the same buffer.  Since the data from
the first page of the file wasn't cleared, it is possible that we think
we've found a second tag 3 or tag 1 packet and then error out after the
packet contents aren't as expected.  This patch clears the buffer before
filling it with metadata from the user.ecryptfs xattr.

Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c | 1 +
 fs/ecryptfs/inode.c  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 391f558eb4d0..4d9db0ed88ea 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1603,6 +1603,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 						ecryptfs_dentry,
 						ECRYPTFS_VALIDATE_HEADER_SIZE);
 	if (rc) {
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
 		if (rc) {
 			printk(KERN_DEBUG "Valid eCryptfs headers not found in "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a50efb18701c..ddbd096c7406 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -323,6 +323,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 	rc = ecryptfs_read_and_validate_header_region(page_virt,
 						      ecryptfs_dentry->d_inode);
 	if (rc) {
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
 		rc = ecryptfs_read_and_validate_xattr_region(page_virt,
 							     ecryptfs_dentry);
 		if (rc) {
-- 
cgit v1.2.2


From f4e60e6b303bc46cdc477d3174dbf9cb5dd013aa Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 11 Feb 2010 00:02:32 -0600
Subject: eCryptfs: Strip metadata in xattr flag in encrypted view

The ecryptfs_encrypted_view mount option provides a unified way of
viewing encrypted eCryptfs files.  If the metadata is stored in a xattr,
the metadata is moved to the file header when the file is read inside
the eCryptfs mount.  Because of this, we should strip the
ECRYPTFS_METADATA_IN_XATTR flag from the header's flag section.  This
allows eCryptfs to treat the file as an eCryptfs file with a header
at the front.

Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c          |  9 +++++----
 fs/ecryptfs/ecryptfs_kernel.h |  3 +++
 fs/ecryptfs/mmap.c            | 14 ++++++++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 4d9db0ed88ea..fad5bf6a6116 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1107,9 +1107,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
 	(*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
 }
 
-static void
-write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
-		     size_t *written)
+void ecryptfs_write_crypt_stat_flags(char *page_virt,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     size_t *written)
 {
 	u32 flags = 0;
 	int i;
@@ -1290,7 +1290,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
 	offset = ECRYPTFS_FILE_SIZE_BYTES;
 	write_ecryptfs_marker((page_virt + offset), &written);
 	offset += written;
-	write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
+	ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
+					&written);
 	offset += written;
 	ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
 				       &written);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index d031efd7666b..bc7115403f38 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -659,6 +659,9 @@ int ecryptfs_decrypt_page(struct page *page);
 int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
 int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
 int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
+void ecryptfs_write_crypt_stat_flags(char *page_virt,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     size_t *written);
 int ecryptfs_read_and_validate_header_region(char *data,
 					     struct inode *ecryptfs_inode);
 int ecryptfs_read_and_validate_xattr_region(char *page_virt,
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 270f42ae7c0d..bea998a25afd 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -82,6 +82,19 @@ out:
 	return rc;
 }
 
+static void strip_xattr_flag(char *page_virt,
+			     struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
+		size_t written;
+
+		crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
+		ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
+						&written);
+		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+	}
+}
+
 /**
  *   Header Extent:
  *     Octets 0-7:        Unencrypted file size (big-endian)
@@ -136,6 +149,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 
 				rc = ecryptfs_read_xattr_region(
 					page_virt, page->mapping->host);
+				strip_xattr_flag(page_virt + 16, crypt_stat);
 				ecryptfs_write_header_metadata(page_virt + 20,
 							       crypt_stat,
 							       &written);
-- 
cgit v1.2.2


From b4414eea0e7b9c134262c801a87e338bf675962c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Thu, 11 Mar 2010 18:31:09 -0800
Subject: ocfs2: Clear undo bits when local alloc is freed

When the local alloc file changes windows, unused bits are freed back to the
global bitmap. By defnition, those bits can not be in use by any file. Also,
the local alloc will never have been able to allocate those bits if they
were part of a previous truncate. Therefore it makes sense that we should
clear unused local alloc bits in the undo buffer so that they can be used
immediatly.

[ Modified to call it ocfs2_release_clusters() -- Joel ]

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/localalloc.c |   6 ++-
 fs/ocfs2/ocfs2.h      |  14 +++++-
 fs/ocfs2/suballoc.c   | 116 ++++++++++++++++++++++++++++++++------------------
 fs/ocfs2/suballoc.h   |   5 +++
 4 files changed, 95 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 171c691b42a0..c983715d8d8c 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 			     (unsigned long long)la_start_blk,
 			     (unsigned long long)blkno);
 
-			status = ocfs2_free_clusters(handle, main_bm_inode,
-						     main_bm_bh, blkno, count);
+			status = ocfs2_release_clusters(handle,
+							main_bm_inode,
+							main_bm_bh, blkno,
+							count);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db90..adf5e2ebc2c4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -763,8 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
 	return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
 }
 
-#define ocfs2_set_bit ext2_set_bit
-#define ocfs2_clear_bit ext2_clear_bit
+static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
+{
+	ext2_set_bit(bit, bitmap);
+}
+#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
+
+static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
+{
+	ext2_clear_bit(bit, bitmap);
+}
+#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
+
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
 #define ocfs2_find_next_bit ext2_find_next_bit
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 0016503d83ef..19ba00f28547 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 					     struct buffer_head *group_bh,
 					     unsigned int bit_off,
 					     unsigned int num_bits);
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
-					       struct inode *alloc_inode,
-					       struct ocfs2_group_desc *bg,
-					       struct buffer_head *group_bh,
-					       unsigned int bit_off,
-					       unsigned int num_bits);
-
 static int ocfs2_relink_block_group(handle_t *handle,
 				    struct inode *alloc_inode,
 				    struct buffer_head *fe_bh,
@@ -1978,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
 				      bits_wanted, cluster_start, num_clusters);
 }
 
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
-					       struct inode *alloc_inode,
-					       struct ocfs2_group_desc *bg,
-					       struct buffer_head *group_bh,
-					       unsigned int bit_off,
-					       unsigned int num_bits)
+static int ocfs2_block_group_clear_bits(handle_t *handle,
+					struct inode *alloc_inode,
+					struct ocfs2_group_desc *bg,
+					struct buffer_head *group_bh,
+					unsigned int bit_off,
+					unsigned int num_bits,
+					void (*undo_fn)(unsigned int bit,
+							unsigned long *bmap))
 {
 	int status;
 	unsigned int tmp;
-	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
 	struct ocfs2_group_desc *undo_bg = NULL;
-	int cluster_bitmap = 0;
 
 	mlog_entry_void();
 
@@ -1999,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 
-	if (ocfs2_is_cluster_bitmap(alloc_inode))
-		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
+	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
-					 group_bh, journal_type);
+					 group_bh,
+					 undo_fn ?
+					 OCFS2_JOURNAL_ACCESS_UNDO :
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	if (ocfs2_is_cluster_bitmap(alloc_inode))
-		cluster_bitmap = 1;
-
-	if (cluster_bitmap) {
+	if (undo_fn) {
 		jbd_lock_bh_state(group_bh);
 		undo_bg = (struct ocfs2_group_desc *)
 					bh2jh(group_bh)->b_committed_data;
@@ -2023,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 	while(tmp--) {
 		ocfs2_clear_bit((bit_off + tmp),
 				(unsigned long *) bg->bg_bitmap);
-		if (cluster_bitmap)
-			ocfs2_set_bit(bit_off + tmp,
-				      (unsigned long *) undo_bg->bg_bitmap);
+		if (undo_fn)
+			undo_fn(bit_off + tmp,
+				(unsigned long *) undo_bg->bg_bitmap);
 	}
 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
 
-	if (cluster_bitmap)
+	if (undo_fn)
 		jbd_unlock_bh_state(group_bh);
 
 	status = ocfs2_journal_dirty(handle, group_bh);
@@ -2042,12 +2033,14 @@ bail:
 /*
  * expects the suballoc inode to already be locked.
  */
-int ocfs2_free_suballoc_bits(handle_t *handle,
-			     struct inode *alloc_inode,
-			     struct buffer_head *alloc_bh,
-			     unsigned int start_bit,
-			     u64 bg_blkno,
-			     unsigned int count)
+static int _ocfs2_free_suballoc_bits(handle_t *handle,
+				     struct inode *alloc_inode,
+				     struct buffer_head *alloc_bh,
+				     unsigned int start_bit,
+				     u64 bg_blkno,
+				     unsigned int count,
+				     void (*undo_fn)(unsigned int bit,
+						     unsigned long *bitmap))
 {
 	int status = 0;
 	u32 tmp_used;
@@ -2082,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
 					      group, group_bh,
-					      start_bit, count);
+					      start_bit, count, undo_fn);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -2113,6 +2106,17 @@ bail:
 	return status;
 }
 
+int ocfs2_free_suballoc_bits(handle_t *handle,
+			     struct inode *alloc_inode,
+			     struct buffer_head *alloc_bh,
+			     unsigned int start_bit,
+			     u64 bg_blkno,
+			     unsigned int count)
+{
+	return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+					 start_bit, bg_blkno, count, NULL);
+}
+
 int ocfs2_free_dinode(handle_t *handle,
 		      struct inode *inode_alloc_inode,
 		      struct buffer_head *inode_alloc_bh,
@@ -2126,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
 					inode_alloc_bh, bit, bg_blkno, 1);
 }
 
-int ocfs2_free_clusters(handle_t *handle,
-		       struct inode *bitmap_inode,
-		       struct buffer_head *bitmap_bh,
-		       u64 start_blk,
-		       unsigned int num_clusters)
+static int _ocfs2_free_clusters(handle_t *handle,
+				struct inode *bitmap_inode,
+				struct buffer_head *bitmap_bh,
+				u64 start_blk,
+				unsigned int num_clusters,
+				void (*undo_fn)(unsigned int bit,
+						unsigned long *bitmap))
 {
 	int status;
 	u16 bg_start_bit;
@@ -2157,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
 	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
 	     (unsigned long long)bg_blkno, bg_start_bit);
 
-	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
-					  bg_start_bit, bg_blkno,
-					  num_clusters);
+	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+					   bg_start_bit, bg_blkno,
+					   num_clusters, undo_fn);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out;
@@ -2173,6 +2179,32 @@ out:
 	return status;
 }
 
+int ocfs2_free_clusters(handle_t *handle,
+			struct inode *bitmap_inode,
+			struct buffer_head *bitmap_bh,
+			u64 start_blk,
+			unsigned int num_clusters)
+{
+	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+				    start_blk, num_clusters,
+				    _ocfs2_set_bit);
+}
+
+/*
+ * Give never-used clusters back to the global bitmap.  We don't need
+ * to protect these bits in the undo buffer.
+ */
+int ocfs2_release_clusters(handle_t *handle,
+			   struct inode *bitmap_inode,
+			   struct buffer_head *bitmap_bh,
+			   u64 start_blk,
+			   unsigned int num_clusters)
+{
+	return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+				    start_blk, num_clusters,
+				    _ocfs2_clear_bit);
+}
+
 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
 {
 	printk("Block Group:\n");
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e8..e0f46df357e6 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -127,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
 			struct buffer_head *bitmap_bh,
 			u64 start_blk,
 			unsigned int num_clusters);
+int ocfs2_release_clusters(handle_t *handle,
+			   struct inode *bitmap_inode,
+			   struct buffer_head *bitmap_bh,
+			   u64 start_blk,
+			   unsigned int num_clusters);
 
 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
 {
-- 
cgit v1.2.2


From 3939fda4b389993caf8741df5739b3e49f33a263 Mon Sep 17 00:00:00 2001
From: Tristan Ye <tristan.ye@oracle.com>
Date: Fri, 19 Mar 2010 09:21:09 +0800
Subject: Ocfs2: Journaling i_flags and i_orphaned_slot when adding inode to
 orphan dir.

Currently, some callers were missing to journal the dirty inode after
adding it to orphan dir.

Now we're going to journal such modifications within the ocfs2_orphan_add()
itself, It's safe to do so, though some existing caller may duplicate this,
and it makes the logic look more straightforward anyway.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/namei.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a53..b1eb50ae4097 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct ocfs2_dinode *fe,
+			    struct buffer_head *fe_bh,
 			    char *name,
 			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode);
@@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	if (inode_is_unlinkable(inode)) {
-		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+		status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
 					  &orphan_insert, orphan_dir);
 		if (status < 0) {
 			mlog_errno(status);
@@ -1300,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		if (S_ISDIR(new_inode->i_mode) ||
 		    (ocfs2_read_links_count(newfe) == 1)) {
 			status = ocfs2_orphan_add(osb, handle, new_inode,
-						  newfe, orphan_name,
+						  newfe_bh, orphan_name,
 						  &orphan_insert, orphan_dir);
 			if (status < 0) {
 				mlog_errno(status);
@@ -1911,7 +1911,7 @@ leave:
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct ocfs2_dinode *fe,
+			    struct buffer_head *fe_bh,
 			    char *name,
 			    struct ocfs2_dir_lookup_result *lookup,
 			    struct inode *orphan_dir_inode)
@@ -1919,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	struct buffer_head *orphan_dir_bh = NULL;
 	int status = 0;
 	struct ocfs2_dinode *orphan_fe;
+	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
@@ -1959,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 		goto leave;
 	}
 
+	/*
+	 * We're going to journal the change of i_flags and i_orphaned_slot.
+	 * It's safe anyway, though some callers may duplicate the journaling.
+	 * Journaling within the func just make the logic look more
+	 * straightforward.
+	 */
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(inode),
+					 fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
 
 	/* Record which orphan dir our inode now resides
@@ -1966,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	 * dir to lock. */
 	fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
 
+	ocfs2_journal_dirty(handle, fe_bh);
+
 	mlog(0, "Inode %llu orphaned in slot %d\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
 
@@ -2123,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
 	}
 
 	di = (struct ocfs2_dinode *)new_di_bh->b_data;
-	status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
+	status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
 				  &orphan_insert, orphan_dir);
 	if (status < 0) {
 		mlog_errno(status);
-- 
cgit v1.2.2


From b54c2ca475fa7d7450a45b6d778dae9dbe0bcbfe Mon Sep 17 00:00:00 2001
From: Tristan Ye <tristan.ye@oracle.com>
Date: Fri, 19 Mar 2010 09:21:10 +0800
Subject: Ocfs2: Handle deletion of reflinked oprhan inodes correctly.

The rule is that all inodes in the orphan dir have ORPHANED_FL,
otherwise we treated it as an ERROR.  This rule works well except
for some rare cases of reflink operation:

http://oss.oracle.com/bugzilla/show_bug.cgi?id=1215

The problem is caused by how reflink and our orphan_scan thread
interact.

 * The orphan scan pulls the orphans into a queue first, then runs the
   queue at a later time.  We only hold the orphan_dir's lock
   during scanning.

 * Reflink create a oprhaned target in orphan_dir as its first step.
   It removes the target and clears the flag as the final step.
   These two steps take the orphan_dir's lock, but it is not held for
   the duration.

Based on the above semantics, a reflink inode can be moved out of the
orphan dir and have its ORPHANED_FL cleared before the queue of orphans
is run.  This leads to a ERROR in ocfs2_query_wipde_inode().

This patch teaches ocfs2_query_wipe_inode() to detect previously
orphaned reflink targets.  If a reflink fails or a crash occurs during
the relfink operation, the inode will retain ORPHANED_FL and will be
properly wiped.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/inode.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae14..ab207901d32a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -891,6 +891,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 	/* Do some basic inode verification... */
 	di = (struct ocfs2_dinode *) di_bh->b_data;
 	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+		/*
+		 * Inodes in the orphan dir must have ORPHANED_FL.  The only
+		 * inodes that come back out of the orphan dir are reflink
+		 * targets. A reflink target may be moved out of the orphan
+		 * dir between the time we scan the directory and the time we
+		 * process it. This would lead to HAS_REFCOUNT_FL being set but
+		 * ORPHANED_FL not.
+		 */
+		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+			mlog(0, "Reflinked inode %llu is no longer orphaned.  "
+			     "it shouldn't be deleted\n",
+			     (unsigned long long)oi->ip_blkno);
+			goto bail;
+		}
+
 		/* for lack of a better error? */
 		status = -EEXIST;
 		mlog(ML_ERROR,
-- 
cgit v1.2.2


From 14741472a05245ed5778aa0aec055e1f920b6ef8 Mon Sep 17 00:00:00 2001
From: Srinivas Eeda <srinivas.eeda@oracle.com>
Date: Mon, 22 Mar 2010 16:50:47 -0700
Subject: ocfs2: Fix a race in o2dlm lockres mastery

In o2dlm, the master of a lock resource keeps a map of all interested
nodes.  This prevents the master from purging the resource before an
interested node can create a lock.

A race between the mastery thread and the mastery handler allowed an
interested node to discover who the master is without informing the
master directly.  This is easily fixed by holding the dlm spinlock a
little longer in the mastery handler.

Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb95..9289b4357d27 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 ok:
 		spin_unlock(&res->spinlock);
 	}
-	spin_unlock(&dlm->spinlock);
 
 	// mlog(0, "woo!  got an assert_master from node %u!\n",
 	// 	     assert->node_idx);
@@ -1926,7 +1925,6 @@ ok:
 		/* master is known, detach if not already detached.
 		 * ensures that only one assert_master call will happen
 		 * on this mle. */
-		spin_lock(&dlm->spinlock);
 		spin_lock(&dlm->master_lock);
 
 		rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
 			__dlm_put_mle(mle);
 		}
 		spin_unlock(&dlm->master_lock);
-		spin_unlock(&dlm->spinlock);
 	} else if (res) {
 		if (res->owner != assert->node_idx) {
 			mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
 			     res->owner, namelen, name);
 		}
 	}
+	spin_unlock(&dlm->spinlock);
 
 done:
 	ret = 0;
-- 
cgit v1.2.2


From 37f328eb60a94779dd020084209fc4db2d6444a0 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 24 Mar 2010 20:06:41 -0400
Subject: ext4: Fix spelling of CONTIG_FS_EXT3 to CONFIG_FS_EXT3

Oops.  (Blush.)

Thanks to Sedat Dilek for pointing this out.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ce84a6ed4a48..f4b038f51f31 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4068,7 +4068,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
-#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext2",
@@ -4095,7 +4095,7 @@ static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
 #endif
 
-#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext3",
-- 
cgit v1.2.2


From ba69f9ab7df844125898104e854e063b47c26637 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 24 Mar 2010 20:18:37 -0400
Subject: ext4: Don't use delayed allocation by default when used instead of
 ext3

When ext4 driver is used to mount a filesystem instead of the ext3 file
system driver (through CONFIG_EXT4_USE_FOR_EXT23), do not enable delayed
allocation by default since some ext3 users and application writers have
developed unfortunate expectations about the safety of writing files on
systems subject to sudden and violent death without using fsync().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f4b038f51f31..29c6875d9b4d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
 static int ext4_freeze(struct super_block *sb);
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt);
 
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext3",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+#define IS_EXT3_SB(sb) (0)
+#endif
 
 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -2539,7 +2553,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
 	 */
-	set_opt(sbi->s_mount_opt, DELALLOC);
+	if (!IS_EXT3_SB(sb))
+		set_opt(sbi->s_mount_opt, DELALLOC);
 
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
@@ -4096,14 +4111,6 @@ static inline void unregister_as_ext2(void) { }
 #endif
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext3_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext3",
-	.get_sb		= ext4_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
 static inline void register_as_ext3(void)
 {
 	int err = register_filesystem(&ext3_fs_type);
-- 
cgit v1.2.2


From c4caae25187ff3f5e837c6f04eb1acc2723c72d3 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 23 Mar 2010 21:32:00 -0400
Subject: ext4: Fixed inode allocator to correctly track a flex_bg's used_dirs

When used_dirs was introduced for the flex_groups struct, it looks
like the accounting was not put into place properly, in some places
manipulating free_inodes rather than used_dirs.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 361c0b9962a8..57f6eef6ccd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -263,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 					ext4_group_t f;
 
 					f = ext4_flex_group(sbi, block_group);
-					atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+					atomic_dec(&sbi->s_flex_groups[f].used_dirs);
 				}
 
 			}
@@ -773,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
 		if (sbi->s_log_groups_per_flex) {
 			ext4_group_t f = ext4_flex_group(sbi, group);
 
-			atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
 		}
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-- 
cgit v1.2.2


From 7731d9a5d415414aa6903709453786d4a5ff57e4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Tue, 23 Mar 2010 13:35:15 -0700
Subject: fs/binfmt_aout.c: fix pointer warnings

fs/binfmt_aout.c: In function `aout_core_dump':
fs/binfmt_aout.c:125: warning: passing argument 2 of `dump_write' makes pointer from integer without a cast
include/linux/coredump.h:12: note: expected `const void *' but argument is of type `long unsigned int'
fs/binfmt_aout.c:132: warning: passing argument 2 of `dump_write' makes pointer from integer without a cast
include/linux/coredump.h:12: note: expected `const void *' but argument is of type `long unsigned int'

due to dump_write() expecting a user void *.  Fold casts into the
START_DATA/START_STACK macros and shut up the warnings.

Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Cc: Daisuke HATAYAMA <d.hatayama@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_aout.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6f..9b6aef0f75e5 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
 	struct file *file = cprm->file;
 	mm_segment_t fs;
 	int has_dumped = 0;
-	unsigned long dump_start, dump_size;
+	void __user *dump_start;
+	int dump_size;
 	struct user dump;
 #ifdef __alpha__
-#       define START_DATA(u)	(u.start_data)
+#       define START_DATA(u)	((void __user *)u.start_data)
 #else
-#	define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
+#	define START_DATA(u)	((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+				 u.start_code))
 #endif
-#       define START_STACK(u)   (u.start_stack)
+#       define START_STACK(u)   ((void __user *)u.start_stack)
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+	if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
 
 	set_fs(KERNEL_DS);
-- 
cgit v1.2.2


From 6cb4aff0a77cc0e6bae9475d62205319e3ebbf3f Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 23 Mar 2010 13:35:38 -0700
Subject: reiserfs: fix oops while creating privroot with selinux enabled

Commit 57fe60df ("reiserfs: add atomic addition of selinux attributes
during inode creation") contains a bug that will cause it to oops when
mounting a file system that didn't previously contain extended attributes
on a system using security.* xattrs.

The issue is that while creating the privroot during mount
reiserfs_security_init calls reiserfs_xattr_jcreate_nblocks which
dereferences the xattr root.  The xattr root doesn't exist, so we get an
oops.

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=15309

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/xattr_security.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..de1fcffd906b 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -76,7 +76,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 		return error;
 	}
 
-	if (sec->length) {
+	if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
 		blocks = reiserfs_xattr_jcreate_nblocks(inode) +
 			 reiserfs_xattr_nblocks(inode, sec->length);
 		/* We don't want to count the directories twice if we have
-- 
cgit v1.2.2


From 3f8b5ee33293d43ca360771b535dfae8c57259dc Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 23 Mar 2010 13:35:39 -0700
Subject: reiserfs: properly honor read-only devices

The reiserfs journal behaves inconsistently when determining whether to
allow a mount of a read-only device.

This is due to the use of the continue_replay variable to short circuit
the journal scanning.  If it's set, it's assumed that there are
transactions to replay, but there may not be.  If it's unset, it's assumed
that there aren't any, and that may not be the case either.

I've observed two failure cases:
1) Where a clean file system on a read-only device refuses to mount
2) Where a clean file system on a read-only device passes the
   optimization and then tries writing the journal header to update
   the latest mount id.

The former is easily observable by using a freshly created file system on
a read-only loopback device.

This patch moves the check into journal_read_transaction, where it can
bail out before it's about to replay a transaction.  That way it can go
through and skip transactions where appropriate, yet still refuse to mount
a file system with outstanding transactions.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/journal.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..f3de5e8a2ae8 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2217,6 +2217,15 @@ static int journal_read_transaction(struct super_block *sb,
 		brelse(d_bh);
 		return 1;
 	}
+
+	if (bdev_read_only(sb->s_bdev)) {
+		reiserfs_warning(sb, "clm-2076",
+				 "device is readonly, unable to replay log");
+		brelse(c_bh);
+		brelse(d_bh);
+		return -EROFS;
+	}
+
 	trans_id = get_desc_trans_id(desc);
 	/* now we know we've got a good transaction, and it was inside the valid time ranges */
 	log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2468,6 @@ static int journal_read(struct super_block *sb)
 		goto start_log_replay;
 	}
 
-	if (continue_replay && bdev_read_only(sb->s_bdev)) {
-		reiserfs_warning(sb, "clm-2076",
-				 "device is readonly, unable to replay log");
-		return -1;
-	}
-
 	/* ok, there are transactions that need to be replayed.  start with the first log block, find
 	 ** all the valid transactions, and pick out the oldest.
 	 */
-- 
cgit v1.2.2


From 4fd2c20d964a8fb9861045f1022475c9d200d684 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 23 Mar 2010 13:35:42 -0700
Subject: kcore: fix test for end of list

"m" is never NULL here.  We need a different test for the end of list
condition.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..b442dac8f5f9 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -490,7 +490,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		}
 		read_unlock(&kclist_lock);
 
-		if (m == NULL) {
+		if (&m->list == &kclist_head) {
 			if (clear_user(buffer, tsz))
 				return -EFAULT;
 		} else if (is_vmalloc_or_module_addr((void *)start)) {
-- 
cgit v1.2.2


From 3fbf586cf7f245392142e5407c2a56f1cff979b6 Mon Sep 17 00:00:00 2001
From: Daniel Taylor <Daniel.Taylor@wdc.com>
Date: Tue, 23 Mar 2010 13:35:50 -0700
Subject: fs/partitions/msdos: add support for large disks

In order to use disks larger than 2TiB on Windows XP, it is necessary to
use 4096-byte logical sectors in an MBR.

Although the kernel storage and functions called from msdos.c used
"sector_t" internally, msdos.c still used u32 variables, which results in
the ability to handle XP-compatible large disks.

This patch changes the internal variables to "sector_t".

Daniel said: "In the near future, WD will be releasing products that need
this patch".

[hirofumi@mail.parknet.co.jp: tweaks and fix]
Signed-off-by: Daniel Taylor <daniel.taylor@wdc.com>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/msdos.c | 72 +++++++++++++++++++++++++++------------------------
 1 file changed, 38 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..b78385840a55 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
  */
 #include <asm/unaligned.h>
 
-#define SYS_IND(p)	(get_unaligned(&p->sys_ind))
-#define NR_SECTS(p)	({ __le32 __a =	get_unaligned(&p->nr_sects);	\
-				le32_to_cpu(__a); \
-			})
+#define SYS_IND(p)	get_unaligned(&p->sys_ind)
 
-#define START_SECT(p)	({ __le32 __a =	get_unaligned(&p->start_sect);	\
-				le32_to_cpu(__a); \
-			})
+static inline sector_t nr_sects(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+	return (sector_t)get_unaligned_le32(&p->start_sect);
+}
 
 static inline int is_extended_partition(struct partition *p)
 {
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
 
 static void
 parse_extended(struct parsed_partitions *state, struct block_device *bdev,
-			u32 first_sector, u32 first_size)
+			sector_t first_sector, sector_t first_size)
 {
 	struct partition *p;
 	Sector sect;
 	unsigned char *data;
-	u32 this_sector, this_size;
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t this_sector, this_size;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 		 * First process the data partition(s)
 		 */
 		for (i=0; i<4; i++, p++) {
-			u32 offs, size, next;
-			if (!NR_SECTS(p) || is_extended_partition(p))
+			sector_t offs, size, next;
+			if (!nr_sects(p) || is_extended_partition(p))
 				continue;
 
 			/* Check the 3rd and 4th entries -
 			   these sometimes contain random garbage */
-			offs = START_SECT(p)*sector_size;
-			size = NR_SECTS(p)*sector_size;
+			offs = start_sect(p)*sector_size;
+			size = nr_sects(p)*sector_size;
 			next = this_sector + offs;
 			if (i >= 2) {
 				if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 		 */
 		p -= 4;
 		for (i=0; i<4; i++, p++)
-			if (NR_SECTS(p) && is_extended_partition(p))
+			if (nr_sects(p) && is_extended_partition(p))
 				break;
 		if (i == 4)
 			goto done;	 /* nothing left to do */
 
-		this_sector = first_sector + START_SECT(p) * sector_size;
-		this_size = NR_SECTS(p) * sector_size;
+		this_sector = first_sector + start_sect(p) * sector_size;
+		this_size = nr_sects(p) * sector_size;
 		put_dev_sector(sect);
 	}
 done:
@@ -197,7 +200,7 @@ done:
 
 static void
 parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
-			u32 offset, u32 size, int origin)
+			sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_SOLARIS_X86_PARTITION
 	Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin, char *flavour,
+		sector_t offset, sector_t size, int origin, char *flavour,
 		int max_partitions)
 {
 	Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 	if (le16_to_cpu(l->d_npartitions) < max_partitions)
 		max_partitions = le16_to_cpu(l->d_npartitions);
 	for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
-		u32 bsd_start, bsd_size;
+		sector_t bsd_start, bsd_size;
 
 		if (state->next == state->limit)
 			break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
 
 static void
 parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_BSD_DISKLABEL
 	parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_UNIXWARE_DISKLABEL
 	Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
 
 		if (p->s_label != UNIXWARE_FS_UNUSED)
 			put_partition(state, state->next++,
-						START_SECT(p), NR_SECTS(p));
+				      le32_to_cpu(p->start_sect),
+				      le32_to_cpu(p->nr_sects));
 		p++;
 	}
 	put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
  */
 static void
 parse_minix(struct parsed_partitions *state, struct block_device *bdev,
-		u32 offset, u32 size, int origin)
+		sector_t offset, sector_t size, int origin)
 {
 #ifdef CONFIG_MINIX_SUBPARTITION
 	Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 			/* add each partition in use */
 			if (SYS_IND(p) == MINIX_PARTITION)
 				put_partition(state, state->next++,
-					      START_SECT(p), NR_SECTS(p));
+					      start_sect(p), nr_sects(p));
 		}
 		printk(" >\n");
 	}
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
 static struct {
 	unsigned char id;
 	void (*parse)(struct parsed_partitions *, struct block_device *,
-			u32, u32, int);
+			sector_t, sector_t, int);
 } subtypes[] = {
 	{FREEBSD_PARTITION, parse_freebsd},
 	{NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
  
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int sector_size = bdev_logical_block_size(bdev) / 512;
+	sector_t sector_size = bdev_logical_block_size(bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
@@ -483,8 +487,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 	state->next = 5;
 	for (slot = 1 ; slot <= 4 ; slot++, p++) {
-		u32 start = START_SECT(p)*sector_size;
-		u32 size = NR_SECTS(p)*sector_size;
+		sector_t start = start_sect(p)*sector_size;
+		sector_t size = nr_sects(p)*sector_size;
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
@@ -513,7 +517,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		unsigned char id = SYS_IND(p);
 		int n;
 
-		if (!NR_SECTS(p))
+		if (!nr_sects(p))
 			continue;
 
 		for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +525,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 
 		if (!subtypes[n].parse)
 			continue;
-		subtypes[n].parse(state, bdev, START_SECT(p)*sector_size,
-						NR_SECTS(p)*sector_size, slot);
+		subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
+						nr_sects(p)*sector_size, slot);
 	}
 	put_dev_sector(sect);
 	return 1;
-- 
cgit v1.2.2


From 8e0cc811e0f8029a7225372fb0951fab102c012f Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Tue, 23 Mar 2010 13:35:50 -0700
Subject: fs/partition/msdos: fix unusable extended partition for > 512B sector

Smaller size than a minimum blocksize can't be used, after all it's
handled like 0 size.

For extended partition itself, this makes sure to use bigger size than one
logical sector size at least.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Daniel Taylor <Daniel.Taylor@wdc.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/msdos.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index b78385840a55..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -492,9 +492,16 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		if (!size)
 			continue;
 		if (is_extended_partition(p)) {
-			/* prevent someone doing mkfs or mkswap on an
-			   extended partition, but leave room for LILO */
-			put_partition(state, slot, start, size == 1 ? 1 : 2);
+			/*
+			 * prevent someone doing mkfs or mkswap on an
+			 * extended partition, but leave room for LILO
+			 * FIXME: this uses one logical sector for > 512b
+			 * sector, although it may not be enough/proper.
+			 */
+			sector_t n = 2;
+			n = min(size, max(sector_size, n));
+			put_partition(state, slot, start, n);
+
 			printk(" <");
 			parse_extended(state, bdev, start, size);
 			printk(" >");
-- 
cgit v1.2.2


From 47568d4c56677ede6e7ff7c8e4e0fd860a9c4372 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 24 Mar 2010 17:02:28 +0000
Subject: FDPIC: For-loop in elf_core_vma_data_size() is incorrect

Fix an incorrect for-loop in elf_core_vma_data_size().  The advance-pointer
statement lacks an assignment:

	  CC      fs/binfmt_elf_fdpic.o
	fs/binfmt_elf_fdpic.c: In function 'elf_core_vma_data_size':
	fs/binfmt_elf_fdpic.c:1593: warning: statement with no effect

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf_fdpic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c32d00a6690..7ab23e006e4c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1590,7 +1590,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
 	struct vm_area_struct *vma;
 	size_t size = 0;
 
-	for (vma = current->mm->mmap; vma; vma->vm_next)
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next)
 		if (maydump(vma, mm_flags))
 			size += vma->vm_end - vma->vm_start;
 	return size;
-- 
cgit v1.2.2


From 61964eba5c419ff710ac996c5ed3a84d5af7490f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 24 Mar 2010 17:09:19 +0000
Subject: do_sync_read/write() should set kiocb.ki_nbytes to be consistent

do_sync_read/write() should set kiocb.ki_nbytes to be consistent with
do_sync_readv_writev().

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/read_write.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
 	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
 
 	for (;;) {
 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
-- 
cgit v1.2.2


From 1147d0f915e3b4c5c4fa279dae2c40016b8f441d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 23 Mar 2010 14:48:37 +0000
Subject: fscache: add missing unlock

Sparse complained about this missing spin_unlock()

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fscache/page.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..69809024d71d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -881,6 +881,7 @@ submit_failed:
 	goto nobufs;
 
 nobufs_unlock_obj:
+	spin_unlock(&cookie->stores_lock);
 	spin_unlock(&object->lock);
 nobufs:
 	spin_unlock(&cookie->lock);
-- 
cgit v1.2.2


From 3e297b613491f0d4928aa652a2cd266aa06dc409 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 26 Mar 2010 12:40:13 -0400
Subject: Restore LOOKUP_DIRECTORY hint handling in final lookup on open()

	Lose want_dir argument, while we are at it - since now
nd->flags & LOOKUP_DIRECTORY is equivalent to it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 1c0fca6e899e..a7dce91a7e42 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1610,8 +1610,7 @@ exit:
 
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    int open_flag, int acc_mode,
-			    int mode, const char *pathname,
-			    int *want_dir)
+			    int mode, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
@@ -1642,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (nd->last.name[nd->last.len]) {
 		if (open_flag & O_CREAT)
 			goto exit;
-		*want_dir = 1;
+		nd->flags |= LOOKUP_DIRECTORY;
 	}
 
 	/* just plain open? */
@@ -1656,8 +1655,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (path->dentry->d_inode->i_op->follow_link)
 			return NULL;
 		error = -ENOTDIR;
-		if (*want_dir && !path->dentry->d_inode->i_op->lookup)
-			goto exit_dput;
+		if (nd->flags & LOOKUP_DIRECTORY) {
+			if (!path->dentry->d_inode->i_op->lookup)
+				goto exit_dput;
+		}
 		path_to_nameidata(path, nd);
 		audit_inode(pathname, nd->path.dentry);
 		goto ok;
@@ -1766,7 +1767,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
 	int force_reval = 0;
-	int want_dir = open_flag & O_DIRECTORY;
 
 	if (!(open_flag & O_CREAT))
 		mode = 0;
@@ -1828,7 +1828,9 @@ reval:
 		if (open_flag & O_EXCL)
 			nd.flags |= LOOKUP_EXCL;
 	}
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+	if (open_flag & O_DIRECTORY)
+		nd.flags |= LOOKUP_DIRECTORY;
+	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path holder;
 		struct inode *inode = path.dentry->d_inode;
@@ -1866,7 +1868,7 @@ reval:
 		}
 		holder = path;
 		nd.flags &= ~LOOKUP_PARENT;
-		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 		if (inode->i_op->put_link)
 			inode->i_op->put_link(holder.dentry, &nd, cookie);
 		path_put(&holder);
-- 
cgit v1.2.2


From 810627a013163cd294762d57c0ea2ec055ffe4f6 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Sat, 27 Mar 2010 02:00:49 +0000
Subject: [CIFS] Add mmap for direct, nobrl cifs mount types

without mmap functions in file_ops OpenOffice can't save changes in
existing document. The same situation you can see with gedit. Also, a.out
format of files can't be executed without mmap.

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5183bc2a1916..ded66be6597c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -808,6 +808,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.release = cifs_close,
 	.fsync = cifs_fsync,
 	.flush = cifs_flush,
+	.mmap = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
 #ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl  = cifs_ioctl,
-- 
cgit v1.2.2


From 49137f2efb5cf68724bccaba531ab3d59acd71f9 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Tue, 16 Mar 2010 21:46:15 +0100
Subject: Open segment file before using it

logfs_recover_sb() needs it open.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/super.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78dee..018728120bb3 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -289,6 +289,10 @@ static int logfs_make_writeable(struct super_block *sb)
 {
 	int err;
 
+	err = logfs_open_segfile(sb);
+	if (err)
+		return err;
+
 	/* Repair any broken superblock copies */
 	err = logfs_recover_sb(sb);
 	if (err)
@@ -299,10 +303,6 @@ static int logfs_make_writeable(struct super_block *sb)
 	if (err)
 		return err;
 
-	err = logfs_open_segfile(sb);
-	if (err)
-		return err;
-
 	/* Do one GC pass before any data gets dirtied */
 	logfs_gc_pass(sb);
 
-- 
cgit v1.2.2


From 59fe27c0a8173a74b105debc803b97582028c90b Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Wed, 17 Mar 2010 13:47:45 +0100
Subject: Limit max_pages for insane devices

Intel SSDs have a limit of 0xffff as queue_max_hw_sectors(q).  Such a
limit may make sense from a hardware pov, but it causes bio_alloc() to
return NULL.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/dev_bdev.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186d..f99f5dcce546 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -97,8 +97,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 	unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
 	int i;
 
+	if (max_pages > BIO_MAX_PAGES)
+		max_pages = BIO_MAX_PAGES;
 	bio = bio_alloc(GFP_NOFS, max_pages);
-	BUG_ON(!bio); /* FIXME: handle this */
+	BUG_ON(!bio);
 
 	for (i = 0; i < nr_pages; i++) {
 		if (i >= max_pages) {
@@ -191,8 +193,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 	unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
 	int i;
 
+	if (max_pages > BIO_MAX_PAGES)
+		max_pages = BIO_MAX_PAGES;
 	bio = bio_alloc(GFP_NOFS, max_pages);
-	BUG_ON(!bio); /* FIXME: handle this */
+	BUG_ON(!bio);
 
 	for (i = 0; i < nr_pages; i++) {
 		if (i >= max_pages) {
-- 
cgit v1.2.2


From e07bf553f37cd4fa470b499ff34d800956df2d48 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Wed, 17 Mar 2010 15:29:38 +0100
Subject: Plug memory leak in writeseg_end_io

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/dev_bdev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index f99f5dcce546..a5d0c56d3ebc 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -80,6 +80,7 @@ static void writeseg_end_io(struct bio *bio, int err)
 			prefetchw(&bvec->bv_page->flags);
 
 		end_page_writeback(page);
+		page_cache_release(page);
 	} while (bvec >= bio->bi_io_vec);
 	bio_put(bio);
 	if (atomic_dec_and_test(&super->s_pending_writes))
-- 
cgit v1.2.2


From e326068806ee044cc617b1dc24be1293fca3fbf6 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Wed, 17 Mar 2010 16:00:07 +0100
Subject: Prevent schedule while atomic in __logfs_readdir

Apparently filldir can sleep, which forbids kmap_atomic.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb0120..c76b4b5c7ff6 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
 				(filler_t *)logfs_readpage, NULL);
 		if (IS_ERR(page))
 			return PTR_ERR(page);
-		dd = kmap_atomic(page, KM_USER0);
+		dd = kmap(page);
 		BUG_ON(dd->namelen == 0);
 
 		full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
 				pos, be64_to_cpu(dd->ino), dd->type);
-		kunmap_atomic(dd, KM_USER0);
+		kunmap(page);
 		page_cache_release(page);
 		if (full)
 			break;
-- 
cgit v1.2.2


From faaa27ab919799929732c356a92a160f8657ecc6 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Fri, 26 Mar 2010 10:18:36 +0100
Subject: Write out both superblocks on mismatch

If the first superblock is wrong and the second gets written, there
will still be a mismatch on next mount.  Write both to make sure they
match.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 018728120bb3..006670fe9e8b 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -277,7 +277,7 @@ static int logfs_recover_sb(struct super_block *sb)
 	}
 	if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
 		printk(KERN_INFO"Superblocks don't match - fixing.\n");
-		return write_one_sb(sb, super->s_devops->find_last_sb);
+		return logfs_write_sb(sb);
 	}
 	/* If neither is valid now, something's wrong.  Didn't we properly
 	 * check them before?!? */
-- 
cgit v1.2.2


From 7db8064c17b92e95aec2e333096c035db9ddd4fe Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Fri, 26 Mar 2010 14:45:55 +0100
Subject: Fix logfs_get_sb_final error path

rootdir was already allocated, so we must iput it again.
Found by Al Viro.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 006670fe9e8b..2845c41d70d4 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -328,7 +328,7 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 
 	sb->s_root = d_alloc_root(rootdir);
 	if (!sb->s_root)
-		goto fail;
+		goto fail2;
 
 	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
 	if (!super->s_erase_page)
-- 
cgit v1.2.2


From 6f2e9e6a950a165a7d2c399ab7557e6745ef2bfd Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Fri, 26 Mar 2010 14:50:08 +0100
Subject: Use deactivate_locked_super

Found by Al Viro.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 2845c41d70d4..9d856c49afc5 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -572,8 +572,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
 	return 0;
 
 err1:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
+	deactivate_locked_super(sb);
 	return err;
 err0:
 	kfree(super);
-- 
cgit v1.2.2


From 193219172691e29813821dc8433317768c6ed1a3 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Sat, 27 Mar 2010 09:56:58 +0100
Subject: Prevent data corruption in logfs_rewrite_block()

The comment was correct, so make the code match the comment.  As the
new comment indicates, we might be able to do a little less work.  But
for the current -rc series let's keep it simple and just fix the bug.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/readwrite.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a7..c3a3a6814b84 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1594,7 +1594,6 @@ int logfs_delete(struct inode *inode, pgoff_t index,
 	return ret;
 }
 
-/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
 int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
 		gc_level_t gc_level, long flags)
 {
@@ -1611,6 +1610,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
 		if (level != 0)
 			alloc_indirect_block(inode, page, 0);
 		err = logfs_write_buf(inode, page, flags);
+		if (!err && shrink_level(gc_level) == 0) {
+			/* Rewrite cannot mark the inode dirty but has to
+			 * write it immediatly.
+			 * Q: Can't we just create an alias for the inode
+			 * instead?  And if not, why not?
+			 */
+			if (inode->i_ino == LOGFS_INO_MASTER)
+				logfs_write_anchor(inode->i_sb);
+			else {
+				err = __logfs_write_inode(inode, flags);
+			}
+		}
 	}
 	logfs_put_write_page(page);
 	return err;
-- 
cgit v1.2.2


From 81def6b9862764924a99ac1b680e73ac8c80ac64 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Sun, 28 Mar 2010 12:47:09 +0200
Subject: Simplify and fix pad_wbuf

A comment in the old code read:
        /* The math in this function can surely use some love */

And indeed it did.  In the case that area->a_used_bytes is exactly
4096 bytes below segment size it fell apart.  pad_wbuf is now split
into two helpers that are significantly less complicated.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/segment.c | 52 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d55..ff9d7f3b4d36 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -93,49 +93,57 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 	} while (len);
 }
 
-/*
- * bdev_writeseg will write full pages.  Memset the tail to prevent data leaks.
- */
-static void pad_wbuf(struct logfs_area *area, int final)
+static void pad_partial_page(struct logfs_area *area)
 {
 	struct super_block *sb = area->a_sb;
-	struct logfs_super *super = logfs_super(sb);
 	struct page *page;
 	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
 	pgoff_t index = ofs >> PAGE_SHIFT;
 	long offset = ofs & (PAGE_SIZE-1);
 	u32 len = PAGE_SIZE - offset;
 
-	if (len == PAGE_SIZE) {
-		/* The math in this function can surely use some love */
-		len = 0;
-	}
-	if (len) {
-		BUG_ON(area->a_used_bytes >= super->s_segsize);
-
-		page = get_mapping_page(area->a_sb, index, 0);
+	if (len % PAGE_SIZE) {
+		page = get_mapping_page(sb, index, 0);
 		BUG_ON(!page); /* FIXME: reserve a pool */
 		memset(page_address(page) + offset, 0xff, len);
 		SetPagePrivate(page);
 		page_cache_release(page);
 	}
+}
 
-	if (!final)
-		return;
+static void pad_full_pages(struct logfs_area *area)
+{
+	struct super_block *sb = area->a_sb;
+	struct logfs_super *super = logfs_super(sb);
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+	u32 len = super->s_segsize - area->a_used_bytes;
+	pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
+	pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+	struct page *page;
 
-	area->a_used_bytes += len;
-	for ( ; area->a_used_bytes < super->s_segsize;
-			area->a_used_bytes += PAGE_SIZE) {
-		/* Memset another page */
-		index++;
-		page = get_mapping_page(area->a_sb, index, 0);
+	while (no_indizes) {
+		page = get_mapping_page(sb, index, 0);
 		BUG_ON(!page); /* FIXME: reserve a pool */
-		memset(page_address(page), 0xff, PAGE_SIZE);
+		SetPageUptodate(page);
+		memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
 		SetPagePrivate(page);
 		page_cache_release(page);
+		index++;
+		no_indizes--;
 	}
 }
 
+/*
+ * bdev_writeseg will write full pages.  Memset the tail to prevent data leaks.
+ * Also make sure we allocate (and memset) all pages for final writeout.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+	pad_partial_page(area);
+	if (final)
+		pad_full_pages(area);
+}
+
 /*
  * We have to be careful with the alias tree.  Since lookup is done by bix,
  * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
-- 
cgit v1.2.2


From 723b2ff40876678b49e61df34fb1d8001e34639d Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Sun, 28 Mar 2010 18:10:07 +0200
Subject: [LogFS] Clear PagePrivate when moving journal

do_logfs_journal_wl_pass() must call freeseg(), thereby clear
PagePrivate on all pages of the current journal segment.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/journal.c | 1 +
 fs/logfs/logfs.h   | 1 +
 fs/logfs/segment.c | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c9052..15454ac7bd93 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -821,6 +821,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
 		logfs_set_segment_reserved(sb, segno);
 	}
 	/* Manually move journal_area */
+	freeseg(sb, area->a_segno);
 	area->a_segno = super->s_journal_seg[0];
 	area->a_is_open = 0;
 	area->a_used_bytes = 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 129779431373..b84b0eec6024 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -587,6 +587,7 @@ void move_page_to_btree(struct page *page);
 int logfs_init_mapping(struct super_block *sb);
 void logfs_sync_area(struct logfs_area *area);
 void logfs_sync_segments(struct super_block *sb);
+void freeseg(struct super_block *sb, u32 segno);
 
 /* area handling */
 int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index ff9d7f3b4d36..0ecd8f07c11e 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -691,7 +691,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
 	return 0;
 }
 
-static void freeseg(struct super_block *sb, u32 segno)
+void freeseg(struct super_block *sb, u32 segno)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct address_space *mapping = super->s_mapping_inode->i_mapping;
-- 
cgit v1.2.2


From 94aa8ae13db2ecf2ec1b4e65a65d3fe92b468e0e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 28 Mar 2010 21:22:50 -0700
Subject: ceph: fix use after free on mds __unregister_request

There was a use after free in __unregister_request that would trigger
whenever the request map held the last reference.  This appears to have
triggered an oops during 'umount -f' when requests are being torn down.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5268d404963c..5c7920be6420 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -532,7 +532,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 	rb_erase(&req->r_node, &mdsc->request_tree);
 	RB_CLEAR_NODE(&req->r_node);
-	ceph_mdsc_put_request(req);
 
 	if (req->r_unsafe_dir) {
 		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -541,6 +540,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		list_del_init(&req->r_unsafe_dir_item);
 		spin_unlock(&ci->i_unsafe_lock);
 	}
+
+	ceph_mdsc_put_request(req);
 }
 
 /*
-- 
cgit v1.2.2


From a53f4f9efaeb1d87cfae066346979d4d70e1abe9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 29 Mar 2010 13:08:52 +0100
Subject: SLOW_WORK: CONFIG_SLOW_WORK_PROC should be CONFIG_SLOW_WORK_DEBUG

CONFIG_SLOW_WORK_PROC was changed to CONFIG_SLOW_WORK_DEBUG, but not in all
instances.  Change the remaining instances.  This makes the debugfs file
display the time mark and the owner's description again.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fscache/object.c    | 6 +++---
 fs/fscache/operation.c | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8e..0b589a9b4ffc 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
 static void fscache_object_slow_work_put_ref(struct slow_work *);
 static int  fscache_object_slow_work_get_ref(struct slow_work *);
 static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
 #endif
 static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
 	.get_ref	= fscache_object_slow_work_get_ref,
 	.put_ref	= fscache_object_slow_work_put_ref,
 	.execute	= fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= fscache_object_slow_work_desc,
 #endif
 };
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
 /*
  * describe an object for slow-work debugging
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_object_slow_work_desc(struct slow_work *work,
 					  struct seq_file *m)
 {
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a14266..9f6c928d4586 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -500,7 +500,7 @@ static void fscache_op_execute(struct slow_work *work)
 /*
  * describe an operation for slow-work debugging
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
 {
 	struct fscache_operation *op =
@@ -517,7 +517,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
 	.get_ref	= fscache_op_get_ref,
 	.put_ref	= fscache_op_put_ref,
 	.execute	= fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= fscache_op_desc,
 #endif
 };
-- 
cgit v1.2.2


From 0943846ae05603efd98550f2d475e9c98191bde8 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Mon, 29 Mar 2010 21:13:28 +0200
Subject: [LogFS] Move reserved segments with journal

Fixes a GC livelock.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/journal.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 15454ac7bd93..25b1345c4652 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -800,6 +800,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct logfs_area *area = super->s_journal_area;
+	struct btree_head32 *head = &super->s_reserved_segments;
 	u32 segno, ec;
 	int i, err;
 
@@ -807,6 +808,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
 	/* Drop old segments */
 	journal_for_each(i)
 		if (super->s_journal_seg[i]) {
+			btree_remove32(head, super->s_journal_seg[i]);
 			logfs_set_segment_unreserved(sb,
 					super->s_journal_seg[i],
 					super->s_journal_ec[i]);
@@ -819,6 +821,8 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
 		super->s_journal_seg[i] = segno;
 		super->s_journal_ec[i] = ec;
 		logfs_set_segment_reserved(sb, segno);
+		err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+		BUG_ON(err); /* mempool should prevent this */
 	}
 	/* Manually move journal_area */
 	freeseg(sb, area->a_segno);
-- 
cgit v1.2.2


From 6be7fa06eb4d721df734bd0946b5e63b27c0589b Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Mon, 29 Mar 2010 21:14:52 +0200
Subject: [LogFS] Erase new journal segments

If the device contains on old logfs image and the journal is moved to
segment that have never been used by the current logfs and not all
journal segments are erased before the next mount, the old content can
confuse mount code.  To prevent this, always erase the new journal
segments.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/journal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 25b1345c4652..d57c7b07b60b 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -823,6 +823,8 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
 		logfs_set_segment_reserved(sb, segno);
 		err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
 		BUG_ON(err); /* mempool should prevent this */
+		err = logfs_erase_segment(sb, segno, 1);
+		BUG_ON(err); /* FIXME: remount-ro would be nicer */
 	}
 	/* Manually move journal_area */
 	freeseg(sb, area->a_segno);
-- 
cgit v1.2.2


From de329820e920cd9cfbc2127cad26a37026260cce Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 29 Mar 2010 14:30:19 -0700
Subject: ext3: fix broken handling of EXT3_STATE_NEW

In commit 9df93939b735 ("ext3: Use bitops to read/modify
EXT3_I(inode)->i_state") ext3 changed its internal 'i_state' variable to
use bitops for its state handling.  However, unline the same ext4
change, it didn't actually change the name of the field when it changed
the semantics of it.

As a result, an old use of 'i_state' remained in fs/ext3/ialloc.c that
initialized the field to EXT3_STATE_NEW.  And that does not work
_at_all_ when we're now working with individually named bits rather than
values that get masked.  So the code tried to mark the state to be new,
but in actual fact set the field to EXT3_STATE_JDATA.  Which makes no
sense at all, and screws up all the code that checks whether the inode
was newly allocated.

In particular, it made the xattr code unhappy, and caused various random
behavior, like apparently

	https://bugzilla.redhat.com/show_bug.cgi?id=577911

So fix the initialization, and rename the field to match ext4 so that we
don't have this happen again.

Cc: James Morris <jmorris@namei.org>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Daniel J Walsh <dwalsh@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/ialloc.c | 4 +++-
 fs/ext3/inode.c  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ef9008b885b5..0d0e97ed3ff6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -582,7 +582,9 @@ got:
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
 
-	ei->i_state = EXT3_STATE_NEW;
+	ei->i_state_flags = 0;
+	ext3_set_inode_state(inode, EXT3_STATE_NEW);
+
 	ei->i_extra_isize =
 		(EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
 		sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7f920b7263a4..ea33bdf0a300 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2811,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
 	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
 
-	ei->i_state = 0;
+	ei->i_state_flags = 0;
 	ei->i_dir_start_lookup = 0;
 	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
 	/* We now have enough fields to check if the inode was active or not.
-- 
cgit v1.2.2


From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Mar 2010 17:04:11 +0900
Subject: include cleanup: Update gfp.h and slab.h includes to prepare for
 breaking implicit slab.h inclusion from percpu.h

percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files.  percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.

percpu.h -> slab.h dependency is about to be removed.  Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability.  As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.

  http://userweb.kernel.org/~tj/misc/slabh-sweep.py

The script does the followings.

* Scan files for gfp and slab usages and update includes such that
  only the necessary includes are there.  ie. if only gfp is used,
  gfp.h, if slab is used, slab.h.

* When the script inserts a new include, it looks at the include
  blocks and try to put the new include such that its order conforms
  to its surrounding.  It's put in the include block which contains
  core kernel includes, in the same order that the rest are ordered -
  alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
  doesn't seem to be any matching order.

* If the script can't find a place to put a new include (mostly
  because the file doesn't have fitting include block), it prints out
  an error message indicating which .h file needs to be added to the
  file.

The conversion was done in the following steps.

1. The initial automatic conversion of all .c files updated slightly
   over 4000 files, deleting around 700 includes and adding ~480 gfp.h
   and ~3000 slab.h inclusions.  The script emitted errors for ~400
   files.

2. Each error was manually checked.  Some didn't need the inclusion,
   some needed manual addition while adding it to implementation .h or
   embedding .c file was more appropriate for others.  This step added
   inclusions to around 150 files.

3. The script was run again and the output was compared to the edits
   from #2 to make sure no file was left behind.

4. Several build tests were done and a couple of problems were fixed.
   e.g. lib/decompress_*.c used malloc/free() wrappers around slab
   APIs requiring slab.h to be added manually.

5. The script was run on all .h files but without automatically
   editing them as sprinkling gfp.h and slab.h inclusions around .h
   files could easily lead to inclusion dependency hell.  Most gfp.h
   inclusion directives were ignored as stuff from gfp.h was usually
   wildly available and often used in preprocessor macros.  Each
   slab.h inclusion directive was examined and added manually as
   necessary.

6. percpu.h was updated not to include slab.h.

7. Build test were done on the following configurations and failures
   were fixed.  CONFIG_GCOV_KERNEL was turned off for all tests (as my
   distributed build env didn't work with gcov compiles) and a few
   more options had to be turned off depending on archs to make things
   build (like ipr on powerpc/64 which failed due to missing writeq).

   * x86 and x86_64 UP and SMP allmodconfig and a custom test config.
   * powerpc and powerpc64 SMP allmodconfig
   * sparc and sparc64 SMP allmodconfig
   * ia64 SMP allmodconfig
   * s390 SMP allmodconfig
   * alpha SMP allmodconfig
   * um on x86_64 SMP allmodconfig

8. percpu.h modifications were reverted so that it could be applied as
   a separate patch and serve as bisection point.

Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable easily on most builds of
the specific arch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
---
 fs/9p/cache.c                  | 1 +
 fs/9p/fid.c                    | 1 +
 fs/9p/v9fs.c                   | 1 +
 fs/9p/vfs_dentry.c             | 1 +
 fs/9p/vfs_dir.c                | 1 +
 fs/9p/vfs_inode.c              | 1 +
 fs/9p/vfs_super.c              | 1 +
 fs/adfs/super.c                | 1 +
 fs/affs/bitmap.c               | 1 +
 fs/affs/inode.c                | 1 +
 fs/affs/super.c                | 1 +
 fs/afs/cache.c                 | 1 -
 fs/afs/cmservice.c             | 1 +
 fs/afs/dir.c                   | 1 -
 fs/afs/file.c                  | 2 +-
 fs/afs/fsclient.c              | 1 +
 fs/afs/inode.c                 | 1 -
 fs/afs/mntpt.c                 | 2 +-
 fs/afs/rxrpc.c                 | 1 +
 fs/afs/vlclient.c              | 1 +
 fs/afs/vlocation.c             | 1 +
 fs/afs/vnode.c                 | 1 -
 fs/anon_inodes.c               | 1 -
 fs/autofs/root.c               | 1 +
 fs/autofs4/dev-ioctl.c         | 1 +
 fs/autofs4/root.c              | 1 +
 fs/befs/datastream.c           | 1 -
 fs/binfmt_aout.c               | 2 +-
 fs/binfmt_em86.c               | 1 -
 fs/binfmt_script.c             | 1 -
 fs/bio-integrity.c             | 1 +
 fs/btrfs/acl.c                 | 1 +
 fs/btrfs/async-thread.c        | 1 +
 fs/btrfs/compression.c         | 1 +
 fs/btrfs/ctree.c               | 1 +
 fs/btrfs/ctree.h               | 1 +
 fs/btrfs/delayed-ref.c         | 1 +
 fs/btrfs/disk-io.c             | 1 +
 fs/btrfs/extent-tree.c         | 1 +
 fs/btrfs/extent_io.c           | 1 -
 fs/btrfs/extent_map.c          | 1 -
 fs/btrfs/file-item.c           | 1 +
 fs/btrfs/file.c                | 1 +
 fs/btrfs/free-space-cache.c    | 1 +
 fs/btrfs/inode.c               | 1 +
 fs/btrfs/ioctl.c               | 1 +
 fs/btrfs/locking.c             | 1 -
 fs/btrfs/ordered-data.c        | 1 -
 fs/btrfs/ref-cache.c           | 1 +
 fs/btrfs/relocation.c          | 1 +
 fs/btrfs/super.c               | 1 +
 fs/btrfs/transaction.c         | 1 +
 fs/btrfs/tree-log.c            | 1 +
 fs/btrfs/volumes.c             | 1 +
 fs/cachefiles/interface.c      | 1 +
 fs/cachefiles/namei.c          | 1 +
 fs/cachefiles/rdwr.c           | 1 +
 fs/cachefiles/xattr.c          | 1 +
 fs/ceph/addr.c                 | 1 +
 fs/ceph/auth.c                 | 1 +
 fs/ceph/auth_none.c            | 1 +
 fs/ceph/auth_x.c               | 1 +
 fs/ceph/buffer.c               | 3 +++
 fs/ceph/caps.c                 | 1 +
 fs/ceph/crypto.c               | 1 +
 fs/ceph/debugfs.c              | 1 +
 fs/ceph/dir.c                  | 1 +
 fs/ceph/export.c               | 1 +
 fs/ceph/file.c                 | 1 +
 fs/ceph/mds_client.c           | 1 +
 fs/ceph/messenger.c            | 1 +
 fs/ceph/mon_client.c           | 1 +
 fs/ceph/osdmap.c               | 4 +++-
 fs/ceph/pagelist.c             | 1 +
 fs/ceph/snap.c                 | 1 +
 fs/ceph/super.c                | 1 +
 fs/ceph/super.h                | 1 +
 fs/ceph/xattr.c                | 1 +
 fs/cifs/cifs_dfs_ref.c         | 1 +
 fs/cifs/cifs_spnego.c          | 1 +
 fs/cifs/cifs_unicode.c         | 1 +
 fs/cifs/cifsacl.c              | 1 +
 fs/cifs/cifsencrypt.c          | 1 +
 fs/cifs/cifsglob.h             | 1 +
 fs/cifs/cifssmb.c              | 1 +
 fs/cifs/connect.c              | 1 +
 fs/cifs/dns_resolve.c          | 1 +
 fs/cifs/file.c                 | 1 +
 fs/cifs/inode.c                | 1 +
 fs/cifs/link.c                 | 1 +
 fs/cifs/readdir.c              | 1 +
 fs/cifs/sess.c                 | 1 +
 fs/cifs/smbencrypt.c           | 1 +
 fs/cifs/transport.c            | 1 +
 fs/cifs/xattr.c                | 1 +
 fs/coda/dir.c                  | 1 +
 fs/coda/file.c                 | 1 +
 fs/coda/inode.c                | 1 +
 fs/coda/upcall.c               | 1 +
 fs/compat.c                    | 1 +
 fs/compat_ioctl.c              | 2 +-
 fs/configfs/inode.c            | 1 +
 fs/configfs/mount.c            | 1 +
 fs/configfs/symlink.c          | 1 +
 fs/debugfs/inode.c             | 1 +
 fs/devpts/inode.c              | 1 +
 fs/dlm/config.c                | 1 +
 fs/dlm/debug_fs.c              | 1 +
 fs/dlm/lock.c                  | 1 +
 fs/dlm/lowcomms.c              | 1 +
 fs/dlm/netlink.c               | 1 +
 fs/dlm/plock.c                 | 1 +
 fs/dlm/user.c                  | 1 +
 fs/ecryptfs/crypto.c           | 1 +
 fs/ecryptfs/dentry.c           | 1 +
 fs/ecryptfs/file.c             | 1 +
 fs/ecryptfs/inode.c            | 1 +
 fs/ecryptfs/keystore.c         | 1 +
 fs/ecryptfs/kthread.c          | 1 +
 fs/ecryptfs/main.c             | 1 +
 fs/ecryptfs/messaging.c        | 1 +
 fs/ecryptfs/miscdev.c          | 1 +
 fs/ecryptfs/mmap.c             | 1 +
 fs/ecryptfs/super.c            | 1 +
 fs/eventfd.c                   | 1 +
 fs/exofs/inode.c               | 1 +
 fs/exofs/ios.c                 | 1 +
 fs/exofs/super.c               | 1 +
 fs/ext2/balloc.c               | 1 +
 fs/ext2/xattr_security.c       | 1 +
 fs/ext3/balloc.c               | 1 +
 fs/ext3/xattr_security.c       | 1 +
 fs/ext4/block_validity.c       | 1 +
 fs/ext4/inode.c                | 1 +
 fs/ext4/mballoc.c              | 1 +
 fs/ext4/migrate.c              | 1 +
 fs/ext4/move_extent.c          | 1 +
 fs/ext4/xattr_security.c       | 1 +
 fs/fat/cache.c                 | 1 +
 fs/fifo.c                      | 1 -
 fs/filesystems.c               | 2 +-
 fs/freevxfs/vxfs_subr.c        | 1 -
 fs/fs-writeback.c              | 1 +
 fs/fscache/object-list.c       | 1 +
 fs/fscache/operation.c         | 1 +
 fs/fscache/page.c              | 1 +
 fs/fuse/cuse.c                 | 1 +
 fs/generic_acl.c               | 1 +
 fs/gfs2/bmap.c                 | 1 -
 fs/gfs2/dentry.c               | 1 -
 fs/gfs2/export.c               | 1 -
 fs/gfs2/glops.c                | 1 -
 fs/gfs2/lock_dlm.c             | 1 +
 fs/gfs2/rgrp.h                 | 2 ++
 fs/gfs2/sys.c                  | 1 -
 fs/gfs2/util.c                 | 1 -
 fs/hfs/bnode.c                 | 1 +
 fs/hfs/btree.c                 | 1 +
 fs/hfs/mdb.c                   | 1 +
 fs/hfs/super.c                 | 1 +
 fs/hfsplus/options.c           | 1 +
 fs/hostfs/hostfs_kern.c        | 1 +
 fs/hpfs/buffer.c               | 1 +
 fs/hpfs/dir.c                  | 1 +
 fs/hpfs/inode.c                | 1 +
 fs/hpfs/super.c                | 1 +
 fs/ioprio.c                    | 1 +
 fs/isofs/dir.c                 | 1 +
 fs/isofs/namei.c               | 1 +
 fs/jbd/commit.c                | 1 -
 fs/jbd/recovery.c              | 1 -
 fs/jbd2/recovery.c             | 1 -
 fs/jffs2/compr_lzo.c           | 1 -
 fs/jffs2/compr_zlib.c          | 1 -
 fs/jffs2/debug.c               | 1 +
 fs/jffs2/file.c                | 1 -
 fs/jffs2/nodelist.c            | 1 -
 fs/jffs2/nodemgmt.c            | 1 -
 fs/jffs2/symlink.c             | 1 -
 fs/jffs2/write.c               | 1 -
 fs/jfs/acl.c                   | 1 +
 fs/jfs/jfs_dmap.c              | 1 +
 fs/jfs/jfs_dtree.c             | 1 +
 fs/jfs/jfs_imap.c              | 1 +
 fs/jfs/jfs_logmgr.c            | 1 +
 fs/jfs/jfs_metapage.c          | 1 +
 fs/jfs/jfs_unicode.h           | 1 +
 fs/jfs/super.c                 | 1 +
 fs/jfs/xattr.c                 | 1 +
 fs/libfs.c                     | 1 +
 fs/lockd/clntlock.c            | 1 +
 fs/lockd/clntproc.c            | 1 +
 fs/lockd/mon.c                 | 1 +
 fs/lockd/svc.c                 | 1 -
 fs/lockd/svc4proc.c            | 1 -
 fs/lockd/svclock.c             | 1 +
 fs/lockd/svcproc.c             | 1 -
 fs/lockd/svcsubs.c             | 1 +
 fs/logfs/dev_bdev.c            | 1 +
 fs/logfs/dir.c                 | 2 +-
 fs/logfs/gc.c                  | 1 +
 fs/logfs/inode.c               | 1 +
 fs/logfs/journal.c             | 1 +
 fs/logfs/readwrite.c           | 1 +
 fs/logfs/segment.c             | 1 +
 fs/logfs/super.c               | 1 +
 fs/minix/itree_v1.c            | 1 +
 fs/mpage.c                     | 1 +
 fs/ncpfs/dir.c                 | 1 -
 fs/ncpfs/file.c                | 1 -
 fs/ncpfs/ioctl.c               | 1 +
 fs/ncpfs/mmap.c                | 2 +-
 fs/ncpfs/sock.c                | 1 +
 fs/ncpfs/symlink.c             | 1 +
 fs/nfs/cache_lib.c             | 1 +
 fs/nfs/callback_proc.c         | 1 +
 fs/nfs/callback_xdr.c          | 1 +
 fs/nfs/client.c                | 1 +
 fs/nfs/delegation.c            | 1 +
 fs/nfs/direct.c                | 1 +
 fs/nfs/dns_resolve.c           | 1 +
 fs/nfs/file.c                  | 2 +-
 fs/nfs/fscache.c               | 1 +
 fs/nfs/inode.c                 | 1 +
 fs/nfs/namespace.c             | 1 +
 fs/nfs/nfs2xdr.c               | 1 -
 fs/nfs/nfs3acl.c               | 1 +
 fs/nfs/nfs3proc.c              | 1 +
 fs/nfs/nfs3xdr.c               | 1 -
 fs/nfs/nfs4namespace.c         | 1 +
 fs/nfs/nfs4proc.c              | 1 +
 fs/nfs/nfs4xdr.c               | 1 -
 fs/nfs/proc.c                  | 1 -
 fs/nfs/super.c                 | 1 +
 fs/nfs/symlink.c               | 1 -
 fs/nfs_common/nfsacl.c         | 1 +
 fs/nfsd/export.c               | 1 +
 fs/nfsd/nfs2acl.c              | 1 +
 fs/nfsd/nfs3acl.c              | 1 +
 fs/nfsd/nfs4acl.c              | 1 +
 fs/nfsd/nfs4callback.c         | 1 +
 fs/nfsd/nfs4idmap.c            | 1 +
 fs/nfsd/nfs4proc.c             | 1 +
 fs/nfsd/nfs4recover.c          | 1 +
 fs/nfsd/nfs4state.c            | 1 +
 fs/nfsd/nfs4xdr.c              | 1 +
 fs/nfsd/nfscache.c             | 2 ++
 fs/nfsd/nfsctl.c               | 1 +
 fs/nfsd/vfs.c                  | 1 +
 fs/nilfs2/alloc.c              | 1 +
 fs/nilfs2/btnode.c             | 1 +
 fs/nilfs2/gcinode.c            | 1 +
 fs/nilfs2/inode.c              | 1 +
 fs/nilfs2/ioctl.c              | 1 +
 fs/nilfs2/mdt.c                | 1 +
 fs/nilfs2/page.c               | 1 +
 fs/nilfs2/recovery.c           | 1 +
 fs/nilfs2/segbuf.c             | 1 +
 fs/nilfs2/segment.c            | 1 +
 fs/nilfs2/the_nilfs.h          | 1 +
 fs/notify/fsnotify.c           | 1 +
 fs/notify/inode_mark.c         | 1 -
 fs/ntfs/aops.c                 | 1 +
 fs/ntfs/attrib.c               | 1 +
 fs/ntfs/compress.c             | 1 +
 fs/ntfs/dir.c                  | 1 +
 fs/ntfs/file.c                 | 1 +
 fs/ntfs/index.c                | 2 ++
 fs/ntfs/mft.c                  | 1 +
 fs/ntfs/namei.c                | 1 +
 fs/ocfs2/acl.c                 | 1 +
 fs/ocfs2/buffer_head_io.c      | 1 -
 fs/ocfs2/cluster/heartbeat.c   | 1 +
 fs/ocfs2/cluster/nodemanager.c | 1 +
 fs/ocfs2/cluster/quorum.c      | 1 -
 fs/ocfs2/dlm/dlmast.c          | 1 -
 fs/ocfs2/dlm/dlmconvert.c      | 1 -
 fs/ocfs2/dlm/dlmthread.c       | 1 -
 fs/ocfs2/dlm/dlmunlock.c       | 1 -
 fs/ocfs2/extent_map.c          | 1 +
 fs/ocfs2/heartbeat.c           | 1 -
 fs/ocfs2/inode.c               | 1 -
 fs/ocfs2/mmap.c                | 1 -
 fs/ocfs2/quota_global.c        | 1 +
 fs/ocfs2/quota_local.c         | 1 +
 fs/ocfs2/refcounttree.c        | 1 -
 fs/ocfs2/stack_o2cb.c          | 1 +
 fs/ocfs2/stack_user.c          | 1 +
 fs/ocfs2/sysfile.c             | 1 -
 fs/omfs/inode.c                | 1 +
 fs/open.c                      | 2 +-
 fs/partitions/check.c          | 1 +
 fs/partitions/efi.c            | 1 +
 fs/proc/array.c                | 1 -
 fs/proc/base.c                 | 1 +
 fs/proc/generic.c              | 1 +
 fs/proc/inode.c                | 1 +
 fs/proc/kcore.c                | 1 +
 fs/proc/nommu.c                | 1 -
 fs/proc/proc_devtree.c         | 1 +
 fs/proc/proc_net.c             | 1 +
 fs/proc/stat.c                 | 1 -
 fs/proc/task_mmu.c             | 1 +
 fs/proc/task_nommu.c           | 1 +
 fs/proc/vmcore.c               | 1 +
 fs/quota/netlink.c             | 1 +
 fs/ramfs/file-nommu.c          | 1 +
 fs/ramfs/inode.c               | 1 +
 fs/reiserfs/dir.c              | 1 +
 fs/reiserfs/fix_node.c         | 1 +
 fs/reiserfs/inode.c            | 1 +
 fs/reiserfs/journal.c          | 1 +
 fs/reiserfs/namei.c            | 1 +
 fs/reiserfs/super.c            | 1 +
 fs/reiserfs/xattr.c            | 1 +
 fs/reiserfs/xattr_acl.c        | 1 +
 fs/reiserfs/xattr_security.c   | 1 +
 fs/signalfd.c                  | 1 +
 fs/smbfs/file.c                | 1 -
 fs/smbfs/smbiod.c              | 1 -
 fs/smbfs/symlink.c             | 1 +
 fs/splice.c                    | 1 +
 fs/squashfs/symlink.c          | 1 -
 fs/squashfs/zlib_wrapper.c     | 1 +
 fs/sync.c                      | 1 +
 fs/sysfs/inode.c               | 1 +
 fs/sysfs/mount.c               | 1 +
 fs/sysfs/symlink.c             | 1 +
 fs/timerfd.c                   | 1 +
 fs/ubifs/commit.c              | 1 +
 fs/ubifs/debug.c               | 1 +
 fs/ubifs/file.c                | 1 +
 fs/ubifs/gc.c                  | 1 +
 fs/ubifs/io.c                  | 1 +
 fs/ubifs/lpt.c                 | 1 +
 fs/ubifs/lpt_commit.c          | 1 +
 fs/ubifs/recovery.c            | 1 +
 fs/ubifs/sb.c                  | 1 +
 fs/ubifs/tnc.c                 | 1 +
 fs/ubifs/ubifs.h               | 1 +
 fs/ubifs/xattr.c               | 1 +
 fs/udf/partition.c             | 1 -
 fs/udf/symlink.c               | 1 -
 fs/udf/unicode.c               | 1 +
 fs/xattr_acl.c                 | 2 +-
 fs/xfs/linux-2.6/kmem.c        | 1 +
 fs/xfs/linux-2.6/xfs_acl.c     | 1 +
 fs/xfs/linux-2.6/xfs_aops.c    | 1 +
 fs/xfs/linux-2.6/xfs_buf.c     | 2 +-
 fs/xfs/linux-2.6/xfs_ioctl.c   | 1 +
 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 +
 fs/xfs/linux-2.6/xfs_iops.c    | 1 +
 fs/xfs/linux-2.6/xfs_super.c   | 1 +
 353 files changed, 300 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index e777961939f3..0dbe0d139ac2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -22,6 +22,7 @@
 
 #include <linux/jiffies.h>
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08b2eb157048..82ee460e534d 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -24,6 +24,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6c7f6a251115..cb57d3326182 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -29,6 +29,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index d74325295b1e..cbf4e50f3933 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d8a3afe4ff72..909711f57c0d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -32,6 +32,7 @@
 #include <linux/sched.h>
 #include <linux/inet.h>
 #include <linux/idr.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5fe45d692c9f..63c2b5af268a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -34,6 +34,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..a271549d9e21 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -37,6 +37,7 @@
 #include <linux/mount.h>
 #include <linux/idr.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 6910a98bd73c..4a3af7075c1d 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/parser.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index 8306d53307ed..3e262711ae06 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -7,6 +7,7 @@
  *  block allocation, deallocation, calculation of free space.
  */
 
+#include <linux/slab.h>
 #include "affs.h"
 
 /* This is, of course, shamelessly stolen from fs/minix */
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index c9744d771d98..f4b2a4ee4f91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -10,6 +10,7 @@
  *  (C) 1991  Linus Torvalds - minix filesystem
  */
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d41e9673cd97..16a3e4765f68 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -17,6 +17,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index e2b1d3f16519..0fb315dd4d2a 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/slab.h>
 #include <linux/sched.h>
 #include "internal.h"
 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index eb765489164f..a3bcec75c54a 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -11,6 +11,7 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/ip.h>
 #include "internal.h"
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 88067f36e5e7..adc1cb771b57 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 39b301662f22..0df9bc2b724d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -12,10 +12,10 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 023b95b0d9d7..4bd0218473a9 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/circ_buf.h>
 #include "internal.h"
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index c048f0658751..d00b312e3110 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -16,7 +16,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5ffb570cd3a8..5e813a816ce4 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -12,11 +12,11 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/gfp.h>
 #include "internal.h"
 
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bde3f19c0995..67cf810e0fd6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <rxrpc/packet.h>
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 36c1306e09e0..340afd0cd182 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 6e689208def2..9ac260d1361d 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 2f05c4fc2a70..25cf4c3f4ff7 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "internal.h"
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 2de009565d8e..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -12,7 +12,6 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 4a1401cea0a1..8713c7cfbc79 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -13,6 +13,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c8a80dffb455..d29b7f6df862 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -22,6 +22,7 @@
 #include <linux/magic.h>
 #include <linux/dcache.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include "autofs_i.h"
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a015b49891df..109a6c606d92 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -15,6 +15,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
 #include "autofs_i.h"
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index e3287d0d1a58..59096b5e0fc7 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -11,7 +11,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/string.h>
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 9b6aef0f75e5..f96eff04e11a 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -20,11 +20,11 @@
 #include <linux/fcntl.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 #include <linux/init.h>
 #include <linux/coredump.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 32fb00b52cd0..b8e8b0acf9bd 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -11,7 +11,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/elf.h>
 #include <linux/init.h>
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 08343505e184..aca9d55afb22 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -8,7 +8,6 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/binfmts.h>
 #include <linux/init.h>
 #include <linux/file.h>
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index a16f29e888cd..612a5c38d3c1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -24,6 +24,7 @@
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
+#include <linux/slab.h>
 
 struct integrity_slab {
 	struct kmem_cache *slab;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..6ef7b26724ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include "ctree.h"
 #include "btrfs_inode.h"
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..462859a30141 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 28b92a7218ab..629880ec8ac4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,6 +32,7 @@
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..8e29260cd0bb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0af2e3868573..8a6518a3f041 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,7 @@
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <asm/kmap_types.h>
 #include "extent_io.h"
 #include "extent_map.h"
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..902ce507c4e3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "delayed-ref.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 11d0ad30e203..e57ded75d910 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/crc32c.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1727b26fb194..fa2907d531a2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
 #include <linux/sort.h>
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c99121ac5d6b..0adceb525167 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
 #include <linux/slab.h>
 #include <linux/bio.h>
 #include <linux/mm.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
 #include <linux/module.h>
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 28d87ba60ce8..454ca52d6451 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,5 +1,4 @@
 #include <linux/err.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..54a255065aa3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include "ctree.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ee3323c7fc1c..29ff749ff4ca 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dd831ed31eea..f488fac04d99 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,6 +18,7 @@
 
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/math64.h>
 #include "ctree.h"
 #include "free-space-cache.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02bb099845fd..2fe494c84e60 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2845c6ceecd2..9b3d73a0fdc8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
 #include <linux/security.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/sched.h>
-#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/page-flags.h>
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a8ffecd0b491..ecb22ff7d1fd 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/sort.h>
 #include "ctree.h"
 #include "ref-cache.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0b23942cbc0d..e558dd941ded 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/rbtree.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9ac612e6ca60..693a664318fe 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2d654c1c794d..c0d0e3e7bc63 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1255fcc8ade5..af57dd2b43d4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9df8e3f1ccab..1692ec9cd7b0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 27089311fbea..37fe101a4e0d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -9,6 +9,7 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
 #include "internal.h"
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index eeb4986ea7db..d5db84a1ee0d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -19,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 #define CACHEFILES_KEYBUF_SIZE 512
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1d8332563863..0f0d41fbb03f 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -10,6 +10,7 @@
  */
 
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include "internal.h"
 
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index f3e7a0bf068b..e18b183b47e1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -16,6 +16,7 @@
 #include <linux/fsnotify.h>
 #include <linux/quotaops.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 static const char cachefiles_xattr_cache[] =
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index ce8ef6107727..aa3cd7cc3e40 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>	/* generic_writepages */
+#include <linux/slab.h>
 #include <linux/pagevec.h>
 #include <linux/task_io_accounting_ops.h>
 
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index abb204fea6c7..f6394b94b866 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/err.h>
 
 #include "types.h"
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index b4ef6f0a6c85..8cd9e3af07f7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -4,6 +4,7 @@
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 
 #include "auth_none.h"
 #include "auth.h"
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 8d8a84964763..d9001a4dc8cc 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -4,6 +4,7 @@
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 
 #include "auth_x.h"
 #include "auth_x_protocol.h"
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index b98086c7aeba..c67535d70aa6 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -1,5 +1,8 @@
 
 #include "ceph_debug.h"
+
+#include <linux/slab.h>
+
 #include "buffer.h"
 #include "decode.h"
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7d0a0d0adc18..3710e077a857 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 291ac288e791..f704b3b62424 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -3,6 +3,7 @@
 
 #include <linux/err.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <crypto/hash.h>
 
 #include "crypto.h"
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index e159f1415110..f7048da92acc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/device.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8a9116e15b70..7261dc6c2ead 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -3,6 +3,7 @@
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 
 #include "super.h"
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fc68e39cbad6..9d67572fb328 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/exportfs.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 
 #include "super.h"
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5d2af8464f6a..4add3d5da2c1 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/namei.h>
 #include <linux/writeback.h>
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5c7920be6420..60a9a4ae47be 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 
 #include "mds_client.h"
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index a32f0f896d9f..8f1715ffbe4b 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -6,6 +6,7 @@
 #include <linux/inet.h>
 #include <linux/kthread.h>
 #include <linux/net.h>
+#include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/string.h>
 #include <net/tcp.h>
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 890597c09d43..8fdc011ca956 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/sched.h>
 
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index d82fe87c2a6e..21c6623c4b07 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -1,4 +1,7 @@
 
+#include "ceph_debug.h"
+
+#include <linux/slab.h>
 #include <asm/div64.h>
 
 #include "super.h"
@@ -6,7 +9,6 @@
 #include "crush/hash.h"
 #include "crush/mapper.h"
 #include "decode.h"
-#include "ceph_debug.h"
 
 char *ceph_osdmap_state_str(char *str, int len, int state)
 {
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 370e93695474..5f8dbf7c745a 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -1,4 +1,5 @@
 
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index df04e210a055..e6f9bc57d472 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/sort.h>
+#include <linux/slab.h>
 
 #include "super.h"
 #include "decode.h"
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4290a6e860b0..75d02eaa1279 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -11,6 +11,7 @@
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
 #include <linux/version.h>
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 65d12036b670..ca702c67bc66 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -12,6 +12,7 @@
 #include <linux/pagemap.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
+#include <linux/slab.h>
 
 #include "types.h"
 #include "messenger.h"
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 37d6ce645691..2845422907fc 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -3,6 +3,7 @@
 #include "decode.h"
 
 #include <linux/xattr.h>
+#include <linux/slab.h>
 
 static bool ceph_is_valid_xattr(const char *name)
 {
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b1d61d0bdfc7..78e4d2a3a68b 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -15,6 +15,7 @@
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/vfs.h>
 #include <linux/fs.h>
 #include "cifsglob.h"
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 8ec7736ce954..310d12f69a92 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <keys/user-type.h>
 #include <linux/key-type.h>
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 714a542cbafc..d07676bd76d2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -19,6 +19,7 @@
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifs_unicode.h"
 #include "cifs_uniupr.h"
 #include "cifspdu.h"
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7dfe0842a6f6..9b716d044bbd 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsacl.h"
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7efe1745494d..fbe986430d0c 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifs_debug.h"
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 63c89d1d70b5..ecf0ffbe2b64 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
  */
 #include <linux/in.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <linux/slow-work.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7cc7f83e9314..3f4fbd670507 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -30,6 +30,7 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <asm/uaccess.h>
 #include "cifspdu.h"
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 45eb6cba793f..d9566bf8f917 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 87948147d7ec..6f8a0e3fb25b 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -23,6 +23,7 @@
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/slab.h>
 #include <keys/user-type.h>
 #include "dns_resolve.h"
 #include "cifsglob.h"
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ca2ba7a0193c..058b390d3da8 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -31,6 +31,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/delay.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 723daaccbd0e..35ec11716213 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index fc1e0487eaee..c1a9d4236a8c 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -20,6 +20,7 @@
  */
 #include <linux/fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index c343b14ba2d3..18e0bc1fb593 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -22,6 +22,7 @@
  */
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/stat.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index aaa9c1c5a5bd..7c3fd7463f44 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include <linux/utsname.h>
+#include <linux/slab.h>
 #include "cifs_spnego.h"
 
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 93fb09a99c69..192ea51af20f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -24,6 +24,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 07b8e71544ee..ad081fe7eb18 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -22,6 +22,7 @@
 
 #include <linux/fs.h>
 #include <linux/list.h>
+#include <linux/gfp.h>
 #include <linux/wait.h>
 #include <linux/net.h>
 #include <linux/delay.h>
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 3e2ef0de1209..f555ce077d4f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -21,6 +21,7 @@
 
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 4bb9d0a5decc..ccd98b0f2b0b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index ffd42815fda1..4c813f2cdc52 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 #include <linux/coda.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 830f51abb971..a1695dcadd99 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -18,6 +18,7 @@
 #include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/vfs.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index c274d949179d..f09c5ed76f6c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -26,6 +26,7 @@
 #include <linux/stat.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
diff --git a/fs/compat.c b/fs/compat.c
index 030602d453b7..4b6ed03cc478 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -49,6 +49,7 @@
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6d55b61bfa79..c32a1b6a856b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -23,7 +23,6 @@
 #include <linux/ioctl.h>
 #include <linux/if.h>
 #include <linux/if_bridge.h>
-#include <linux/slab.h>
 #include <linux/raid/md_u.h>
 #include <linux/kd.h>
 #include <linux/route.h>
@@ -60,6 +59,7 @@
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/atalk.h>
+#include <linux/gfp.h>
 
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci.h>
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a2f746066c5d..c8af2d91174b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -34,6 +34,7 @@
 #include <linux/capability.h>
 #include <linux/sched.h>
 #include <linux/lockdep.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 8421cea7d8c7..8c8d64230c2d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -29,6 +29,7 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 32a5f46b1157..0f3eb41d9201 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <linux/configfs.h>
 #include "configfs_internal.h"
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 049d6c36da09..30a87b3dbcac 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -27,6 +27,7 @@
 #include <linux/fsnotify.h>
 #include <linux/string.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 
 static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8882ecc0f1bf..0120247b41c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/mutex.h>
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0df243850818..b54bca03d92f 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
+#include <linux/slab.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <net/ipv6.h>
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 29d6139c35fc..c6cf25158746 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lock.h"
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 46ffd3eeaaf7..17903b491298 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
    L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
 */
 #include <linux/types.h>
+#include <linux/slab.h>
 #include "dlm_internal.h"
 #include <linux/dlm_device.h>
 #include "memory.h"
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 52cab160893c..c0d35c620526 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -51,6 +51,7 @@
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/sctp.h>
+#include <linux/slab.h>
 #include <net/sctp/user.h>
 #include <net/ipv6.h>
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 052095cd592f..2c6ad518100d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -9,6 +9,7 @@
 #include <net/genetlink.h>
 #include <linux/dlm.h>
 #include <linux/dlm_netlink.h>
+#include <linux/gfp.h>
 
 #include "dlm_internal.h"
 
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index b5f89aef3b29..d45c02db6943 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,7 @@
 #include <linux/poll.h>
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index a4bfd31ac45b..8b6e73c47435 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
+#include <linux/slab.h>
 
 #include "dlm_internal.h"
 #include "lockspace.h"
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 7cb0a59f4b9d..efb2b9400391 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
 #include <linux/crypto.h>
 #include <linux/file.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8f006a0d6076..906e803f7f79 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 678172b61be2..e7440a6f5ebf 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -25,6 +25,7 @@
 
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/security.h>
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4a430ab4115c..d3362faf3852 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/mount.h>
 #include <linux/crypto.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index a0a7847567e9..89c5476506ef 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -32,6 +32,7 @@
 #include <linux/random.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index e14cf7e588db..d8c3a373aafa 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -22,6 +22,7 @@
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/mount.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index ea2f92101dfe..af1a8f01ebac 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
 #include <linux/key.h>
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
+#include <linux/slab.h>
 #include "ecryptfs_kernel.h"
 
 /**
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index f1c17e87c5fb..2d8dbce9d485 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -20,6 +20,7 @@
  * 02111-1307, USA.
  */
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/user_namespace.h>
 #include <linux/nsproxy.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 4ec8f61ccf5a..3745f612bcd4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -24,6 +24,7 @@
 #include <linux/random.h>
 #include <linux/miscdevice.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/module.h>
 #include "ecryptfs_kernel.h"
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index df4ce99d0597..d491237c98e7 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..fcef41c1d2cf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -26,6 +26,7 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/key.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/smp_lock.h>
 #include <linux/file.h>
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 7758cc382ef0..6bd3f76fdf88 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -11,6 +11,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/anon_inodes.h>
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a17e4b733e35..76d2a79ef93e 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -31,6 +31,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
+#include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <scsi/scsi_device.h>
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index 5293bc411d17..4337cad7777b 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -22,6 +22,7 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
+#include <linux/slab.h>
 #include <scsi/scsi_device.h>
 #include <asm/div64.h>
 
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6cf5e4e84d61..18e57ea1e5b4 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
 #include <linux/vfs.h>
 #include <linux/random.h>
 #include <linux/exportfs.h>
+#include <linux/slab.h>
 
 #include "exofs.h"
 
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 1d081f0cfec2..3cf038c055d7 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -13,6 +13,7 @@
 
 #include "ext2.h"
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c8155845ac05..b118c6383c6d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext2_fs.h>
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 161da2d3f890..a177122a1b25 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 474348788dd9..3af91f476dff 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/ext3_jbd.h>
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 983f0e127493..538c48655084 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -18,6 +18,7 @@
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include "ext4.h"
 
 struct ext4_system_zone {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 11119e07233b..5381802d6052 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 54df209d2eed..bde9d0b170c2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -23,6 +23,7 @@
 
 #include "mballoc.h"
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 #include <trace/events/ext4.h>
 
 /*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8b87bd0eac95..34dcfc52ef44 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index aa5fe28d180f..d1fc662cc311 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -15,6 +15,7 @@
 
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "ext4.h"
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 983c253999a7..8b145e98df07 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "xattr.h"
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 923990e4f16e..113f0a1e565d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/buffer_head.h>
 #include "fat.h"
 
diff --git a/fs/fifo.c b/fs/fifo.c
index f8f97b8b6d44..5d6606ffc2d2 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
diff --git a/fs/filesystems.c b/fs/filesystems.c
index a24c58e181db..68ba492d8eef 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -10,10 +10,10 @@
 #include <linux/fs.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 /*
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index ed8f0b0dd880..1429f3ae1e86 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 
 #include "vxfs_extern.h"
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 76fc4d594acb..781a322ccb45 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 3221a0c7944e..1e1f286dd70e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -12,6 +12,7 @@
 #define FSCACHE_DEBUG_LEVEL COOKIE
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/key.h>
 #include <keys/user-type.h>
 #include "internal.h"
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 9f6c928d4586..f17cecafae44 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -14,6 +14,7 @@
 #define FSCACHE_DEBUG_LEVEL OPERATION
 #include <linux/module.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 atomic_t fscache_op_debug_id;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 69809024d71d..47aefd376e54 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -14,6 +14,7 @@
 #include <linux/fscache-cache.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /*
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index de792dcf3274..e1f8171278bd 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -44,6 +44,7 @@
 #include <linux/magic.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/stat.h>
 
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 55458031e501..fe5df5457656 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/generic_acl.h>
 #include <linux/posix_acl.h>
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 583e823307ae..5e411d5f4697 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 91beddadd388..bb7907bde3d8 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d15876e9aa26..c22c21174833 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 38e3749d476c..49f97d3bb690 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 569b46240f61..0e0470ed34c2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -9,6 +9,7 @@
 
 #include <linux/fs.h>
 #include <linux/dlm.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/gfs2_ondisk.h>
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index b4106ddaaa98..f07119d89557 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -10,6 +10,8 @@
 #ifndef __RGRP_DOT_H__
 #define __RGRP_DOT_H__
 
+#include <linux/slab.h>
+
 struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 419042f7f0b6..54fd98425991 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 226f2bfbf16a..53511291fe36 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,7 +7,6 @@
  * of the GNU General Public License version 2.
  */
 
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 0d200068d0af..cdb41a1f6a64 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 
 #include "btree.h"
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 052f214ea6f0..38a0a9917d7f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/log2.h>
 
 #include "btree.h"
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 8bbe03c3f6d5..86428f5ac991 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -11,6 +11,7 @@
 #include <linux/cdrom.h>
 #include <linux/genhd.h>
 #include <linux/nls.h>
+#include <linux/slab.h>
 
 #include "hfs_fs.h"
 #include "btree.h"
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 5ed7252b7b23..0a81eb7111f3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -19,6 +19,7 @@
 #include <linux/nls.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/vfs.h>
 
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 3fcbb0e1f6fc..572628b4b07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -15,6 +15,7 @@
 #include <linux/nls.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "hfsplus_fs.h"
 
 enum {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 032604e5ef2c..3a029d8f4cf1 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -11,6 +11,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/statfs.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include "hostfs.h"
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index b6fca543544c..eac5f96323e3 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -6,6 +6,7 @@
  *  general buffer i/o
  */
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 void hpfs_lock_creation(struct super_block *s)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 26e3964a4b8c..2338130cceba 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 static int hpfs_dir_release(struct inode *inode, struct file *filp)
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index ff90affb94e1..1042a9bc97f3 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include "hpfs_fn.h"
 
 void hpfs_init_inode(struct inode *i)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index cadc4ce48656..aa53842c599c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -15,6 +15,7 @@
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
 #include <linux/bitmap.h>
+#include <linux/slab.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c7c0b28d7d21..748cfb92dcc6 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -19,6 +19,7 @@
  * See also Documentation/block/ioprio.txt
  *
  */
+#include <linux/gfp.h>
 #include <linux/kernel.h>
 #include <linux/ioprio.h>
 #include <linux/blkdev.h>
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 8ba5441063be..b9ab69b3a482 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -11,6 +11,7 @@
  *  isofs directory handling functions
  */
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode)
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index eaa831311c9c..ab438beb867c 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/smp_lock.h>
+#include <linux/gfp.h>
 #include "isofs.h"
 
 /*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 2c90e3ef625f..ecb44c94ba8d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index cb1a49ae605e..54c9bc9e1b17 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #endif
 
 /*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 73063285b13f..049281b7cb89 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/jbd2.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/crc32.h>
 #endif
 
diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c
index 90cb60d09787..cd02acafde8a 100644
--- a/fs/jffs2/compr_lzo.c
+++ b/fs/jffs2/compr_lzo.c
@@ -11,7 +11,6 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/lzo.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index cfd301a5edfc..b46661a42758 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -14,7 +14,6 @@
 #endif
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/zlib.h>
 #include <linux/zutil.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 5544d31c066b..ec3538413926 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -15,6 +15,7 @@
 #include <linux/crc32.h>
 #include <linux/jffs2.h>
 #include <linux/mtd/mtd.h>
+#include <linux/slab.h>
 #include "nodelist.h"
 #include "debug.h"
 
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index b7b74e299142..e7291c161a19 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/time.h>
 #include <linux/pagemap.h>
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 87c6f555e1a0..af02bd138469 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -15,7 +15,6 @@
 #include <linux/mtd/mtd.h>
 #include <linux/rbtree.h>
 #include <linux/crc32.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include "nodelist.h"
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 21a052915aa9..191359dde4e1 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/mtd/mtd.h>
 #include <linux/compiler.h>
 #include <linux/sched.h> /* For cond_resched() */
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 4ec11e8bda8c..b955626071c2 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include "nodelist.h"
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ca29440e9435..c819eb0e982d 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/crc32.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mtd/mtd.h>
 #include "nodelist.h"
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 213169780b6c..1057a4998e4e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include "jfs_incore.h"
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d9b031cf69f5..6c4dfcbf3f55 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_dmap.h"
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 0e4623be70ce..9197a1b0d02d 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -102,6 +102,7 @@
 
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
 #include "jfs_filsys.h"
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 0fc30407f039..f8332dc8eeb2 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
+#include <linux/slab.h>
 
 #include "jfs_incore.h"
 #include "jfs_inode.h"
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 335c4de6552d..c51af2a14516 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -70,6 +70,7 @@
 #include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
 #include "jfs_metapage.h"
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 07b6c5dfb4b6..48b44bd8267b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/mempool.h>
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 3fbb3a225590..8f0f02cb6ca6 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -19,6 +19,7 @@
 #ifndef _H_JFS_UNICODE
 #define _H_JFS_UNICODE
 
+#include <linux/slab.h>
 #include <asm/byteorder.h>
 #include "jfs_types.h"
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 266699deb1c6..157382fa6256 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -30,6 +30,7 @@
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/smp_lock.h>
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 1f594ab21895..fa96bbb26343 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/slab.h>
 #include <linux/quotaops.h>
 #include <linux/security.h>
 #include "jfs_incore.h"
diff --git a/fs/libfs.c b/fs/libfs.c
index 9e50bcf55857..ea9a6cc9b35c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -5,6 +5,7 @@
 
 #include <linux/module.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/mutex.h>
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index fc9032dc8862..64fd427c993c 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/nfs_fs.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index c81249fef11f..7932c399fab4 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fefa4df3f005..e3015464fbab 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -10,6 +10,7 @@
 #include <linux/utsname.h>
 #include <linux/kernel.h>
 #include <linux/ktime.h>
+#include <linux/slab.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprtsock.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7d150517ddf0..f1bacf1a0391 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -21,7 +21,6 @@
 #include <linux/errno.h>
 #include <linux/in.h>
 #include <linux/uio.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/mutex.h>
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a7966eed3c17..031c6569a134 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d1001790fa9a..84055d31bfc5 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 56c9519d900a..0f2ab741ae7c 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/time.h>
-#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index ad478da7ca63..d0ef94cfb3da 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/time.h>
 #include <linux/in.h>
+#include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186d..18e8c144c7f1 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -9,6 +9,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 
 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
 
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb0120..e1cb99566100 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -6,7 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
-
+#include <linux/slab.h>
 
 /*
  * Atomic dir operations
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 92949f95a901..84e36f52fe95 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -7,6 +7,7 @@
  */
 #include "logfs.h"
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * Wear leveling needs to kick in when the difference between low erase
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 33ec1aeaeec4..14ed27274da2 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -6,6 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
+#include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c9052..f186043e862a 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -6,6 +6,7 @@
  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
  */
 #include "logfs.h"
+#include <linux/slab.h>
 
 static void logfs_calc_free(struct super_block *sb)
 {
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a7..d5919af2c7a7 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -18,6 +18,7 @@
  */
 #include "logfs.h"
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 static u64 adjust_bix(u64 bix, level_t level)
 {
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d55..614d7a6fda2d 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -10,6 +10,7 @@
  * three kinds of objects: inodes, dentries and blocks, both data and indirect.
  */
 #include "logfs.h"
+#include <linux/slab.h>
 
 static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
 {
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78dee..46990eafe052 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -11,6 +11,7 @@
  */
 #include "logfs.h"
 #include <linux/bio.h>
+#include <linux/slab.h>
 #include <linux/mtd/mtd.h>
 #include <linux/statfs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 82d6554b02fe..282e15ad8cd8 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -1,4 +1,5 @@
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include "minix.h"
 
 enum {DEPTH = 3, DIRECT = 7};	/* Only double indirect */
diff --git a/fs/mpage.c b/fs/mpage.c
index 598d54e200eb..fd56ca2ea556 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/kdev_t.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b8b5b30d53f0..7edfcd4d5e52 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -15,7 +15,6 @@
 #include <linux/errno.h>
 #include <linux/stat.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <asm/uaccess.h>
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 6a7d901f1936..1daabb90e0a5 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -15,7 +15,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index ec8f45f12e05..60a5e2864ea8 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458decdb8a..56f5b3a0e1ee 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -9,12 +9,12 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/shm.h>
 #include <linux/errno.h>
 #include <linux/mman.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/fcntl.h>
 #include <linux/ncp_fs.h>
 
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index e37df8d5fe70..c7ff6c700a6e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -21,6 +21,7 @@
 #include <linux/mm.h>
 #include <linux/netdevice.h>
 #include <linux/signal.h>
+#include <linux/slab.h>
 #include <net/scm.h>
 #include <net/sock.h>
 #include <linux/ipx.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e3d26c1bd105..c634fd17b337 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -27,6 +27,7 @@
 #include <linux/fs.h>
 #include <linux/ncp_fs.h>
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include "ncplib_kernel.h"
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index b4ffd0146ea6..84690319e625 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -10,6 +10,7 @@
 #include <linux/moduleparam.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 84761b5bb8e2..a08770a7e857 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -7,6 +7,7 @@
  */
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index a2b8b4df125d..05af212f0edf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -9,6 +9,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include "nfs4_fs.h"
 #include "callback.h"
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2274f1737336..2a3d352c0bff 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -35,6 +35,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/nfs_xdr.h>
 #include <linux/sunrpc/bc_xprt.h>
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2563bebc4c67..15671245c6ee 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -10,6 +10,7 @@
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/spinlock.h>
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0d289823e856..ad4cd31d6050 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -44,6 +44,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/kref.h>
+#include <linux/slab.h>
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 3f0cd4dfddaf..76fd235d0024 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -9,6 +9,7 @@
 #include <linux/hash.h>
 #include <linux/string.h>
 #include <linux/kmod.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/socket.h>
 #include <linux/seq_file.h>
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae0d92736531..8d965bddb87e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -24,9 +24,9 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/aio.h>
+#include <linux/gfp.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 237874f1af23..a6b16ed93229 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -17,6 +17,7 @@
 #include <linux/nfs_fs_sb.h>
 #include <linux/in6.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 
 #include "internal.h"
 #include "iostat.h"
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e358df75a6ad..737128f777f3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/nfs_xdr.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 40c766782891..7888cf36022d 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/dcache.h>
+#include <linux/gfp.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7bc2da8efd4a..81cf14257916 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -12,7 +12,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index bac60515a4b3..d150ae0c5ecd 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -1,4 +1,5 @@
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 24992f0a29f2..e701002694e5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 5fe5492fbd29..56a86f6ac8b5 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -9,7 +9,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index fa3408f20112..f071d12c613b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -11,6 +11,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/nfs_fs.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/vfs.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f9254fb0c9d0..d79a7b37e56c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -39,6 +39,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index dd17713413a5..38f3b582e7c2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -38,7 +38,6 @@
 #include <linux/param.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c752d944fe9e..0288be80444f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -29,7 +29,6 @@
 
 #include <linux/types.h>
 #include <linux/param.h>
-#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6baf9a393466..e01637240eeb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -48,6 +48,7 @@
 #include <linux/vfs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
+#include <linux/slab.h>
 #include <net/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/nfs_xdr.h>
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 2ea9e5c27e55..05c9e02f4153 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -19,7 +19,6 @@
 #include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/namei.h>
 
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 04133aacb1e5..fc1c52571c03 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/nfsacl.h>
 #include <linux/nfs3.h>
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index a0c4016413f1..872a5ef550c7 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -12,6 +12,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/module.h>
 #include <linux/exportfs.h>
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index f20589d2ae27..6aa5590c3679 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index e0c4846bad92..a596e9d987e4 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -7,6 +7,7 @@
 #include "nfsd.h"
 /* FIXME: nfsacl.h is a broken header */
 #include <linux/nfsacl.h>
+#include <linux/gfp.h>
 #include "cache.h"
 #include "xdr3.h"
 #include "vfs.h"
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 88150685df34..e48052615159 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,6 +34,7 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <linux/slab.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs4_acl.h>
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4bc22c763de7..7e32bd394e86 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/sunrpc/clnt.h>
+#include <linux/slab.h>
 #include "nfsd.h"
 #include "state.h"
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6e2983b27f3c..c78dbf493424 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 /*
  * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 37514c469846..2ab9e8501bfe 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -33,6 +33,7 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 #include <linux/file.h>
+#include <linux/slab.h>
 
 #include "cache.h"
 #include "xdr4.h"
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 98fb98e330b4..7a9ae3254a4b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -32,6 +32,7 @@
 */
 
 #include <linux/file.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c97fddbd17db..6a8fedaa4f55 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -34,6 +34,7 @@
 
 #include <linux/file.h>
 #include <linux/smp_lock.h>
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
 #include <linux/sunrpc/svcauth_gss.h>
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c47b4d7bafa7..e1703175ee28 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,6 +40,7 @@
  * at the end of nfs4svc_decode_compoundargs.
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/statfs.h>
 #include <linux/utsname.h>
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da08560c4818..4666a209678a 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -8,6 +8,8 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
+
 #include "nfsd.h"
 #include "cache.h"
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 0f0e77f2012f..e3591073098f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -4,6 +4,7 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
+#include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/ctype.h>
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a11b0e8678ee..6dd5f1970e01 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,6 +25,7 @@
 #include <linux/xattr.h>
 #include <linux/jhash.h>
 #include <linux/ima.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..8d6356a804f3 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -26,6 +26,7 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include "mdt.h"
 #include "alloc.h"
 
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 471e269536ae..447ce47a3306 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "mdt.h"
 #include "dat.h"
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 8880a9e281e7..145f03cd7d3e 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -45,6 +45,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/hash.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
 #include "page.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7868cc122ac7..0957b58f909d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <linux/uio.h>
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 313d0a21da48..c2ff1b306012 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -23,6 +23,7 @@
 #include <linux/fs.h>
 #include <linux/wait.h>
 #include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */
+#include <linux/slab.h>
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 06713ffcc7f2..024be8c35bb6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "segment.h"
 #include "page.h"
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index fc246dba112a..8de3e1e48130 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -29,6 +29,7 @@
 #include <linux/list.h>
 #include <linux/highmem.h>
 #include <linux/pagevec.h>
+#include <linux/gfp.h>
 #include "nilfs.h"
 #include "page.h"
 #include "mdt.h"
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 017bedc761a0..ba43146f3c30 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -23,6 +23,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
+#include <linux/slab.h>
 #include <linux/crc32.h>
 #include "nilfs.h"
 #include "segment.h"
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 6129a431aa34..17851f77f739 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/crc32.h>
 #include <linux/backing-dev.h>
+#include <linux/slab.h>
 #include "page.h"
 #include "segbuf.h"
 
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c161d89061b5..6a7dbd8451db 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -32,6 +32,7 @@
 #include <linux/kthread.h>
 #include <linux/crc32.h>
 #include <linux/pagevec.h>
+#include <linux/slab.h>
 #include "nilfs.h"
 #include "btnode.h"
 #include "page.h"
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index e9795f1724d7..1ab974533697 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/slab.h>
 #include "sb.h"
 
 /* the_nilfs struct */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 037e878e03fc..fcc2f064af83 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -18,6 +18,7 @@
 
 #include <linux/dcache.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/srcu.h>
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3165d85aada2..0399bcbe09c8 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -87,7 +87,6 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/writeback.h> /* for inode_lock */
 
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index cfce53cb65d7..c3c2c7ac9020 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -23,6 +23,7 @@
 
 #include <linux/errno.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 50d3b0c258e3..f5094ee224c1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -22,6 +22,7 @@
 
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 08f7530e9341..6551c7cbad92 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,7 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/vmalloc.h>
+#include <linux/slab.h>
 
 #include "attrib.h"
 #include "inode.h"
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9173e82a45d1..fe44d3feee4a 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 
 #include "dir.h"
 #include "aops.h"
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index b681c71d7069..8804f093ba75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
 #include <linux/sched.h>
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 2194eff49743..096c135691ae 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,6 +19,8 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/slab.h>
+
 #include "aops.h"
 #include "collate.h"
 #include "debug.h"
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 1caa0ef0b2bb..b572b6727181 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -21,6 +21,7 @@
  */
 
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 
 #include "attrib.h"
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 2ca00153b6ec..358273e59ade 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,6 +23,7 @@
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
 #include <linux/security.h>
+#include <linux/slab.h>
 
 #include "attrib.h"
 #include "debug.h"
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 8ccf0f8c9cc8..e13fc9e8fcdc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -21,6 +21,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 21c808f752d8..ecebb2276790 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #include <cluster/masklog.h>
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 5c9890006708..41d5f1f92d56 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -34,6 +34,7 @@
 #include <linux/crc32.h>
 #include <linux/time.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 
 #include "heartbeat.h"
 #include "tcp.h"
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index c81142e3ef84..ed0c9f367fed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -19,6 +19,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/configfs.h>
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 639024033fce..cf3e16696216 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -44,7 +44,6 @@
  * and if they're the last, they fire off the decision.
  */
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/workqueue.h>
 #include <linux/reboot.h>
 
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index dccc439fa087..a795eb91f4ea 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f283bce776b4..90803b47cd8c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 52ec020ea78b..11a6d1fd1d35 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 49e29ecd0201..b47c1b92b82b 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -28,7 +28,6 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c562a7581cf9..09e3fdfa6d33 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -24,6 +24,7 @@
 
 #include <linux/fs.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/fiemap.h>
 
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index c6e7213db868..1aa863dd901f 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -26,7 +26,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ab207901d32a..07cc8bb68b6d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 39737613424a..7898bd3a99f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/uio.h>
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 355f41d1d520..ab42a74c7539 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -3,6 +3,7 @@
  */
 #include <linux/spinlock.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_qtree.h>
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a6467f3d262e..9ad49305f450 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -3,6 +3,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/module.h>
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 29405f2ff616..bd96f6c7877e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -37,7 +37,6 @@
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 7020e1253ffa..0d3049f696c5 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -19,6 +19,7 @@
 
 #include <linux/kernel.h>
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 
 /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 5ae8812b2864..2dc57bca0688 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 40e53702948c..bfe7190cdbf1 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -25,7 +25,6 @@
 
 #include <linux/fs.h>
 #include <linux/types.h>
-#include <linux/slab.h>
 #include <linux/highmem.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 75d9b5ba1d45..c82af6acc2e7 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -6,6 +6,7 @@
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/parser.h>
diff --git a/fs/open.c b/fs/open.c
index e17f54454b50..74e5cd9f718e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -10,7 +10,6 @@
 #include <linux/fdtable.h>
 #include <linux/fsnotify.h>
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/tty.h>
 #include <linux/namei.h>
 #include <linux/backing-dev.h>
@@ -20,6 +19,7 @@
 #include <linux/mount.h>
 #include <linux/vfs.h>
 #include <linux/fcntl.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
 #include <linux/personality.h>
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c11777f..e238ab23a9e7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/ctype.h>
 #include <linux/genhd.h>
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 49cfd5f54238..91babdae7587 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,6 +95,7 @@
  ************************************************************/
 #include <linux/crc32.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 #include "check.h"
 #include "efi.h"
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index aa8637b81028..e51f2ec2c5e5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -68,7 +68,6 @@
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
 #include <linux/highmem.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c831..9e82adc37b0c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,6 +81,7 @@
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
+#include <linux/slab.h>
 #include "internal.h"
 
 /* NOTE:
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 08f4d71dacd7..43c127490606 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -13,6 +13,7 @@
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/idr.h>
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 445a02bcaab3..d35b23238fb1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/smp_lock.h>
 #include <linux/sysctl.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index b442dac8f5f9..19979a2ce272 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -19,6 +19,7 @@
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <linux/list.h>
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 9fe7d7ebe115..b1822dde55c2 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -21,7 +21,6 @@
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index f8650dce74fb..ce94801f48ca 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/of.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 04d1270f1c38..9020ac15baaa 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/module.h>
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index b9b7aad2003d..bf31b03fc275 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/fs.h>
-#include <linux/gfp.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 183f8ff5f400..2d45889931f6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5d9fd64ef81a..46d4b5d72bd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -5,6 +5,7 @@
 #include <linux/fs_struct.h>
 #include <linux/mount.h>
 #include <linux/ptrace.h>
+#include <linux/slab.h>
 #include <linux/seq_file.h>
 #include "internal.h"
 
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0872afa58d39..9fbc99ec799a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,6 +12,7 @@
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bootmem.h>
 #include <linux/init.h>
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 2663ed90fb03..d67908b407d9 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/quotaops.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <net/netlink.h>
 #include <net/genetlink.h>
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 1739a4aba25f..5ea4ad81a429 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -21,6 +21,7 @@
 #include <linux/pagevec.h>
 #include <linux/mman.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include "internal.h"
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a6090aa1a7c1..c94853473ca9 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -35,6 +35,7 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c094f58c7448..f8a6075abf50 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -8,6 +8,7 @@
 #include <linux/reiserfs_fs.h>
 #include <linux/stat.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 
 extern const struct reiserfs_key MIN_KEY;
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 6591cb21edf6..1e4250bc3a6f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -35,6 +35,7 @@
  **/
 
 #include <linux/time.h>
+#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1da94b82d8f..dc2c65e04853 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,6 +11,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
 #include <linux/buffer_head.h>
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index f3de5e8a2ae8..19fbc810e8e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -50,6 +50,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/uaccess.h>
+#include <linux/slab.h>
 
 #include <asm/system.h>
 
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 96e4cbbfaa18..d0c43cb99ffc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -13,6 +13,7 @@
 
 #include <linux/time.h>
 #include <linux/bitops.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
 #include <linux/reiserfs_xattr.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 04bf5d791bda..d8fd90d83ab3 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <asm/uaccess.h>
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 37d034ca7d99..4f9586bb7631 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -38,6 +38,7 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/errno.h>
+#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index dd20a7883f0f..9cdb759645a9 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index de1fcffd906b..7271a477c041 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
+#include <linux/slab.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/security.h>
 #include <asm/uaccess.h>
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 1dabe4ee02fe..f329849ce3c0 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/list.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 92d5e8ffb639..dbf6548bbf06 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -13,7 +13,6 @@
 #include <linux/fcntl.h>
 #include <linux/stat.h>
 #include <linux/mm.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/net.h>
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 6bd9b691a463..0e39a924f10a 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -12,7 +12,6 @@
 #include <linux/string.h>
 #include <linux/stat.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/file.h>
 #include <linux/dcache.h>
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 00b2909bd469..54350b59046b 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 39208663aaf1..9313b6124a2e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -30,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 #include <linux/security.h>
+#include <linux/gfp.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index e80be2022a7f..32b911f4ee39 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -33,7 +33,6 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4dd70e04333b..15a03d0fb9f3 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -24,6 +24,7 @@
 
 #include <linux/mutex.h>
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include <linux/zlib.h>
 
 #include "squashfs_fs.h"
diff --git a/fs/sync.c b/fs/sync.c
index f557d71cb097..fc5c3d75cf3c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -5,6 +5,7 @@
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 082daaecac1b..a4a0a9419711 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,7 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/security.h>
 #include "sysfs.h"
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 0cb10884a2fc..776137828dca 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/magic.h>
+#include <linux/slab.h>
 
 #include "sysfs.h"
 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1b9a3a1e8a17..b93ec51fa7ac 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 1bfc95ad5f71..98158de91d24 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/time.h>
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 4775af401167..37fa7ed062d8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -45,6 +45,7 @@
 
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 90492327b383..c2a68baa782f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,6 +34,7 @@
 #include <linux/moduleparam.h>
 #include <linux/debugfs.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e26c02ab6cd5..5692cf72b807 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -52,6 +52,7 @@
 #include "ubifs.h"
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 
 static int read_block(struct inode *inode, void *addr, unsigned int block,
 		      struct ubifs_data_node *dn)
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index e5a3d8e96bb7..918d1582ca05 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -53,6 +53,7 @@
  * good, and GC takes extra care when moving them.
  */
 
+#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/list_sort.h>
 #include "ubifs.h"
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e589fedaf1ef..77d5cf4a7547 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -51,6 +51,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index b2792e84d245..ad7f67b827ea 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -46,6 +46,7 @@
 #include "ubifs.h"
 #include <linux/crc16.h>
 #include <linux/math64.h>
+#include <linux/slab.h>
 
 /**
  * do_calc_lpt_geom - calculate sizes for the LPT area.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 8cbfb8248025..13cb7a4237bf 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -26,6 +26,7 @@
  */
 
 #include <linux/crc16.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 868a55ee080f..109c6ea03bb5 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /**
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 57085e43320f..96cb62c8a9dd 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -27,6 +27,7 @@
  */
 
 #include "ubifs.h"
+#include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/math64.h>
 
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index e5b1a7d00fa0..2194915220e5 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/crc32.h>
+#include <linux/slab.h>
 #include "ubifs.h"
 
 /*
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2d976366a46..bd2542dad014 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -28,6 +28,7 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 195830f47569..c74400f88fe0 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
  */
 
 #include "ubifs.h"
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index 4b540ee632d5..745eb209be0c 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
 
 #include <linux/fs.h>
 #include <linux/string.h>
-#include <linux/slab.h>
 #include <linux/buffer_head.h>
 
 uint32_t udf_get_pblock(struct super_block *sb, uint32_t block,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 852e91845688..16064787d2b7 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -26,7 +26,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index cefa8c8913e6..d03a90b6ad69 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>	/* for memset */
 #include <linux/nls.h>
 #include <linux/crc-itu-t.h>
+#include <linux/slab.h>
 
 #include "udf_sb.h"
 
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
index 05ac0fe9c4d3..8d5a506c82eb 100644
--- a/fs/xattr_acl.c
+++ b/fs/xattr_acl.c
@@ -6,9 +6,9 @@
  */
 
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/gfp.h>
 
 
 /*
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index bc7405585def..666c9db48eb6 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -17,6 +17,7 @@
  */
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index bf85bbe4a9ae..a7bc925c4d60 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -22,6 +22,7 @@
 #include "xfs_inode.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include <linux/slab.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 99628508cb11..0f8b9968a803 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -40,6 +40,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
 #include "xfs_bmap.h"
+#include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index bd111b7e1daa..44c2b0ef9a41 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -18,7 +18,7 @@
 #include "xfs.h"
 #include <linux/stddef.h>
 #include <linux/errno.h>
-#include <linux/slab.h>
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/vmalloc.h>
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 4ea1ee18aded..7b26cc2fd284 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -58,6 +58,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/exportfs.h>
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0bf6d61f0528..593c05b4df8d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -18,6 +18,7 @@
 #include <linux/compat.h>
 #include <linux/ioctl.h>
 #include <linux/mount.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 61a99608731e..e65a7937f3a4 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -56,6 +56,7 @@
 #include <linux/security.h>
 #include <linux/falloc.h>
 #include <linux/fiemap.h>
+#include <linux/slab.h>
 
 /*
  * Bring the timestamps in the XFS inode uptodate.
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 71345a370d9f..52e06b487ced 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -61,6 +61,7 @@
 
 #include <linux/namei.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/mempool.h>
 #include <linux/writeback.h>
-- 
cgit v1.2.2


From e05c378f4973674a16d5b9636f2310cf88aca5f2 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Tue, 30 Mar 2010 18:25:17 +0200
Subject: [LogFS] Remove unused method

All callers are long gone.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/logfs.h     |  1 -
 fs/logfs/readwrite.c | 21 ---------------------
 fs/logfs/segment.c   |  6 ------
 3 files changed, 28 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index b84b0eec6024..97195b9e93a5 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -305,7 +305,6 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
 		level_t level, int child_no, __be64 val);
 struct logfs_block_ops {
 	void	(*write_block)(struct logfs_block *block);
-	gc_level_t	(*block_level)(struct logfs_block *block);
 	void	(*free_block)(struct super_block *sb, struct logfs_block*block);
 	int	(*write_alias)(struct super_block *sb,
 			struct logfs_block *block,
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index c3a3a6814b84..3659c37fbd72 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -429,25 +429,6 @@ static void inode_write_block(struct logfs_block *block)
 	}
 }
 
-static gc_level_t inode_block_level(struct logfs_block *block)
-{
-	BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
-	return GC_LEVEL(LOGFS_MAX_LEVELS);
-}
-
-static gc_level_t indirect_block_level(struct logfs_block *block)
-{
-	struct page *page;
-	struct inode *inode;
-	u64 bix;
-	level_t level;
-
-	page = block->page;
-	inode = page->mapping->host;
-	logfs_unpack_index(page->index, &bix, &level);
-	return expand_level(inode->i_ino, level);
-}
-
 /*
  * This silences a false, yet annoying gcc warning.  I hate it when my editor
  * jumps into bitops.h each time I recompile this file.
@@ -586,14 +567,12 @@ static void indirect_free_block(struct super_block *sb,
 
 static struct logfs_block_ops inode_block_ops = {
 	.write_block = inode_write_block,
-	.block_level = inode_block_level,
 	.free_block = inode_free_block,
 	.write_alias = inode_write_alias,
 };
 
 struct logfs_block_ops indirect_block_ops = {
 	.write_block = indirect_write_block,
-	.block_level = indirect_block_level,
 	.free_block = indirect_free_block,
 	.write_alias = indirect_write_alias,
 };
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 0ecd8f07c11e..02db22ebbf13 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -182,14 +182,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
 	return 0;
 }
 
-static gc_level_t btree_block_level(struct logfs_block *block)
-{
-	return expand_level(block->ino, block->level);
-}
-
 static struct logfs_block_ops btree_block_ops = {
 	.write_block	= btree_write_block,
-	.block_level	= btree_block_level,
 	.free_block	= __free_block,
 	.write_alias	= btree_write_alias,
 };
-- 
cgit v1.2.2


From b7b7fa43103a9fb30dbcc60cbd5161fdfc25f904 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@jeffreymahoney.com>
Date: Mon, 29 Mar 2010 15:12:39 -0400
Subject: reiserfs: Fix locking BUG during mount failure

Commit 8ebc423238341b52912c7295b045a32477b33f09 (reiserfs: kill-the-BKL)
introduced a bug in the mount failure case.

The error label releases the lock before calling journal_release_error,
but it requires that the lock be held. do_journal_release unlocks and
retakes it. When it releases it without it held, we trigger a BUG().

The error_alloc label skips the unlock since the lock isn't held yet
but none of the other conditions that are clean up exist yet either.

This patch returns immediately after the kzalloc failure and moves
the reiserfs_write_unlock after the journal_release_error call.

This was reported in https://bugzilla.novell.com/show_bug.cgi?id=591807

Reported-by:  Thomas Siedentopf <thomas.siedentopf@novell.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Cc: Thomas Siedentopf <thomas.siedentopf@novell.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: 2.6.33.x <stable@kernel.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 fs/reiserfs/super.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 04bf5d791bda..ab190511bc18 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1618,10 +1618,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	save_mount_options(s, data);
 
 	sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-	if (!sbi) {
-		errval = -ENOMEM;
-		goto error_alloc;
-	}
+	if (!sbi)
+		return -ENOMEM;
 	s->s_fs_info = sbi;
 	/* Set default values for options: non-aggressive tails, RO on errors */
 	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
@@ -1878,12 +1876,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return (0);
 
 error:
-	reiserfs_write_unlock(s);
-error_alloc:
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
 
+	reiserfs_write_unlock(s);
+
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
-- 
cgit v1.2.2


From 9358c6d4c0264b1572554c49c4b92673ea9a5c72 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 30 Mar 2010 13:54:41 -0700
Subject: ceph: fix dentry rehashing on virtual .snap dir

If a lookup fails on the magic .snap directory, we bind it to a magic
snap directory inode in ceph_lookup_finish().  That code assumes the dentry
is unhashed, but a recent server-side change started returning NULL leases
on lookup failure, causing the .snap dentry to be hashed and NULL by
ceph_fill_trace().

This causes dentry hash chain corruption, or a dies when d_rehash()
includes
	BUG_ON(!d_unhashed(entry));

So, avoid processing the NULL dentry lease if it the dentry matches the
snapdir name in ceph_fill_trace().  That allows the lookup completion to
properly bind it to the snapdir inode.  BUG there if dentry is hashed to
be sure.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   |  1 +
 fs/ceph/inode.c | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 8a9116e15b70..aed8fda33024 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -488,6 +488,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
+		BUG_ON(!d_unhashed(dentry));
 		d_add(dentry, inode);
 		err = 0;
 	}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index aca82d55cc53..26f883c275e8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -886,6 +886,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct inode *in = NULL;
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
+	struct ceph_client *client = ceph_sb_to_client(sb);
 	int i = 0;
 	int err = 0;
 
@@ -949,7 +950,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			return err;
 	}
 
-	if (rinfo->head->is_dentry && !req->r_aborted) {
+	/*
+	 * ignore null lease/binding on snapdir ENOENT, or else we
+	 * will have trouble splicing in the virtual snapdir later
+	 */
+	if (rinfo->head->is_dentry && !req->r_aborted &&
+	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
+					       client->mount_args->snapdir_name,
+					       req->r_dentry->d_name.len))) {
 		/*
 		 * lookup link rename   : null -> possibly existing inode
 		 * mknod symlink mkdir  : null -> new inode
-- 
cgit v1.2.2


From 2f3014fc2ab1e25c36531e19164c48182c168995 Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Thu, 25 Mar 2010 17:22:45 +0000
Subject: Btrfs: remove duplicate include in ioctl.c

fs/btrfs/ioctl.c: ctree.h is included more than once.

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2845c6ceecd2..5c9f8b30608c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,7 +48,6 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
-#include "ctree.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
-- 
cgit v1.2.2


From 90d2c51dbb4db05c040bc7db264bb7ab35e35455 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Thu, 25 Mar 2010 12:37:12 +0000
Subject: Btrfs: add NULL check for do_walk_down()

btrfs_find_create_tree_block() may return NULL, so we must check the returned
value, or we will access a NULL pointer.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1727b26fb194..503a18eaef52 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5205,6 +5205,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	next = btrfs_find_tree_block(root, bytenr, blocksize);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		if (!next)
+			return -ENOMEM;
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
@@ -5417,7 +5419,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			path->slots[level]++;
 			continue;
-		}
+		} else if (ret < 0)
+			return ret;
 		level = wc->level;
 	}
 	return 0;
-- 
cgit v1.2.2


From 471fa17dff556ad38caf26de097c0630530d8cbe Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Thu, 25 Mar 2010 12:35:14 +0000
Subject: Btrfs: Remove unnecessary finish_wait() in wait_current_trans()

We only need to call finish_wait() after wait loop.

By the way, this patch makes code of waiting loop similar to
example in wait.h(no functional change)

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2d654c1c794d..43054285f638 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -147,18 +147,13 @@ static void wait_current_trans(struct btrfs_root *root)
 		while (1) {
 			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
-			if (cur_trans->blocked) {
-				mutex_unlock(&root->fs_info->trans_mutex);
-				schedule();
-				mutex_lock(&root->fs_info->trans_mutex);
-				finish_wait(&root->fs_info->transaction_wait,
-					    &wait);
-			} else {
-				finish_wait(&root->fs_info->transaction_wait,
-					    &wait);
+			if (!cur_trans->blocked)
 				break;
-			}
+			mutex_unlock(&root->fs_info->trans_mutex);
+			schedule();
+			mutex_lock(&root->fs_info->trans_mutex);
 		}
+		finish_wait(&root->fs_info->transaction_wait, &wait);
 		put_transaction(cur_trans);
 	}
 }
-- 
cgit v1.2.2


From ab59381ea43f81c977cbd09add26950aaf6cb9fe Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Thu, 25 Mar 2010 12:34:49 +0000
Subject: Btrfs: Add error handle for btrfs_search_slot() in
 btrfs_read_chunk_tree()

We need to check return value of btrfs_search_slot() in
btrfs_read_chunk_tree() and do corresponding error handing.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9df8e3f1ccab..c6c80093eac0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3389,6 +3389,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	key.type = 0;
 again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
 	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
-- 
cgit v1.2.2


From f3eae7e8a5ed124bbc781e18ea10c21856017322 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Thu, 25 Mar 2010 12:32:59 +0000
Subject: Btrfs: Simplify num_stripes's calculation logical for
 __btrfs_alloc_chunk()

We can use this simple method to make source more readable.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c6c80093eac0..bf3bec3e4130 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2198,9 +2198,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		num_stripes = min_t(u64, 2, fs_devices->rw_devices);
-		if (num_stripes < 2)
+		if (fs_devices->rw_devices < 2)
 			return -ENOSPC;
+		num_stripes = 2;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-- 
cgit v1.2.2


From 683be16eb6e19a35aca2473668652259ed074094 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 20 Mar 2010 11:24:48 +0000
Subject: Btrfs: dereferencing freed memory

The original code dereferenced range on the next line.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5c9f8b30608c..874d36e5f167 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1374,6 +1374,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 					   sizeof(*range))) {
 				ret = -EFAULT;
 				kfree(range);
+				goto out;
 			}
 			/* compression requires us to start the IO */
 			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
-- 
cgit v1.2.2


From c2b96929e2ca6914cf4a66cd8fe2a34c4a98277f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 20 Mar 2010 11:24:15 +0000
Subject: Btrfs: handle kmalloc() failure in inode lookup ioctl

Return -ENOMEM if kmalloc() fails.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 874d36e5f167..74d89133f768 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1211,6 +1211,9 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 		return -EPERM;
 
 	args = kmalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
 	if (copy_from_user(args, argp, sizeof(*args))) {
 		kfree(args);
 		return -EFAULT;
-- 
cgit v1.2.2


From 6cf8bfbf5e88edfb09a2bf0631a067060f534592 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 20 Mar 2010 11:22:10 +0000
Subject: Btrfs: check btrfs_get_extent return for IS_ERR()

btrfs_get_extent() never returns NULL, only a valid pointer or ERR_PTR()

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 74d89133f768..2b7dd88fc54f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -510,7 +510,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 		unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
 
-		if (!em)
+		if (IS_ERR(em))
 			return 0;
 	}
 
-- 
cgit v1.2.2


From 1b1d1f6625e517a08640ddb4b8f8a0e025243fe3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Mar 2010 20:49:55 +0000
Subject: Btrfs: fail to mount if we have problems reading the block groups

We don't actually check the return value of btrfs_read_block_groups, so we can
possibly succeed to mount, but then fail to say read the superblock xattr for
selinux which will cause the vfs code to deactivate the super.

This is a problem because in find_free_extent we just assume that we
will find the right space_info for the allocation we want.  But if we
failed to read the block groups, we won't have setup any space_info's,
and we'll hit a NULL pointer deref in find_free_extent.

This patch fixes that problem by checking the return value of
btrfs_read_block_groups, and failing out properly.  I've also added a
check in find_free_extent so if for some reason we don't find an
appropriate space_info, we just return -ENOSPC.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 11 ++++++++---
 fs/btrfs/extent-tree.c |  5 ++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 11d0ad30e203..5ae1c0fcfce0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1922,7 +1922,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
-	btrfs_read_block_groups(extent_root);
+	ret = btrfs_read_block_groups(extent_root);
+	if (ret) {
+		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
+		goto fail_block_groups;
+	}
 
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
@@ -1932,7 +1936,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
-		goto fail_csum_root;
+		goto fail_block_groups;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -2020,7 +2024,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
-fail_csum_root:
+fail_block_groups:
+	btrfs_free_block_groups(fs_info);
 	free_extent_buffer(csum_root->node);
 	free_extent_buffer(csum_root->commit_root);
 fail_dev_root:
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 503a18eaef52..4c910359c807 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4170,6 +4170,10 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	ins->offset = 0;
 
 	space_info = __find_space_info(root->fs_info, data);
+	if (!space_info) {
+		printk(KERN_ERR "No space info for %d\n", data);
+		return -ENOSPC;
+	}
 
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
@@ -7372,7 +7376,6 @@ static int find_first_block_group(struct btrfs_root *root,
 		}
 		path->slots[0]++;
 	}
-	ret = -ENOENT;
 out:
 	return ret;
 }
-- 
cgit v1.2.2


From 287a0ab91d25ca982f895a76402e5893b47ed7a6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Mar 2010 18:07:23 +0000
Subject: Btrfs: kill max_extent mount option

As Yan pointed out, theres not much reason for all this complicated math to
account for file extents being split up into max_extent chunks, since they are
likely to all end up in the same leaf anyway.  Since there isn't much reason to
use max_extent, just remove the option altogether so we have one less thing we
need to test.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        |  1 -
 fs/btrfs/disk-io.c      |  1 -
 fs/btrfs/extent-tree.c  |  4 ++--
 fs/btrfs/inode.c        | 59 ++++---------------------------------------------
 fs/btrfs/ordered-data.c |  6 +++--
 fs/btrfs/super.c        | 23 +++----------------
 6 files changed, 13 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 11115847d875..ae8c40922c54 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -834,7 +834,6 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
-	u64 max_extent;
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5ae1c0fcfce0..6632e5c4c8bb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1634,7 +1634,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	fs_info->sb = sb;
-	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4c910359c807..5f6bc283c0c8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2846,7 +2846,7 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
 	}
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
-	BTRFS_I(inode)->reserved_extents--;
+	BTRFS_I(inode)->reserved_extents -= num_items;
 	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
 
 	if (meta_sinfo->bytes_delalloc < num_bytes) {
@@ -3111,7 +3111,7 @@ again:
 		return -ENOSPC;
 	}
 
-	BTRFS_I(inode)->reserved_extents++;
+	BTRFS_I(inode)->reserved_extents += num_items;
 	check_force_delalloc(meta_sinfo);
 	spin_unlock(&meta_sinfo->lock);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2a337a09c650..a85b90c86cb0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -796,7 +796,7 @@ static noinline int cow_file_range(struct inode *inode,
 	while (disk_num_bytes > 0) {
 		unsigned long op;
 
-		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
+		cur_alloc_size = disk_num_bytes;
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
@@ -1227,30 +1227,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 static int btrfs_split_extent_hook(struct inode *inode,
 				    struct extent_state *orig, u64 split)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 size;
-
 	if (!(orig->state & EXTENT_DELALLOC))
 		return 0;
 
-	size = orig->end - orig->start + 1;
-	if (size > root->fs_info->max_extent) {
-		u64 num_extents;
-		u64 new_size;
-
-		new_size = orig->end - split + 1;
-		num_extents = div64_u64(size + root->fs_info->max_extent - 1,
-					root->fs_info->max_extent);
-
-		/*
-		 * if we break a large extent up then leave oustanding_extents
-		 * be, since we've already accounted for the large extent.
-		 */
-		if (div64_u64(new_size + root->fs_info->max_extent - 1,
-			      root->fs_info->max_extent) < num_extents)
-			return 0;
-	}
-
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->outstanding_extents++;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1268,38 +1247,10 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 				   struct extent_state *new,
 				   struct extent_state *other)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 new_size, old_size;
-	u64 num_extents;
-
 	/* not delalloc, ignore it */
 	if (!(other->state & EXTENT_DELALLOC))
 		return 0;
 
-	old_size = other->end - other->start + 1;
-	if (new->start < other->start)
-		new_size = other->end - new->start + 1;
-	else
-		new_size = new->end - other->start + 1;
-
-	/* we're not bigger than the max, unreserve the space and go */
-	if (new_size <= root->fs_info->max_extent) {
-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents--;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		return 0;
-	}
-
-	/*
-	 * If we grew by another max_extent, just return, we want to keep that
-	 * reserved amount.
-	 */
-	num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
-				root->fs_info->max_extent);
-	if (div64_u64(new_size + root->fs_info->max_extent - 1,
-		      root->fs_info->max_extent) > num_extents)
-		return 0;
-
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->outstanding_extents--;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -1328,6 +1279,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -1356,6 +1308,7 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 
 		if (bits & EXTENT_DO_ACCOUNTING) {
 			spin_lock(&BTRFS_I(inode)->accounting_lock);
+			WARN_ON(!BTRFS_I(inode)->outstanding_extents);
 			BTRFS_I(inode)->outstanding_extents--;
 			spin_unlock(&BTRFS_I(inode)->accounting_lock);
 			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5384,7 +5337,6 @@ free:
 void btrfs_drop_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-
 	if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
 		generic_delete_inode(inode);
 	else
@@ -5788,18 +5740,15 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
-	u64 alloc_size;
 	u64 cur_offset = start;
 	u64 num_bytes = end - start;
 	int ret = 0;
 	u64 i_size;
 
 	while (num_bytes > 0) {
-		alloc_size = min(num_bytes, root->fs_info->max_extent);
-
 		trans = btrfs_start_transaction(root, 1);
 
-		ret = btrfs_reserve_extent(trans, root, alloc_size,
+		ret = btrfs_reserve_extent(trans, root, num_bytes,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a8ffecd0b491..5c99882b9763 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -303,6 +303,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry)
 {
 	struct btrfs_ordered_inode_tree *tree;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct rb_node *node;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
@@ -312,12 +313,13 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	WARN_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 	btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
 					      inode, 1);
 
-	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	spin_lock(&root->fs_info->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
 
 	/*
@@ -329,7 +331,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
 	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
 	}
-	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->fs_info->ordered_extent_lock);
 
 	return 0;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9ac612e6ca60..d11b12fc086b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -64,10 +64,9 @@ static void btrfs_put_super(struct super_block *sb)
 
 enum {
 	Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
-	Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start,
-	Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool,
-	Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
-	Opt_flushoncommit,
+	Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
+	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
+	Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
 	Opt_discard, Opt_err,
 };
 
@@ -79,7 +78,6 @@ static match_table_t tokens = {
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
 	{Opt_nobarrier, "nobarrier"},
-	{Opt_max_extent, "max_extent=%s"},
 	{Opt_max_inline, "max_inline=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
@@ -188,18 +186,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				       info->thread_pool_size);
 			}
 			break;
-		case Opt_max_extent:
-			num = match_strdup(&args[0]);
-			if (num) {
-				info->max_extent = memparse(num, NULL);
-				kfree(num);
-
-				info->max_extent = max_t(u64,
-					info->max_extent, root->sectorsize);
-				printk(KERN_INFO "btrfs: max_extent at %llu\n",
-				       (unsigned long long)info->max_extent);
-			}
-			break;
 		case Opt_max_inline:
 			num = match_strdup(&args[0]);
 			if (num) {
@@ -529,9 +515,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",nodatacow");
 	if (btrfs_test_opt(root, NOBARRIER))
 		seq_puts(seq, ",nobarrier");
-	if (info->max_extent != (u64)-1)
-		seq_printf(seq, ",max_extent=%llu",
-			   (unsigned long long)info->max_extent);
 	if (info->max_inline != 8192 * 1024)
 		seq_printf(seq, ",max_inline=%llu",
 			   (unsigned long long)info->max_inline);
-- 
cgit v1.2.2


From 0cad8a1130f77c7c445e3298c0e3593b3c0ef439 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 17 Mar 2010 20:45:56 +0000
Subject: Btrfs: fix chunk allocate size calculation

If the amount of free space left in a device is less than what we think should
be the minimum size, just ignore the minimum size and use the amount we have.  I
ran into this running tests on a 600mb volume, the chunk allocator wouldn't let
me allocate the last 52mb of the disk for data because we want to have at least
64mb chunks for data.  This patch fixes that problem.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bf3bec3e4130..9bf1f581b872 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2244,8 +2244,10 @@ again:
 		do_div(calc_size, stripe_len);
 		calc_size *= stripe_len;
 	}
+
 	/* we don't want tiny stripes */
-	calc_size = max_t(u64, min_stripe_size, calc_size);
+	if (!looped)
+		calc_size = max_t(u64, min_stripe_size, calc_size);
 
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
-- 
cgit v1.2.2


From 753234007f4ac2c96921cfb19ec1ba535ac29790 Mon Sep 17 00:00:00 2001
From: Li Hong <lihong.hi@gmail.com>
Date: Wed, 31 Mar 2010 15:41:00 +0800
Subject: nilfs2: fix a wrong type conversion in nilfs_ioctl()

(void * __user *) should be (void __user *)

Signed-off-by: Li Hong <lihong.hi@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 313d0a21da48..c446017141d8 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -648,7 +648,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
-	void __user *argp = (void * __user *)arg;
+	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
 	case NILFS_IOCTL_CHANGE_CPMODE:
-- 
cgit v1.2.2


From 30d1872d9eb3663b4cf7bdebcbf5cd465674cced Mon Sep 17 00:00:00 2001
From: Nikolaus Schulz <microschulz@web.de>
Date: Thu, 1 Apr 2010 02:21:10 +0900
Subject: fat: fix buffer overflow in vfat_create_shortname()

When using the string representation of a random counter as part of the base
name, ensure that it is no longer than 4 bytes.

Since we are repeatedly decrementing the counter in a loop until we have found a
unique base name, the counter may wrap around zero; therefore, it is not enough
to mask its higher bits before entering the loop, this must be done inside the
loop.

[hirofumi@mail.parknet.co.jp: use snprintf()]
Signed-off-by: Nikolaus Schulz <microschulz@web.de>
Cc: stable@kernel.org
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/namei_vfat.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c1ef50154868..6fcc7e71fbaa 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 {
 	struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
 	wchar_t *ip, *ext_start, *end, *name_start;
-	unsigned char base[9], ext[4], buf[8], *p;
+	unsigned char base[9], ext[4], buf[5], *p;
 	unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
 	int chl, chi;
 	int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 			return 0;
 	}
 
-	i = jiffies & 0xffff;
+	i = jiffies;
 	sz = (jiffies >> 16) & 0x7;
 	if (baselen > 2) {
 		baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
 	name_res[baselen + 4] = '~';
 	name_res[baselen + 5] = '1' + sz;
 	while (1) {
-		sprintf(buf, "%04X", i);
+		snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
 		memcpy(&name_res[baselen], buf, 4);
 		if (vfat_find_form(dir, name_res) < 0)
 			break;
-- 
cgit v1.2.2


From b95c35e76b29ba812e5dabdd91592e25ec640e93 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 1 Apr 2010 15:13:57 +0200
Subject: oom: fix the unsafe usage of badness() in proc_oom_score()

proc_oom_score(task) has a reference to task_struct, but that is all.
If this task was already released before we take tasklist_lock

	- we can't use task->group_leader, it points to nowhere

	- it is not safe to call badness() even if this task is
	  ->group_leader, has_intersects_mems_allowed() assumes
	  it is safe to iterate over ->thread_group list.

	- even worse, badness() can hit ->signal == NULL

Add the pid_alive() check to ensure __unhash_process() was not called.

Also, use "task" instead of task->group_leader. badness() should return
the same result for any sub-thread. Currently this is not true, but
this should be changed anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c831..b1f6e62773d3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -442,12 +442,13 @@ static const struct file_operations proc_lstats_operations = {
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-	unsigned long points;
+	unsigned long points = 0;
 	struct timespec uptime;
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
-	points = badness(task->group_leader, uptime.tv_sec);
+	if (pid_alive(task))
+		points = badness(task, uptime.tv_sec);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
-- 
cgit v1.2.2


From 80e755fedebc8de0599a79efad2c656503df2e62 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 31 Mar 2010 21:52:10 -0700
Subject: ceph: allow writeback of snapped pages older than 'oldest' snapc

On snap deletion, we don't regenerate ceph_cap_snaps for inodes with dirty
pages because deletion does not affect metadata writeback.  However, we
did run into problems when we went to write back the pages because the
'oldest' snapc is determined by the oldest cap_snap, and that may be the
newer snapc that reflects the deletion.  This caused confusion and an
infinite loop in ceph_update_writeable_page().

Change the snapc checks to allow writeback of any snapc that is equal to
OR older than the 'oldest' snapc.

When there are no cap_snaps, we were also using the realm's latest snapc
for writeback, which complicates ceph_put_wrbufffer_cap_refs().  Instead,
use i_head_snapc, the most snapc used for the most recent ('head') data.
This makes the writeback snapc (ceph_osd_request.r_snapc) _always_ match a
capsnap or i_head_snapc.

Also, in writepags_finish(), drop the snapc referenced by the _page_
and do not assume it matches the request snapc (it may not anymore).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index ce8ef6107727..a313e9baeed0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -356,8 +356,8 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
 			break;
 		}
 	}
-	if (!snapc && ci->i_snap_realm) {
-		snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+	if (!snapc && ci->i_head_snapc) {
+		snapc = ceph_get_snap_context(ci->i_head_snapc);
 		dout(" head snapc %p has %d dirty pages\n",
 		     snapc, ci->i_wrbuffer_ref_head);
 	}
@@ -412,7 +412,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		dout("writepage %p page %p not dirty?\n", inode, page);
 		goto out;
 	}
-	if (snapc != get_oldest_context(inode, &snap_size)) {
+	if (snapc->seq > get_oldest_context(inode, &snap_size)->seq) {
 		dout("writepage %p page %p snapc %p not writeable - noop\n",
 		     inode, page, (void *)page->private);
 		/* we should only noop if called by kswapd */
@@ -557,9 +557,9 @@ static void writepages_finish(struct ceph_osd_request *req,
 			dout("inode %p skipping page %p\n", inode, page);
 			wbc->pages_skipped++;
 		}
+		ceph_put_snap_context((void *)page->private);
 		page->private = 0;
 		ClearPagePrivate(page);
-		ceph_put_snap_context(snapc);
 		dout("unlocking %d %p\n", i, page);
 		end_page_writeback(page);
 
@@ -617,7 +617,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	int range_whole = 0;
 	int should_loop = 1;
 	pgoff_t max_pages = 0, max_pages_ever = 0;
-	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
 	struct pagevec pvec;
 	int done = 0;
 	int rc = 0;
@@ -769,9 +769,10 @@ get_more_pages:
 			}
 
 			/* only if matching snap context */
-			if (snapc != (void *)page->private) {
-				dout("page snapc %p != oldest %p\n",
-				     (void *)page->private, snapc);
+			pgsnapc = (void *)page->private;
+			if (pgsnapc->seq > snapc->seq) {
+				dout("page snapc %p %lld > oldest %p %lld\n",
+				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
 				unlock_page(page);
 				if (!locked_pages)
 					continue; /* keep looking for snap */
@@ -935,8 +936,8 @@ static int ceph_update_writeable_page(struct file *file,
 	int pos_in_page = pos & ~PAGE_CACHE_MASK;
 	int end_in_page = pos_in_page + len;
 	loff_t i_size;
-	struct ceph_snap_context *snapc;
 	int r;
+	struct ceph_snap_context *snapc, *oldest;
 
 retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
@@ -946,16 +947,16 @@ retry_locked:
 	BUG_ON(!ci->i_snap_realm);
 	down_read(&mdsc->snap_rwsem);
 	BUG_ON(!ci->i_snap_realm->cached_context);
-	if (page->private &&
-	    (void *)page->private != ci->i_snap_realm->cached_context) {
+	snapc = (void *)page->private;
+	if (snapc && snapc != ci->i_head_snapc) {
 		/*
 		 * this page is already dirty in another (older) snap
 		 * context!  is it writeable now?
 		 */
-		snapc = get_oldest_context(inode, NULL);
+		oldest = get_oldest_context(inode, NULL);
 		up_read(&mdsc->snap_rwsem);
 
-		if (snapc != (void *)page->private) {
+		if (snapc->seq > oldest->seq) {
 			dout(" page %p snapc %p not current or oldest\n",
 			     page, (void *)page->private);
 			/*
-- 
cgit v1.2.2


From 6298a33757ba7361bb8f506c106daad77e5ac8cf Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 31 Mar 2010 22:01:38 -0700
Subject: ceph: fix snap context reference leaks

The get_oldest_context() helper takes a reference to the returned snap
context, but most callers weren't dropping that reference.  Fix them.

Also drop the unused locked __get_oldest_context() variant.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a313e9baeed0..41f1f713b7ba 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -336,16 +336,15 @@ out:
 /*
  * Get ref for the oldest snapc for an inode with dirty data... that is, the
  * only snap context we are allowed to write back.
- *
- * Caller holds i_lock.
  */
-static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
-						      u64 *snap_size)
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+						    u64 *snap_size)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_snap_context *snapc = NULL;
 	struct ceph_cap_snap *capsnap = NULL;
 
+	spin_lock(&inode->i_lock);
 	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
 		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
 		     capsnap->context, capsnap->dirty_pages);
@@ -361,16 +360,6 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
 		dout(" head snapc %p has %d dirty pages\n",
 		     snapc, ci->i_wrbuffer_ref_head);
 	}
-	return snapc;
-}
-
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
-						    u64 *snap_size)
-{
-	struct ceph_snap_context *snapc = NULL;
-
-	spin_lock(&inode->i_lock);
-	snapc = __get_oldest_context(inode, snap_size);
 	spin_unlock(&inode->i_lock);
 	return snapc;
 }
@@ -391,7 +380,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	int len = PAGE_CACHE_SIZE;
 	loff_t i_size;
 	int err = 0;
-	struct ceph_snap_context *snapc;
+	struct ceph_snap_context *snapc, *oldest;
 	u64 snap_size = 0;
 	long writeback_stat;
 
@@ -412,13 +401,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		dout("writepage %p page %p not dirty?\n", inode, page);
 		goto out;
 	}
-	if (snapc->seq > get_oldest_context(inode, &snap_size)->seq) {
+	oldest = get_oldest_context(inode, &snap_size);
+	if (snapc->seq > oldest->seq) {
 		dout("writepage %p page %p snapc %p not writeable - noop\n",
 		     inode, page, (void *)page->private);
 		/* we should only noop if called by kswapd */
 		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		ceph_put_snap_context(oldest);
 		goto out;
 	}
+	ceph_put_snap_context(oldest);
 
 	/* is this a partial page at end of file? */
 	if (snap_size)
@@ -457,7 +449,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	ClearPagePrivate(page);
 	end_page_writeback(page);
 	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
-	ceph_put_snap_context(snapc);
+	ceph_put_snap_context(snapc);  /* page's reference */
 out:
 	return err;
 }
@@ -914,7 +906,10 @@ static int context_is_writeable_or_written(struct inode *inode,
 					   struct ceph_snap_context *snapc)
 {
 	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
-	return !oldest || snapc->seq <= oldest->seq;
+	int ret = !oldest || snapc->seq <= oldest->seq;
+
+	ceph_put_snap_context(oldest);
+	return ret;
 }
 
 /*
@@ -957,13 +952,14 @@ retry_locked:
 		up_read(&mdsc->snap_rwsem);
 
 		if (snapc->seq > oldest->seq) {
+			ceph_put_snap_context(oldest);
 			dout(" page %p snapc %p not current or oldest\n",
-			     page, (void *)page->private);
+			     page, snapc);
 			/*
 			 * queue for writeback, and wait for snapc to
 			 * be writeable or written
 			 */
-			snapc = ceph_get_snap_context((void *)page->private);
+			snapc = ceph_get_snap_context(snapc);
 			unlock_page(page);
 			ceph_queue_writeback(inode);
 			r = wait_event_interruptible(ci->i_cap_wq,
@@ -973,6 +969,7 @@ retry_locked:
 				return r;
 			return -EAGAIN;
 		}
+		ceph_put_snap_context(oldest);
 
 		/* yay, writeable, do it now (without dropping page lock) */
 		dout(" page %p snapc %p not current, but oldest\n",
-- 
cgit v1.2.2


From 819ccbfa448403992ceafc05d6d7097aaa74d4c3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 1 Apr 2010 09:33:46 -0700
Subject: ceph: fix leaked inode ref due to snap metadata writeback race

We create a ceph_cap_snap if there is dirty cap metadata (for writeback to
mds) OR dirty pages (for writeback to osd).  It is thus possible that the
metadata has been written back to the MDS but the OSD data has not when
the cap_snap is created.  This results in a cap_snap with dirty(caps) == 0.
The problem is that cap writeback to the MDS isn't necessary, and a
FLUSHSNAP cap op gets no ack from the MDS.  This leaves the cap_snap
attached to the inode along with its inode reference.

Fix the problem by dropping the cap_snap if it becomes 'complete' (all
pages written out) and dirty(caps) == 0 in ceph_put_wrbuffer_cap_refs().

Also, BUG() in __ceph_flush_snaps() if we encounter a cap_snap with
dirty(caps) == 0.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 42 ++++++++++++++++++++++++++++++++----------
 fs/ceph/snap.c | 10 ++++++----
 2 files changed, 38 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7d0a0d0adc18..b6fdf010749b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1204,6 +1204,12 @@ retry:
 		if (capsnap->dirty_pages || capsnap->writing)
 			continue;
 
+		/*
+		 * if cap writeback already occurred, we should have dropped
+		 * the capsnap in ceph_put_wrbuffer_cap_refs.
+		 */
+		BUG_ON(capsnap->dirty == 0);
+
 		/* pick mds, take s_mutex */
 		mds = __ceph_get_cap_mds(ci, &mseq);
 		if (session && session->s_mds != mds) {
@@ -2117,8 +2123,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 		}
 	spin_unlock(&inode->i_lock);
 
-	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
-	     last ? "last" : "");
+	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
+	     last ? " last" : "", put ? " put" : "");
 
 	if (last && !flushsnaps)
 		ceph_check_caps(ci, 0, NULL);
@@ -2142,7 +2148,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 {
 	struct inode *inode = &ci->vfs_inode;
 	int last = 0;
-	int last_snap = 0;
+	int complete_capsnap = 0;
+	int drop_capsnap = 0;
 	int found = 0;
 	struct ceph_cap_snap *capsnap = NULL;
 
@@ -2165,19 +2172,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
 			if (capsnap->context == snapc) {
 				found = 1;
-				capsnap->dirty_pages -= nr;
-				last_snap = !capsnap->dirty_pages;
 				break;
 			}
 		}
 		BUG_ON(!found);
+		capsnap->dirty_pages -= nr;
+		if (capsnap->dirty_pages == 0) {
+			complete_capsnap = 1;
+			if (capsnap->dirty == 0)
+				/* cap writeback completed before we created
+				 * the cap_snap; no FLUSHSNAP is needed */
+				drop_capsnap = 1;
+		}
 		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
-		     " snap %lld %d/%d -> %d/%d %s%s\n",
+		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
 		     inode, capsnap, capsnap->context->seq,
 		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
 		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
 		     last ? " (wrbuffer last)" : "",
-		     last_snap ? " (capsnap last)" : "");
+		     complete_capsnap ? " (complete capsnap)" : "",
+		     drop_capsnap ? " (drop capsnap)" : "");
+		if (drop_capsnap) {
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+		}
 	}
 
 	spin_unlock(&inode->i_lock);
@@ -2185,10 +2205,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 	if (last) {
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 		iput(inode);
-	} else if (last_snap) {
+	} else if (complete_capsnap) {
 		ceph_flush_snaps(ci);
 		wake_up(&ci->i_cap_wq);
 	}
+	if (drop_capsnap)
+		iput(inode);
 }
 
 /*
@@ -2464,8 +2486,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 				break;
 			}
 			WARN_ON(capsnap->dirty_pages || capsnap->writing);
-			dout(" removing cap_snap %p follows %lld\n",
-			     capsnap, follows);
+			dout(" removing %p cap_snap %p follows %lld\n",
+			     inode, capsnap, follows);
 			ceph_put_snap_context(capsnap->context);
 			list_del(&capsnap->ci_item);
 			list_del(&capsnap->flushing_item);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index df04e210a055..7e3e5f9edaa4 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -521,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 	capsnap->ctime = inode->i_ctime;
 	capsnap->time_warp_seq = ci->i_time_warp_seq;
 	if (capsnap->dirty_pages) {
-		dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
+		dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
 		     "still has %d dirty pages\n", inode, capsnap,
 		     capsnap->context, capsnap->context->seq,
-		     capsnap->size, capsnap->dirty_pages);
+		     ceph_cap_string(capsnap->dirty), capsnap->size,
+		     capsnap->dirty_pages);
 		return 0;
 	}
-	dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
+	dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
 	     inode, capsnap, capsnap->context,
-	     capsnap->context->seq, capsnap->size);
+	     capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+	     capsnap->size);
 
 	spin_lock(&mdsc->snap_flush_lock);
 	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
-- 
cgit v1.2.2


From 308f44193f796b1c522b3b87760e43d8d8e316d2 Mon Sep 17 00:00:00 2001
From: Li Hong <lihong.hi@gmail.com>
Date: Fri, 2 Apr 2010 18:40:39 +0800
Subject: nilfs2: Remove an uninitialization warning in
 nilfs_btree_propagate_v()

`make CONFIG_NILFS2_FS=m M=fs/nilfs2/` will give the following warnings:

fs/nilfs2/btree.c: In function 'nilfs_btree_propagate':
fs/nilfs2/btree.c:1882: warning: 'maxlevel' may be used uninitialized in this function
fs/nilfs2/btree.c:1882: note: 'maxlevel' was declared here

Set maxlevel = 0 to fix it.

Signed-off-by: Li Hong <lihong.hi@gmail.com>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 7cdd98b8d514..76c38e3e19d2 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1879,7 +1879,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
 				   struct nilfs_btree_path *path,
 				   int level, struct buffer_head *bh)
 {
-	int maxlevel, ret;
+	int maxlevel = 0, ret;
 	struct nilfs_btree_node *parent;
 	struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
 	__u64 ptr;
-- 
cgit v1.2.2


From 0e0d5e0c4bb0476d53a43bfc87d03a25ec4b5579 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 2 Apr 2010 16:07:19 -0700
Subject: ceph: fix ack counter reset on connection reset

If in_seq_acked isn't reset along with in_seq, we don't ack received
messages until we reach the old count, consuming gobs memory on the other
end of the connection and introducing a large delay when those messages
are eventually deleted.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index a32f0f896d9f..f35b4945a9c3 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -332,6 +332,7 @@ static void reset_connection(struct ceph_connection *con)
 		con->out_msg = NULL;
 	}
 	con->in_seq = 0;
+	con->in_seq_acked = 0;
 }
 
 /*
-- 
cgit v1.2.2


From a24e2d7d8f512340991ef0a59cb5d08d491b8e98 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 3 Apr 2010 17:20:21 +0000
Subject: [CIFS] initialize nbytes at the beginning of CIFSSMBWrite()

By doing this we always overwrite nbytes value that is being passed on to
CIFSSMBWrite() and need not rely on the callers to initialize. CIFSSMBWrite2 is
doing this already.

CC: Stable <stable@kernel.org>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7cc7f83e9314..e1f90a3a0162 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1430,6 +1430,8 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	__u32 bytes_sent;
 	__u16 byte_count;
 
+	*nbytes = 0;
+
 	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
@@ -1512,7 +1514,6 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
 		cFYI(1, ("Send error in write = %d", rc));
-		*nbytes = 0;
 	} else {
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
-- 
cgit v1.2.2


From 6513a81e9325d712f1bfb9a1d7b750134e49ff18 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Wed, 31 Mar 2010 12:00:03 +0530
Subject: cifs: Fix a kernel BUG with remote OS/2 server (try #3)

While chasing a bug report involving a OS/2 server, I noticed the server sets
pSMBr->CountHigh to a incorrect value even in case of normal writes. This
results in 'nbytes' being computed wrongly and triggers a kernel BUG at
mm/filemap.c.

void iov_iter_advance(struct iov_iter *i, size_t bytes)
{
        BUG_ON(i->count < bytes);    <--- BUG here

Why the server is setting 'CountHigh' is not clear but only does so after
writing 64k bytes. Though this looks like the server bug, the client side
crash may not be acceptable.

The workaround is to mask off high 16 bits if the number of bytes written as
returned by the server is greater than the bytes requested by the client as
suggested by Jeff Layton.

CC: Stable <stable@kernel.org>
Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index e1f90a3a0162..f213b8ae43c1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1518,6 +1518,14 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
 		*nbytes += le16_to_cpu(pSMBr->Count);
+
+		/*
+		 * Mask off high 16 bits when bytes written as returned by the
+		 * server is greater than bytes requested by the client. Some
+		 * OS/2 servers are known to set incorrect CountHigh values.
+		 */
+		if (*nbytes > count)
+			*nbytes &= 0xFFFF;
 	}
 
 	cifs_buf_release(pSMB);
@@ -1606,6 +1614,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
 		*nbytes += le16_to_cpu(pSMBr->Count);
+
+		/*
+		 * Mask off high 16 bits when bytes written as returned by the
+		 * server is greater than bytes requested by the client. OS/2
+		 * servers are known to set incorrect CountHigh values.
+		 */
+		if (*nbytes > count)
+			*nbytes &= 0xFFFF;
 	}
 
 /*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
-- 
cgit v1.2.2


From d82ef020cf31504c816803b1def94eb5ff173363 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Fri, 2 Apr 2010 09:11:29 +0900
Subject: proc: pagemap: Hold mmap_sem during page walk

In initial design, walk_page_range() was designed just for walking page
table and it didn't require mmap_sem.  Now, find_vma() etc..  are used
in walk_page_range() and we need mmap_sem around it.

This patch adds mmap_sem around walk_page_range().

Because /proc/<pid>/pagemap's callback routine use put_user(), we have
to get rid of it to do sane fix.

Changelog: 2010/Apr/2
 - fixed start_vaddr and end overflow
Changelog: 2010/Apr/1
 - fixed start_vaddr calculation
 - removed unnecessary cast.
 - removed unnecessary change in smaps.
 - use GFP_TEMPORARY instead of GFP_KERNEL

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: San Mehat <san@google.com>
Cc: Brian Swetland <swetland@google.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
[ Fixed kmalloc failure return code as per Matt ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 87 ++++++++++++++++++++++++------------------------------
 1 file changed, 38 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 183f8ff5f400..096273984c3b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -406,6 +406,7 @@ static int show_smap(struct seq_file *m, void *v)
 
 	memset(&mss, 0, sizeof mss);
 	mss.vma = vma;
+	/* mmap_sem is held in m_start */
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
@@ -552,7 +553,8 @@ const struct file_operations proc_clear_refs_operations = {
 };
 
 struct pagemapread {
-	u64 __user *out, *end;
+	int pos, len;
+	u64 *buffer;
 };
 
 #define PM_ENTRY_BYTES      sizeof(u64)
@@ -575,10 +577,8 @@ struct pagemapread {
 static int add_to_pagemap(unsigned long addr, u64 pfn,
 			  struct pagemapread *pm)
 {
-	if (put_user(pfn, pm->out))
-		return -EFAULT;
-	pm->out++;
-	if (pm->out >= pm->end)
+	pm->buffer[pm->pos++] = pfn;
+	if (pm->pos >= pm->len)
 		return PM_END_OF_BUFFER;
 	return 0;
 }
@@ -720,21 +720,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
  * determine which areas of memory are actually mapped and llseek to
  * skip over unmapped regions.
  */
+#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
 static ssize_t pagemap_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	struct page **pages, *page;
-	unsigned long uaddr, uend;
 	struct mm_struct *mm;
 	struct pagemapread pm;
-	int pagecount;
 	int ret = -ESRCH;
 	struct mm_walk pagemap_walk = {};
 	unsigned long src;
 	unsigned long svpfn;
 	unsigned long start_vaddr;
 	unsigned long end_vaddr;
+	int copied = 0;
 
 	if (!task)
 		goto out;
@@ -757,35 +756,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!mm)
 		goto out_task;
 
-
-	uaddr = (unsigned long)buf & PAGE_MASK;
-	uend = (unsigned long)(buf + count);
-	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
-	ret = 0;
-	if (pagecount == 0)
-		goto out_mm;
-	pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
+	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
-	if (!pages)
+	if (!pm.buffer)
 		goto out_mm;
 
-	down_read(&current->mm->mmap_sem);
-	ret = get_user_pages(current, current->mm, uaddr, pagecount,
-			     1, 0, pages, NULL);
-	up_read(&current->mm->mmap_sem);
-
-	if (ret < 0)
-		goto out_free;
-
-	if (ret != pagecount) {
-		pagecount = ret;
-		ret = -EFAULT;
-		goto out_pages;
-	}
-
-	pm.out = (u64 __user *)buf;
-	pm.end = (u64 __user *)(buf + count);
-
 	pagemap_walk.pmd_entry = pagemap_pte_range;
 	pagemap_walk.pte_hole = pagemap_pte_hole;
 	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -807,23 +783,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	 * user buffer is tracked in "pm", and the walk
 	 * will stop when we hit the end of the buffer.
 	 */
-	ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
-	if (ret == PM_END_OF_BUFFER)
-		ret = 0;
-	/* don't need mmap_sem for these, but this looks cleaner */
-	*ppos += (char __user *)pm.out - buf;
-	if (!ret)
-		ret = (char __user *)pm.out - buf;
-
-out_pages:
-	for (; pagecount; pagecount--) {
-		page = pages[pagecount-1];
-		if (!PageReserved(page))
-			SetPageDirty(page);
-		page_cache_release(page);
+	ret = 0;
+	while (count && (start_vaddr < end_vaddr)) {
+		int len;
+		unsigned long end;
+
+		pm.pos = 0;
+		end = start_vaddr + PAGEMAP_WALK_SIZE;
+		/* overflow ? */
+		if (end < start_vaddr || end > end_vaddr)
+			end = end_vaddr;
+		down_read(&mm->mmap_sem);
+		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
+		up_read(&mm->mmap_sem);
+		start_vaddr = end;
+
+		len = min(count, PM_ENTRY_BYTES * pm.pos);
+		if (copy_to_user(buf, pm.buffer, len) < 0) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+		copied += len;
+		buf += len;
+		count -= len;
 	}
+	*ppos += copied;
+	if (!ret || ret == PM_END_OF_BUFFER)
+		ret = copied;
+
 out_free:
-	kfree(pages);
+	kfree(pm.buffer);
 out_mm:
 	mmput(mm);
 out_task:
-- 
cgit v1.2.2


From 476ada0436351672fbf482db54cb94b8ba877709 Mon Sep 17 00:00:00 2001
From: Sripathi Kodi <sripathik@in.ibm.com>
Date: Wed, 17 Mar 2010 17:02:38 +0000
Subject: 9p: Fix setting of protocol flags in v9fs_session_info structure.

This patch fixes a simple bug I left behind in my earlier protocol
negotiation patch.

Thanks,
Sripathi.

Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6c7f6a251115..2f814cf0c731 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -241,7 +241,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_PROTO_2000U | V9FS_ACCESS_USER;
+	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
@@ -262,8 +262,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (!p9_is_proto_dotu(v9ses->clnt))
-		v9ses->flags &= ~V9FS_PROTO_2000U;
+	if (p9_is_proto_dotl(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000L;
+	else if (p9_is_proto_dotu(v9ses->clnt))
+		v9ses->flags |= V9FS_PROTO_2000U;
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
-- 
cgit v1.2.2


From 5b0fa207d1a6f27c9a2f2d707147dce01af21db7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 19 Mar 2010 12:47:26 +0000
Subject: fs/9p: Clunk the fid resulting from partial walk of the name

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/fid.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 08b2eb157048..b0a23c7e736e 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -110,7 +110,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	int i, n, l, clone, any, access;
 	u32 uid;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *d, *ds;
 	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
@@ -183,10 +183,18 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 		l = min(n - i, P9_MAXWELEM);
 		fid = p9_client_walk(fid, l, &wnames[i], clone);
 		if (IS_ERR(fid)) {
+			if (old_fid) {
+				/*
+				 * If we fail, clunk fid which are mapping
+				 * to path component and not the last component
+				 * of the path.
+				 */
+				p9_client_clunk(old_fid);
+			}
 			kfree(wnames);
 			return fid;
 		}
-
+		old_fid = fid;
 		i += l;
 		clone = 0;
 	}
-- 
cgit v1.2.2


From d994f4058d9f9be7e44529b55fc6be6552901ead Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 29 Mar 2010 18:14:50 -0500
Subject: 9p: drop nlink remove

We need to drop the link count on the inode of a sucessfull remove

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5fe45d692c9f..ae8396707707 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -431,6 +431,7 @@ error:
 
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
+	int retval;
 	struct inode *file_inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *v9fid;
@@ -444,7 +445,10 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	if (IS_ERR(v9fid))
 		return PTR_ERR(v9fid);
 
-	return p9_client_remove(v9fid);
+	retval = p9_client_remove(v9fid);
+	if (!retval)
+		drop_nlink(file_inode);
+	return retval;
 }
 
 static int
-- 
cgit v1.2.2


From 6d96d3ab7aea5f0e75205a0c97f8d1fdf82c5287 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 29 Mar 2010 18:13:59 -0500
Subject: 9p: Make sure we are able to clunk the cached fid on umount

dcache prune happen on umount. So we cannot mark the client
satus disconnect. That will prevent a 9p call to the server

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.c      | 13 +++++++++++++
 fs/9p/v9fs.h      |  1 +
 fs/9p/vfs_super.c |  3 ++-
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2f814cf0c731..0b78ae8157b9 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -342,6 +342,19 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 	p9_client_disconnect(v9ses->clnt);
 }
 
+/**
+ * v9fs_session_begin_cancel - Begin terminate of a session
+ * @v9ses: session to terminate
+ *
+ * After this call we don't allow any request other than clunk.
+ */
+
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_client_begin_disconnect(v9ses->clnt);
+}
+
 extern int v9fs_error_init(void);
 
 static struct kobject *v9fs_kobj;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6b801d1ddf4b..a0a8d3dd1361 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -108,6 +108,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 									char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
 void v9fs_session_cancel(struct v9fs_session_info *v9ses);
+void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses);
 
 #define V9FS_MAGIC 0x01021997
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 69357c0d9899..d1a3c809a291 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -193,6 +193,7 @@ static void v9fs_kill_super(struct super_block *s)
 
 	kill_anon_super(s);
 
+	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	s->s_fs_info = NULL;
@@ -205,7 +206,7 @@ v9fs_umount_begin(struct super_block *sb)
 	struct v9fs_session_info *v9ses;
 
 	v9ses = sb->s_fs_info;
-	v9fs_session_cancel(v9ses);
+	v9fs_session_begin_cancel(v9ses);
 }
 
 static const struct super_operations v9fs_super_ops = {
-- 
cgit v1.2.2


From 11e9b49b7fa056bfc00a56de8956d1d5fe8b84ea Mon Sep 17 00:00:00 2001
From: Sripathi Kodi <sripathik@in.ibm.com>
Date: Mon, 29 Mar 2010 18:13:59 -0500
Subject: 9p: Creating files with names too long should fail with ENAMETOOLONG.

Signed-off-by: Sripathi Kodi <sripathik@in.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index ae8396707707..905f664b8bc6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -660,6 +660,9 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
 		dir, dentry->d_name.name, dentry, nameidata);
 
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_fid_lookup(dentry->d_parent);
-- 
cgit v1.2.2


From 28ecb60906e86e74e9ad4ac7e0218d8631e73a94 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 17 Mar 2010 13:31:04 +0000
Subject: Btrfs: use add_to_page_cache_lru, use __page_cache_alloc

Pagecache pages should be allocated with __page_cache_alloc, so they
obey pagecache memory policies.

add_to_page_cache_lru is exported, so it should be used. Benefits over
using a private pagevec: neater code, 128 bytes fewer stack used, percpu
lru ordering is preserved, and finally don't need to flush pagevec
before returning so batching may be shared with other LRU insertions.

Signed-off-by: Nick Piggin <npiggin@suse.de>:
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 22 ++++------------------
 fs/btrfs/extent_io.c   | 15 +--------------
 2 files changed, 5 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 28b92a7218ab..1d54c5308df5 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,6 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/pagevec.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -445,7 +444,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	unsigned long nr_pages = 0;
 	struct extent_map *em;
 	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
 	struct extent_map_tree *em_tree;
 	struct extent_io_tree *tree;
 	u64 end;
@@ -461,7 +459,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
-	pagevec_init(&pvec, 0);
 	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
@@ -478,26 +475,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS);
+		page = __page_cache_alloc(mapping_gfp_mask(mapping) &
+								~__GFP_FS);
 		if (!page)
 			break;
 
-		page->index = page_index;
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (add_to_page_cache(page, mapping,
-				      page->index, GFP_NOFS)) {
+		if (add_to_page_cache_lru(page, mapping, page_index,
+								GFP_NOFS)) {
 			page_cache_release(page);
 			goto next;
 		}
 
-		/* open coding of lru_cache_add, also not exported */
-		page_cache_get(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add_file(&pvec);
-
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
 		 * at this point, we have a locked page in the page cache
@@ -551,8 +539,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c99121ac5d6b..fc742e59815e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2679,33 +2679,20 @@ int extent_readpages(struct extent_io_tree *tree,
 {
 	struct bio *bio = NULL;
 	unsigned page_idx;
-	struct pagevec pvec;
 	unsigned long bio_flags = 0;
 
-	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (!add_to_page_cache(page, mapping,
+		if (!add_to_page_cache_lru(page, mapping,
 					page->index, GFP_KERNEL)) {
-
-			/* open coding of lru_cache_add, also not exported */
-			page_cache_get(page);
-			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add_file(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
 						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add_file(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		submit_one_bio(READ, bio, 0, bio_flags);
-- 
cgit v1.2.2


From b5cb160084fad438c513d0952849e597ffe9e3d9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 12 Mar 2010 19:28:18 +0000
Subject: Btrfs: fix small race with delalloc flushing waitqueue's

Everytime we start a new flushing thread, we init the waitqueue if there isn't a
flushing thread running.  The problem with this is we check
space_info->flushing, which we clear right before doing a wake_up on the
flushing waitqueue, which causes problems if we init the waitqueue in the middle
of clearing the flushing flagh and calling wake_up.  This is hard to hit, but
the code is wrong anyway, so init the flushing/allocating waitqueue when
creating the space info and let it be.  I haven't seen the panic since I've been
using this patch.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5f6bc283c0c8..101041d4d2b2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2676,6 +2676,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	INIT_LIST_HEAD(&found->block_groups);
 	init_rwsem(&found->groups_sem);
+	init_waitqueue_head(&found->flush_wait);
+	init_waitqueue_head(&found->allocate_wait);
 	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
@@ -2944,12 +2946,10 @@ static void flush_delalloc(struct btrfs_root *root,
 
 	spin_lock(&info->lock);
 
-	if (!info->flushing) {
+	if (!info->flushing)
 		info->flushing = 1;
-		init_waitqueue_head(&info->flush_wait);
-	} else {
+	else
 		wait = true;
-	}
 
 	spin_unlock(&info->lock);
 
@@ -3011,7 +3011,6 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
 	if (!info->allocating_chunk) {
 		info->force_alloc = 1;
 		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->allocate_wait);
 	} else {
 		wait = true;
 	}
-- 
cgit v1.2.2


From 6bdb72ded1e281cd8844918c39d00cdd0e59f655 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 15 Mar 2010 17:27:13 +0000
Subject: Btrfs: create snapshot references in same commit as snapshot

This creates the reference to a new snapshot in the same commit as the
snapshot itself.  This avoids the need for a second commit in order for a
snapshot to be persistent, and also avoids the problem of "leaking" a
new snapshot tree root if the host crashes before the second commit takes
place.

It is not at all clear to me why it wasn't always done this way.  If there
is still a reason for the two-stage {create,finish}_pending_snapshots()
approach I'm missing something!  :)

I've been running this for a couple weeks under pretty heavy usage (a few
snapshots per minute) without obvious problems.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 97 ++++++++++++++++----------------------------------
 1 file changed, 31 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 43054285f638..01cebd661997 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -755,10 +755,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root_item *new_root_item;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root = pending->root;
+	struct btrfs_root *parent_root;
+	struct inode *parent_inode;
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	int ret;
 	u64 objectid;
+	int namelen;
+	u64 index = 0;
+
+	parent_inode = pending->dentry->d_parent->d_inode;
+	parent_root = BTRFS_I(parent_inode)->root;
 
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
@@ -769,79 +776,59 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto fail;
 
-	record_root_in_trans(trans, root);
-	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
-	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
-
 	key.objectid = objectid;
 	/* record when the snapshot was created in key.offset */
 	key.offset = trans->transid;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
-	old = btrfs_lock_root_node(root);
-	btrfs_cow_block(trans, root, old, NULL, 0, &old);
-	btrfs_set_lock_blocking(old);
-
-	btrfs_copy_root(trans, root, old, &tmp, objectid);
-	btrfs_tree_unlock(old);
-	free_extent_buffer(old);
-
-	btrfs_set_root_node(new_root_item, tmp);
-	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-				new_root_item);
-	btrfs_tree_unlock(tmp);
-	free_extent_buffer(tmp);
-	if (ret)
-		goto fail;
-
-	key.offset = (u64)-1;
 	memcpy(&pending->root_key, &key, sizeof(key));
-fail:
-	kfree(new_root_item);
-	return ret;
-}
-
-static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
-				   struct btrfs_pending_snapshot *pending)
-{
-	int ret;
-	int namelen;
-	u64 index = 0;
-	struct btrfs_trans_handle *trans;
-	struct inode *parent_inode;
-	struct btrfs_root *parent_root;
-
-	parent_inode = pending->dentry->d_parent->d_inode;
-	parent_root = BTRFS_I(parent_inode)->root;
-	trans = btrfs_join_transaction(parent_root, 1);
+	pending->root_key.offset = (u64)-1;
 
+	record_root_in_trans(trans, parent_root);
 	/*
 	 * insert the directory item
 	 */
 	namelen = strlen(pending->name);
 	ret = btrfs_set_inode_index(parent_inode, &index);
+	BUG_ON(ret);
 	ret = btrfs_insert_dir_item(trans, parent_root,
 			    pending->name, namelen,
 			    parent_inode->i_ino,
 			    &pending->root_key, BTRFS_FT_DIR, index);
-
-	if (ret)
-		goto fail;
+	BUG_ON(ret);
 
 	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
 	ret = btrfs_update_inode(trans, parent_root, parent_inode);
 	BUG_ON(ret);
 
+	record_root_in_trans(trans, root);
+	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
+	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+
+	old = btrfs_lock_root_node(root);
+	btrfs_cow_block(trans, root, old, NULL, 0, &old);
+	btrfs_set_lock_blocking(old);
+
+	btrfs_copy_root(trans, root, old, &tmp, objectid);
+	btrfs_tree_unlock(old);
+	free_extent_buffer(old);
+
+	btrfs_set_root_node(new_root_item, tmp);
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				new_root_item);
+	BUG_ON(ret);
+	btrfs_tree_unlock(tmp);
+	free_extent_buffer(tmp);
+
 	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
 				 pending->root_key.objectid,
 				 parent_root->root_key.objectid,
 				 parent_inode->i_ino, index, pending->name,
 				 namelen);
-
 	BUG_ON(ret);
 
 fail:
-	btrfs_end_transaction(trans, fs_info->fs_root);
+	kfree(new_root_item);
 	return ret;
 }
 
@@ -862,25 +849,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
-					     struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_pending_snapshot *pending;
-	struct list_head *head = &trans->transaction->pending_snapshots;
-	int ret;
-
-	while (!list_empty(head)) {
-		pending = list_entry(head->next,
-				     struct btrfs_pending_snapshot, list);
-		ret = finish_pending_snapshot(fs_info, pending);
-		BUG_ON(ret);
-		list_del(&pending->list);
-		kfree(pending->name);
-		kfree(pending);
-	}
-	return 0;
-}
-
 static void update_super_roots(struct btrfs_root *root)
 {
 	struct btrfs_root_item *root_item;
@@ -1092,9 +1060,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_finish_extent_commit(trans, root);
 
-	/* do the directory inserts of any pending snapshot creations */
-	finish_pending_snapshots(trans, root->fs_info);
-
 	mutex_lock(&root->fs_info->trans_mutex);
 
 	cur_trans->commit_done = 1;
-- 
cgit v1.2.2


From 109f6aef5fc436f355ad027f4d97bd696df2049a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 Apr 2010 09:20:18 -0400
Subject: Btrfs: add check for changed leaves in setup_leaf_for_split

setup_leaf_for_split needs to drop the path and search again, and has
checks to see if the item we want to split changed size.  But, it misses
the case where the leaf changed and now has enough room for the item
we want to insert.

This adds an extra check to make sure the leaf really needs splitting
before we call btrfs_split_leaf(), which keeps us from trying to split
a leaf with a single item.

btrfs_split_leaf() will blindly split the single item leaf, leaving us
with one good leaf and one empty leaf and then a crash.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..babf7fbaec84 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3040,6 +3040,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 	if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
 		goto err;
 
+	/* the leaf has  changed, it now has room.  return now */
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
+		goto err;
+
 	if (key.type == BTRFS_EXTENT_DATA_KEY) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
-- 
cgit v1.2.2


From 85a770a8889035625466a4cfb1393cd7d2ffd165 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 30 Mar 2010 09:41:25 +0000
Subject: 9p: return on mutex_lock_interruptible()

If "err" is -EINTR here the original code calls mutex_unlock() and then
returns, but it should just return directly.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>

------------------------------------------------------------------------------
Download Intel&#174; Parallel Studio Eval
Try the new software tools for yourself. Speed compiling, find bugs
proactively, and fine-tune applications for parallel performance.
See why Intel Parallel Studio got high marks during beta.
http://p.sf.net/sfu/intel-sw-dev
---
 fs/9p/vfs_dir.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index d8a3afe4ff72..bbe00cf799fa 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -130,6 +130,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	rdir = (struct p9_rdir *) fid->rdir;
 
 	err = mutex_lock_interruptible(&rdir->mutex);
+	if (err)
+		return err;
 	while (err == 0) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
-- 
cgit v1.2.2


From ab6e24103cbd215e922938a4f58c75194761a60e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Mar 2010 14:38:13 +0000
Subject: Btrfs: fix data enospc check overflow

Because we account for reserved space we get from the allocator before we
actually account for allocating delalloc space, we can have a small window where
the amount of "used" space in a space_info is more than the total amount of
space in the space_info.  This will cause a overflow in our check, so it will
seem like we have _tons_ of free space, and we'll allow reservations to occur
that will end up larger than the amount of space we have.  I've seen users
report ENOSPC panic's in cow_file_range a few times recently, so I tried to
reproduce this problem and found I could reproduce it if I ran one of my tests
in a loop for like 20 minutes.  With this patch my test ran all night without
issues.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 101041d4d2b2..786899da3eb9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3234,7 +3234,8 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes)
 {
 	struct btrfs_space_info *data_sinfo;
-	int ret = 0, committed = 0;
+	u64 used;
+	int ret = 0, committed = 0, flushed = 0;
 
 	/* make sure bytes are sectorsize aligned */
 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3246,12 +3247,21 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 again:
 	/* make sure we have enough space to handle the data first */
 	spin_lock(&data_sinfo->lock);
-	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
-	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
-	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
-	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
+	used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
+		data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
+		data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
+		data_sinfo->bytes_super;
+
+	if (used + bytes > data_sinfo->total_bytes) {
 		struct btrfs_trans_handle *trans;
 
+		if (!flushed) {
+			spin_unlock(&data_sinfo->lock);
+			flush_delalloc(root, data_sinfo);
+			flushed = 1;
+			goto again;
+		}
+
 		/*
 		 * if we don't have enough free bytes in this space then we need
 		 * to alloc a new chunk.
-- 
cgit v1.2.2


From 9f680ce04ea19dabbbafe01b57b61930a9b70741 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 6 Apr 2010 09:37:47 -0400
Subject: Btrfs: make sure the chunk allocator doesn't create zero length
 chunks

A recent commit allowed for smaller chunks to be created, but didn't
make sure they were always bigger than a stripe.  After some divides,
this led to zero length stripes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9bf1f581b872..b584e9a2add2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2249,6 +2249,12 @@ again:
 	if (!looped)
 		calc_size = max_t(u64, min_stripe_size, calc_size);
 
+	/*
+	 * we're about to do_div by the stripe_len so lets make sure
+	 * we end up with something bigger than a stripe
+	 */
+	calc_size = max_t(u64, calc_size, stripe_len * 4);
+
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
 
-- 
cgit v1.2.2


From 309361e09ca9e9670dc8664e5d14125bf82078af Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 6 Apr 2010 13:45:39 +0300
Subject: proc: copy_to_user() returns unsigned

copy_to_user() returns the number of bytes left to be copied.

This was a typo from: d82ef020cf31 "proc: pagemap: Hold mmap_sem during
page walk".

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index caf0337dff73..a05a669510a4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -800,7 +800,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 		start_vaddr = end;
 
 		len = min(count, PM_ENTRY_BYTES * pm.pos);
-		if (copy_to_user(buf, pm.buffer, len) < 0) {
+		if (copy_to_user(buf, pm.buffer, len)) {
 			ret = -EFAULT;
 			goto out_free;
 		}
-- 
cgit v1.2.2


From f05337c6ac48d19d354e0640a8eb8fc884f82bcc Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Mon, 5 Apr 2010 09:59:14 +0400
Subject: not overwriting file_lock structure after GET_LK

If we have preventing lock, cifs should overwrite file_lock structure
with info about preventing lock. If we haven't preventing lock, cifs
should leave it unchanged except for the lock type (change it to F_UNLCK).

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 15 ++++++++++++++-
 fs/cifs/file.c    | 28 ++++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f213b8ae43c1..184a399749a6 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1810,8 +1810,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 		}
 		parm_data = (struct cifs_posix_lock *)
 			((char *)&pSMBr->hdr.Protocol + data_offset);
-		if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
+		if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK))
 			pLockData->fl_type = F_UNLCK;
+		else {
+			if (parm_data->lock_type ==
+					__constant_cpu_to_le16(CIFS_RDLCK))
+				pLockData->fl_type = F_RDLCK;
+			else if (parm_data->lock_type ==
+					__constant_cpu_to_le16(CIFS_WRLCK))
+				pLockData->fl_type = F_WRLCK;
+
+			pLockData->fl_start = parm_data->start;
+			pLockData->fl_end = parm_data->start +
+						parm_data->length - 1;
+			pLockData->fl_pid = parm_data->pid;
+		}
 	}
 
 plk_err_exit:
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ca2ba7a0193c..d9e86504b9d4 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -838,8 +838,32 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 
 		} else {
 			/* if rc == ERR_SHARING_VIOLATION ? */
-			rc = 0;	/* do not change lock type to unlock
-				   since range in use */
+			rc = 0;
+
+			if (lockType & LOCKING_ANDX_SHARED_LOCK) {
+				pfLock->fl_type = F_WRLCK;
+			} else {
+				rc = CIFSSMBLock(xid, tcon, netfid, length,
+					pfLock->fl_start, 0, 1,
+					lockType | LOCKING_ANDX_SHARED_LOCK,
+					0 /* wait flag */);
+				if (rc == 0) {
+					rc = CIFSSMBLock(xid, tcon, netfid,
+						length, pfLock->fl_start, 1, 0,
+						lockType |
+						LOCKING_ANDX_SHARED_LOCK,
+						0 /* wait flag */);
+					pfLock->fl_type = F_RDLCK;
+					if (rc != 0)
+						cERROR(1, ("Error unlocking "
+						"previously locked range %d "
+						"during test of lock", rc));
+					rc = 0;
+				} else {
+					pfLock->fl_type = F_WRLCK;
+					rc = 0;
+				}
+			}
 		}
 
 		FreeXid(xid);
-- 
cgit v1.2.2


From 55ab3a1ff843e3f0e24d2da44e71bffa5d853010 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 6 Apr 2010 14:34:58 -0700
Subject: raw: fsync method is now required

Commit 148f948ba877f4d3cdef036b1ff6d9f68986706a (vfs: Introduce new
helpers for syncing after writing to O_SYNC file or IS_SYNC inode) broke
the raw driver.

We now call through generic_file_aio_write -> generic_write_sync ->
vfs_fsync_range.  vfs_fsync_range has:

        if (!fop || !fop->fsync) {
                ret = -EINVAL;
                goto out;
        }

But drivers/char/raw.c doesn't set an fsync method.

We have two options: fix it or remove the raw driver completely.  I'm
happy to do either, the fact this has been broken for so long suggests it
is rarely used.

The patch below adds an fsync method to the raw driver.  My knowledge of
the block layer is pretty sketchy so this could do with a once over.

If we instead decide to remove the raw driver, this patch might still be
useful as a backport to 2.6.33 and 2.6.32.

Signed-off-by: Anton Blanchard <anton@samba.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <jens.axboe@oracle.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Tested-by: Jeff Moyer <jmoyer@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index d11d0289f3d2..8db62b2b6df8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -404,7 +404,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
  *	NULL first argument is nfsd_sync_dir() and that's not a directory.
  */
  
-static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
 	int error;
@@ -418,6 +418,7 @@ static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 		error = 0;
 	return error;
 }
+EXPORT_SYMBOL(block_fsync);
 
 /*
  * pseudo-fs
-- 
cgit v1.2.2


From b1dd3b2843b3b73b7fc2ee47d96310cd1c051371 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 6 Apr 2010 14:35:00 -0700
Subject: vfs: rename block_fsync() to blkdev_fsync()

Requested by hch, for consistency now it is exported.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Anton Blanchard <anton@samba.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8db62b2b6df8..2a6d0193f139 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -404,7 +404,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
  *	NULL first argument is nfsd_sync_dir() and that's not a directory.
  */
  
-int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
 	int error;
@@ -418,7 +418,7 @@ int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
 		error = 0;
 	return error;
 }
-EXPORT_SYMBOL(block_fsync);
+EXPORT_SYMBOL(blkdev_fsync);
 
 /*
  * pseudo-fs
@@ -1482,7 +1482,7 @@ const struct file_operations def_blk_fops = {
   	.aio_read	= generic_file_aio_read,
 	.aio_write	= blkdev_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= block_fsync,
+	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
-- 
cgit v1.2.2


From 116354d177ba2da37e91cf884e3d11e67f825efd Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Tue, 6 Apr 2010 14:35:04 -0700
Subject: pagemap: fix pfn calculation for hugepage

When we look into pagemap using page-types with option -p, the value of
pfn for hugepages looks wrong (see below.) This is because pte was
evaluated only once for one vma although it should be updated for each
hugepage.  This patch fixes it.

  $ page-types -p 3277 -Nl -b huge
  voffset   offset  len     flags
  7f21e8a00 11e400  1       ___U___________H_G________________
  7f21e8a01 11e401  1ff     ________________TG________________
               ^^^
  7f21e8c00 11e400  1       ___U___________H_G________________
  7f21e8c01 11e401  1ff     ________________TG________________
               ^^^

One hugepage contains 1 head page and 511 tail pages in x86_64 and each
two lines represent each hugepage.  Voffset and offset mean virtual
address and physical address in the page unit, respectively.  The
different hugepages should not have the same offset value.

With this patch applied:

  $ page-types -p 3386 -Nl -b huge
  voffset   offset   len    flags
  7fec7a600 112c00   1      ___UD__________H_G________________
  7fec7a601 112c01   1ff    ________________TG________________
               ^^^
  7fec7a800 113200   1      ___UD__________H_G________________
  7fec7a801 113201   1ff    ________________TG________________
               ^^^
               OK

More info:

- This patch modifies walk_page_range()'s hugepage walker.  But the
  change only affects pagemap_read(), which is the only caller of hugepage
  callback.

- Without this patch, hugetlb_entry() callback is called per vma, that
  doesn't match the natural expectation from its name.

- With this patch, hugetlb_entry() is called per hugepte entry and the
  callback can become much simpler.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a05a669510a4..070553427dd5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -662,31 +662,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 	return pme;
 }
 
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr,
-				 unsigned long end, struct mm_walk *walk)
+/* This function walks within one hugetlb entry in the single call */
+static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+				 unsigned long addr, unsigned long end,
+				 struct mm_walk *walk)
 {
-	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
-	struct hstate *hs = NULL;
 	int err = 0;
+	u64 pfn;
 
-	vma = find_vma(walk->mm, addr);
-	if (vma)
-		hs = hstate_vma(vma);
 	for (; addr != end; addr += PAGE_SIZE) {
-		u64 pfn = PM_NOT_PRESENT;
-
-		if (vma && (addr >= vma->vm_end)) {
-			vma = find_vma(walk->mm, addr);
-			if (vma)
-				hs = hstate_vma(vma);
-		}
-
-		if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) {
-			/* calculate pfn of the "raw" page in the hugepage. */
-			int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT;
-			pfn = huge_pte_to_pagemap_entry(*pte, offset);
-		}
+		int offset = (addr & ~hmask) >> PAGE_SHIFT;
+		pfn = huge_pte_to_pagemap_entry(*pte, offset);
 		err = add_to_pagemap(addr, pfn, pm);
 		if (err)
 			return err;
-- 
cgit v1.2.2


From cc4fc29e59c386919a7674e203be7822dc968dc0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 6 Apr 2010 14:35:09 -0700
Subject: fs-cache: order the debugfs stats correctly

Order the debugfs statistics correctly.  The values displayed through a
seq_printf() statement should be in the same order as the names in the
format string.

In the 'Lookups' line, objects created ('crt=') and lookups timed out
('tmo=') have their values transposed.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fscache/stats.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 46435f3aae68..4765190d537f 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&fscache_n_object_lookups),
 		   atomic_read(&fscache_n_object_lookups_negative),
 		   atomic_read(&fscache_n_object_lookups_positive),
-		   atomic_read(&fscache_n_object_lookups_timed_out),
-		   atomic_read(&fscache_n_object_created));
+		   atomic_read(&fscache_n_object_created),
+		   atomic_read(&fscache_n_object_lookups_timed_out));
 
 	seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
 		   atomic_read(&fscache_n_updates),
-- 
cgit v1.2.2


From 04287f975e68038051eb9c79896866d36610b8e0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Thu, 8 Apr 2010 00:06:07 +0100
Subject: Have nfs ->d_revalidate() report errors properly

	If nfs atomic open implementation ends up doing open request from
->d_revalidate() codepath and gets an error from server, return that error
to caller explicitly and don't bother with lookup_instantiate_filp() at all.
->d_revalidate() can return an error itself just fine...

See
	http://bugzilla.kernel.org/show_bug.cgi?id=15674
	http://marc.info/?l=linux-kernel&m=126988782722711&w=2

for original report.

Reported-by: Daniel J Blueman <daniel.blueman@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/nfs4proc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d79a7b37e56c..fe0cd9eb1d4d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2068,8 +2068,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 			case -EDQUOT:
 			case -ENOSPC:
 			case -EROFS:
-				lookup_instantiate_filp(nd, (struct dentry *)state, NULL);
-				return 1;
+				return PTR_ERR(state);
 			default:
 				goto out_drop;
 		}
-- 
cgit v1.2.2


From 69ecbbedac8e353bbd924fad16fed0c7c54e6382 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 15 Mar 2010 11:21:13 +0300
Subject: udf: potential integer overflow

bloc->logicalBlockNum is unsigned so it's never less than zero.

When I saw that, it made me worry that "bloc->logicalBlockNum + count"
could overflow.  That's why I changed the check for less than zero
to an overflow check.  (The test works because "count" is also
unsigned.)

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/balloc.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 19626e2491c4..9a9378b4eb5a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -125,9 +125,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 
 	mutex_lock(&sbi->s_alloc_mutex);
 	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
-	if (bloc->logicalBlockNum < 0 ||
-	    (bloc->logicalBlockNum + count) >
-		partmap->s_partition_len) {
+	if (bloc->logicalBlockNum + count < count ||
+	    (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
 			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum,
 			  count, partmap->s_partition_len);
@@ -393,9 +392,8 @@ static void udf_table_free_blocks(struct super_block *sb,
 
 	mutex_lock(&sbi->s_alloc_mutex);
 	partmap = &sbi->s_partmaps[bloc->partitionReferenceNum];
-	if (bloc->logicalBlockNum < 0 ||
-	    (bloc->logicalBlockNum + count) >
-		partmap->s_partition_len) {
+	if (bloc->logicalBlockNum + count < count ||
+	    (bloc->logicalBlockNum + count) > partmap->s_partition_len) {
 		udf_debug("%d < %d || %d + %d > %d\n",
 			  bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count,
 			  partmap->s_partition_len);
-- 
cgit v1.2.2


From c15d0fc0fc399d2639240b35ad7ed93ed5a59412 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 29 Mar 2010 11:05:21 +0400
Subject: udf: add speciffic ->setattr callback

generic setattr not longer responsible for quota transfer.
use udf_setattr for all udf's inodes.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/file.c    | 2 +-
 fs/udf/inode.c   | 2 +-
 fs/udf/namei.c   | 9 ++++++++-
 fs/udf/udfdecl.h | 3 ++-
 4 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1eb06774ed90..4b6a46ccbf46 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -218,7 +218,7 @@ const struct file_operations udf_file_operations = {
 	.llseek			= generic_file_llseek,
 };
 
-static int udf_setattr(struct dentry *dentry, struct iattr *iattr)
+int udf_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
 	int error;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index bb863fe579ac..8a3fbd177cab 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1314,7 +1314,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		break;
 	case ICBTAG_FILE_TYPE_SYMLINK:
 		inode->i_data.a_ops = &udf_symlink_aops;
-		inode->i_op = &page_symlink_inode_operations;
+		inode->i_op = &udf_symlink_inode_operations;
 		inode->i_mode = S_IFLNK | S_IRWXUGO;
 		break;
 	case ICBTAG_FILE_TYPE_MAIN:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index db423ab078b1..75816025f95f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -925,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	iinfo = UDF_I(inode);
 	inode->i_mode = S_IFLNK | S_IRWXUGO;
 	inode->i_data.a_ops = &udf_symlink_aops;
-	inode->i_op = &page_symlink_inode_operations;
+	inode->i_op = &udf_symlink_inode_operations;
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 		struct kernel_lb_addr eloc;
@@ -1393,6 +1393,7 @@ const struct export_operations udf_export_ops = {
 const struct inode_operations udf_dir_inode_operations = {
 	.lookup				= udf_lookup,
 	.create				= udf_create,
+	.setattr			= udf_setattr,
 	.link				= udf_link,
 	.unlink				= udf_unlink,
 	.symlink			= udf_symlink,
@@ -1401,3 +1402,9 @@ const struct inode_operations udf_dir_inode_operations = {
 	.mknod				= udf_mknod,
 	.rename				= udf_rename,
 };
+const struct inode_operations udf_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= udf_setattr,
+};
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 4223ac855da9..702a1148e702 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern const struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
+extern const struct inode_operations udf_symlink_inode_operations;
 extern const struct address_space_operations udf_aops;
 extern const struct address_space_operations udf_adinicb_aops;
 extern const struct address_space_operations udf_symlink_aops;
@@ -131,7 +132,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 /* file.c */
 extern int udf_ioctl(struct inode *, struct file *, unsigned int,
 		     unsigned long);
-
+extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
-- 
cgit v1.2.2


From 2844a76a25a2fc2f5025cf128c95a14d86146d33 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 9 Apr 2010 15:46:42 -0700
Subject: ceph: decode v5 of osdmap (pool names) [protocol change]

Teach the client to decode an updated format for the osdmap.  The new
format includes pool names, which will be useful shortly.  Get this change
in earlier rather than later.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 180 +++++++++++++++++++++++++++++++++----------------------
 fs/ceph/osdmap.h |   1 +
 fs/ceph/rados.h  |   6 +-
 3 files changed, 114 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index d82fe87c2a6e..7526d6d33016 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -312,71 +312,6 @@ bad:
 	return ERR_PTR(err);
 }
 
-
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
-	dout("osdmap_destroy %p\n", map);
-	if (map->crush)
-		crush_destroy(map->crush);
-	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
-		struct ceph_pg_mapping *pg =
-			rb_entry(rb_first(&map->pg_temp),
-				 struct ceph_pg_mapping, node);
-		rb_erase(&pg->node, &map->pg_temp);
-		kfree(pg);
-	}
-	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
-		struct ceph_pg_pool_info *pi =
-			rb_entry(rb_first(&map->pg_pools),
-				 struct ceph_pg_pool_info, node);
-		rb_erase(&pi->node, &map->pg_pools);
-		kfree(pi);
-	}
-	kfree(map->osd_state);
-	kfree(map->osd_weight);
-	kfree(map->osd_addr);
-	kfree(map);
-}
-
-/*
- * adjust max osd value.  reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
-	u8 *state;
-	struct ceph_entity_addr *addr;
-	u32 *weight;
-
-	state = kcalloc(max, sizeof(*state), GFP_NOFS);
-	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
-	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
-	if (state == NULL || addr == NULL || weight == NULL) {
-		kfree(state);
-		kfree(addr);
-		kfree(weight);
-		return -ENOMEM;
-	}
-
-	/* copy old? */
-	if (map->osd_state) {
-		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
-		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
-		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
-		kfree(map->osd_state);
-		kfree(map->osd_addr);
-		kfree(map->osd_weight);
-	}
-
-	map->osd_state = state;
-	map->osd_weight = weight;
-	map->osd_addr = addr;
-	map->max_osd = max;
-	return 0;
-}
-
 /*
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds)
@@ -480,6 +415,13 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 	return NULL;
 }
 
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+	rb_erase(&pi->node, root);
+	kfree(pi->name);
+	kfree(pi);
+}
+
 void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
 {
 	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
@@ -488,6 +430,98 @@ void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
 	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
 }
 
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+	struct ceph_pg_pool_info *pi;
+	u32 num, len, pool;
+
+	ceph_decode_32_safe(p, end, num, bad);
+	dout(" %d pool names\n", num);
+	while (num--) {
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_32_safe(p, end, len, bad);
+		dout("  pool %d len %d\n", pool, len);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi) {
+			kfree(pi->name);
+			pi->name = kmalloc(len + 1, GFP_NOFS);
+			if (pi->name) {
+				memcpy(pi->name, *p, len);
+				pi->name[len] = '\0';
+				dout("  name is %s\n", pi->name);
+			}
+		}
+		*p += len;
+	}
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+	dout("osdmap_destroy %p\n", map);
+	if (map->crush)
+		crush_destroy(map->crush);
+	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(rb_first(&map->pg_temp),
+				 struct ceph_pg_mapping, node);
+		rb_erase(&pg->node, &map->pg_temp);
+		kfree(pg);
+	}
+	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rb_first(&map->pg_pools),
+				 struct ceph_pg_pool_info, node);
+		__remove_pg_pool(&map->pg_pools, pi);
+	}
+	kfree(map->osd_state);
+	kfree(map->osd_weight);
+	kfree(map->osd_addr);
+	kfree(map);
+}
+
+/*
+ * adjust max osd value.  reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+	u8 *state;
+	struct ceph_entity_addr *addr;
+	u32 *weight;
+
+	state = kcalloc(max, sizeof(*state), GFP_NOFS);
+	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+	if (state == NULL || addr == NULL || weight == NULL) {
+		kfree(state);
+		kfree(addr);
+		kfree(weight);
+		return -ENOMEM;
+	}
+
+	/* copy old? */
+	if (map->osd_state) {
+		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+		kfree(map->osd_state);
+		kfree(map->osd_addr);
+		kfree(map->osd_weight);
+	}
+
+	map->osd_state = state;
+	map->osd_weight = weight;
+	map->osd_addr = addr;
+	map->max_osd = max;
+	return 0;
+}
+
 /*
  * decode a full map.
  */
@@ -524,7 +558,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
 		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
-		pi = kmalloc(sizeof(*pi), GFP_NOFS);
+		pi = kzalloc(sizeof(*pi), GFP_NOFS);
 		if (!pi)
 			goto bad;
 		pi->id = ceph_decode_32(p);
@@ -537,6 +571,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		__decode_pool(p, pi);
 		__insert_pg_pool(&map->pg_pools, pi);
 	}
+
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
 
 	ceph_decode_32_safe(p, end, map->flags, bad);
@@ -710,7 +748,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		}
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (!pi) {
-			pi = kmalloc(sizeof(*pi), GFP_NOFS);
+			pi = kzalloc(sizeof(*pi), GFP_NOFS);
 			if (!pi) {
 				err = -ENOMEM;
 				goto bad;
@@ -720,6 +758,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		}
 		__decode_pool(p, pi);
 	}
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
 
 	/* old_pool */
 	ceph_decode_32_safe(p, end, len, bad);
@@ -728,10 +768,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
 		ceph_decode_32_safe(p, end, pool, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
-		if (pi) {
-			rb_erase(&pi->node, &map->pg_pools);
-			kfree(pi);
-		}
+		if (pi)
+			__remove_pg_pool(&map->pg_pools, pi);
 	}
 
 	/* new_up */
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 1fb55afb2642..8bc9f1e4f562 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -23,6 +23,7 @@ struct ceph_pg_pool_info {
 	int id;
 	struct ceph_pg_pool v;
 	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+	char *name;
 };
 
 struct ceph_pg_mapping {
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 26ac8b89a676..a1fc1d017b58 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -11,8 +11,10 @@
 /*
  * osdmap encoding versions
  */
-#define CEPH_OSDMAP_INC_VERSION 4
-#define CEPH_OSDMAP_VERSION     4
+#define CEPH_OSDMAP_INC_VERSION     5
+#define CEPH_OSDMAP_INC_VERSION_EXT 5
+#define CEPH_OSDMAP_VERSION         5
+#define CEPH_OSDMAP_VERSION_EXT     5
 
 /*
  * fs id
-- 
cgit v1.2.2


From 80e60639f1b7c121a7fea53920c5a4b94009361a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 Mar 2010 13:51:05 -0400
Subject: NFSv4: Fall back to ordinary lookup if nfs4_atomic_open() returns
 EISDIR

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c6f2750648f4..be46f26c9a56 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 				res = NULL;
 				goto out;
 			/* This turned out not to be a regular file */
+			case -EISDIR:
 			case -ENOTDIR:
 				goto no_open;
 			case -ELOOP:
 				if (!(nd->intent.open.flags & O_NOFOLLOW))
 					goto no_open;
-			/* case -EISDIR: */
 			/* case -EINVAL: */
 			default:
 				goto out;
-- 
cgit v1.2.2


From 1544fa0f7a46241582abc48f07b74f3d846379e4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 Mar 2010 13:54:49 -0400
Subject: NFS: Fix the mode calculation in nfs_find_open_context

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 737128f777f3..50a56edca0b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -623,10 +623,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
 	list_for_each_entry(pos, &nfsi->open_files, list) {
 		if (cred != NULL && pos->cred != cred)
 			continue;
-		if ((pos->mode & mode) == mode) {
-			ctx = get_nfs_open_context(pos);
-			break;
-		}
+		if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
+			continue;
+		ctx = get_nfs_open_context(pos);
+		break;
 	}
 	spin_unlock(&inode->i_lock);
 	return ctx;
-- 
cgit v1.2.2


From b80c3cb628f0ebc241b02e38dd028969fb8026a2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Apr 2010 19:07:07 -0400
Subject: NFS: Ensure that writeback_single_inode() calls write_inode() when
 syncing

Since writeback_single_inode() checks the inode->i_state flags _before_ it
flushes out the data, we need to ensure that the I_DIRTY_DATASYNC flag is
already set. Otherwise we risk not seeing a call to write_inode(), which
again means that we break fsync() et al...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e23993..7f40ea305543 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -421,6 +421,7 @@ static void
 nfs_mark_request_dirty(struct nfs_page *req)
 {
 	__set_page_dirty_nobuffers(req->wb_page);
+	__mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -660,6 +661,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	req = nfs_setup_write_request(ctx, page, offset, count);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
+	nfs_mark_request_dirty(req);
 	/* Update file length */
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
@@ -739,8 +741,6 @@ int nfs_updatepage(struct file *file, struct page *page,
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
 		nfs_set_pageerror(page);
-	else
-		__set_page_dirty_nobuffers(page);
 
 	dprintk("NFS:       nfs_updatepage returns %d (isize %lld)\n",
 			status, (long long)i_size_read(inode));
-- 
cgit v1.2.2


From a6305ddb080fb483ca41ca56cacb6f96089f0c8e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Apr 2010 19:07:08 -0400
Subject: NFS: Fix a race with the new commit code

This patch fixes a race which occurs due to the fact that we release the
PG_writeback flag while still holding the nfs_page locked.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7f40ea305543..40297c4f1ee0 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
 		struct inode *inode = page->mapping->host;
 		struct nfs_server *nfss = NFS_SERVER(inode);
 
+		page_cache_get(page);
 		if (atomic_long_inc_return(&nfss->writeback) >
 				NFS_CONGESTION_ON_THRESH) {
 			set_bdi_congested(&nfss->backing_dev_info,
@@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
+	page_cache_release(page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 }
@@ -665,6 +667,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
 	/* Update file length */
 	nfs_grow_file(page, offset, count);
 	nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+	nfs_mark_request_dirty(req);
 	nfs_clear_page_tag_locked(req);
 	return 0;
 }
@@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 static void nfs_writepage_release(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
 
-	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
-		nfs_end_page_writeback(req->wb_page);
+	if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
 		nfs_inode_remove_request(req);
-	} else
-		nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 static int flush_task_priority(int how)
@@ -847,9 +849,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
  */
 static void nfs_redirty_request(struct nfs_page *req)
 {
+	struct page *page = req->wb_page;
+
 	nfs_mark_request_dirty(req);
-	nfs_end_page_writeback(req->wb_page);
 	nfs_clear_page_tag_locked(req);
+	nfs_end_page_writeback(page);
 }
 
 /*
@@ -1084,16 +1088,15 @@ static void nfs_writeback_release_full(void *calldata)
 		if (nfs_write_need_commit(data)) {
 			memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req);
-			nfs_end_page_writeback(page);
 			dprintk(" marked for commit\n");
 			goto next;
 		}
 		dprintk(" OK\n");
 remove_request:
-		nfs_end_page_writeback(page);
 		nfs_inode_remove_request(req);
 	next:
 		nfs_clear_page_tag_locked(req);
+		nfs_end_page_writeback(page);
 	}
 	nfs_writedata_release(calldata);
 }
-- 
cgit v1.2.2


From 2c61be0a9478258f77b66208a0c4b1f5f8161c3c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Apr 2010 19:54:50 -0400
Subject: NFS: Ensure that the WRITE and COMMIT RPC calls are always
 uninterruptible

We always want to ensure that WRITE and COMMIT completes, whether or not
the user presses ^C. Do this by making the call asynchronous, and allowing
the user to do an interruptible wait for rpc_task completion.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 40297c4f1ee0..de38d63aa920 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -781,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		int how)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -796,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		.callback_ops = call_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
+	int ret = 0;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
@@ -837,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		(unsigned long long)data->args.offset);
 
 	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
 	rpc_put_task(task);
-	return 0;
+out:
+	return ret;
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1210,7 +1218,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 {
 	struct nfs_page *first = nfs_list_entry(head->next);
 	struct inode *inode = first->wb_context->path.dentry->d_inode;
-	int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -1225,7 +1232,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 		.callback_ops = &nfs_commit_ops,
 		.callback_data = data,
 		.workqueue = nfsiod_workqueue,
-		.flags = flags,
+		.flags = RPC_TASK_ASYNC,
 		.priority = priority,
 	};
 
@@ -1255,6 +1262,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
+	if (how & FLUSH_SYNC)
+		rpc_wait_for_completion_task(task);
 	rpc_put_task(task);
 	return 0;
 }
-- 
cgit v1.2.2


From be3bd2223b89d270853302ab0a5909fa875fd831 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 12 Apr 2010 01:51:03 +0900
Subject: nilfs2: fix typo "numer" -> "number" in alloc.c

Fixes the typo found in a warning message of a persistent object
allocator function.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 3f959f1879d8..c2a13870c605 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -425,7 +425,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
 				    group_offset, bitmap))
-		printk(KERN_WARNING "%s: entry numer %llu already freed\n",
+		printk(KERN_WARNING "%s: entry number %llu already freed\n",
 		       __func__, (unsigned long long)req->pr_entry_nr);
 
 	nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
-- 
cgit v1.2.2


From 0df5dd4aae211edeeeb84f7f84f6d093406d7c22 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 11 Apr 2010 16:48:44 -0400
Subject: NFSv4: fix delegated locking

Arnaud Giersch reports that NFSv4 locking is broken when we hold a
delegation since commit 8e469ebd6dc32cbaf620e134d79f740bf0ebab79 (NFSv4:
Don't allow posix locking against servers that don't support it).

According to Arnaud, the lock succeeds the first time he opens the file
(since we cannot do a delegated open) but then fails after we start using
delegated opens.

The following patch fixes it by ensuring that locking behaviour is
governed by a per-filesystem capability flag that is initially set, but
gets cleared if the server ever returns an OPEN without the
NFS4_OPEN_RESULT_LOCKTYPE_POSIX flag being set.

Reported-by: Arnaud Giersch <arnaud.giersch@iut-bm.univ-fcomte.fr>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/client.c   | 3 ++-
 fs/nfs/nfs4proc.c | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2a3d352c0bff..a8766c4ef2e0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1294,7 +1294,8 @@ static int nfs4_init_server(struct nfs_server *server,
 
 	/* Initialise the client representation from the mount data */
 	server->flags = data->flags;
-	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
+	server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
+		NFS_CAP_POSIX_LOCK;
 	server->options = data->options;
 
 	/* Get a client record */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fe0cd9eb1d4d..638067007c65 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1523,6 +1523,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 		nfs_post_op_update_inode(dir, o_res->dir_attr);
 	} else
 		nfs_refresh_inode(dir, o_res->dir_attr);
+	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
+		server->caps &= ~NFS_CAP_POSIX_LOCK;
 	if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
 		status = _nfs4_proc_open_confirm(data);
 		if (status != 0)
@@ -1664,7 +1666,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
 	status = PTR_ERR(state);
 	if (IS_ERR(state))
 		goto err_opendata_put;
-	if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0)
+	if (server->caps & NFS_CAP_POSIX_LOCK)
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
-- 
cgit v1.2.2


From fc7683a3c30c22131b1651271d6bf9ea113b77c5 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Fri, 26 Mar 2010 19:29:54 +0300
Subject: ext2: symlink must be handled via filesystem specific operation

generic setattr implementation is no longer responsible for
quota transfer so synlinks must be handled via ext2_setattr.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/symlink.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4e2426e22bbe..565cf817bbf1 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
 const struct inode_operations ext2_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext2_follow_link,
+	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
-- 
cgit v1.2.2


From 774f03fb2cf89951b5f5f363b7739a2835d5924e Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Fri, 26 Mar 2010 19:29:55 +0300
Subject: ext3: symlink must be handled via filesystem specific operation

generic setattr implementation is no longer responsible for
quota transfer so synlinks must be handled via ext3_setattr.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/symlink.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index ff7b4ccd8983..7c4898207776 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = {
 const struct inode_operations ext3_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext3_follow_link,
+	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
-- 
cgit v1.2.2


From 4c5e6c0e70fd6ca2fa67184fd36a261b3b7b38d0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 6 Apr 2010 18:52:47 +0200
Subject: quota: Hide warnings about writes to the filesystem before quota was
 turned on

For a root filesystem write to the filesystem before quota is turned on happens
regularly and there's no way around it because of writes to syslog, /etc/mtab,
and similar. So the warning is rather pointless for ordinary users. It's
still useful during development so we just hide the warning behind
__DQUOT_PARANOIA config option.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index e0b870f4749f..f9a37513f24c 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -874,14 +874,18 @@ static int dqinit_needed(struct inode *inode, int type)
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct inode *inode, *old_inode = NULL;
+#ifdef __DQUOT_PARANOIA
 	int reserved = 0;
+#endif
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
+#ifdef __DQUOT_PARANOIA
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
+#endif
 		if (!atomic_read(&inode->i_writecount))
 			continue;
 		if (!dqinit_needed(inode, type))
@@ -903,11 +907,13 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	spin_unlock(&inode_lock);
 	iput(old_inode);
 
+#ifdef __DQUOT_PARANOIA
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
 			" was turned on thus quota information is probably "
 			"inconsistent. Please run quotacheck(8).\n", sb->s_id);
 	}
+#endif
 }
 
 /*
-- 
cgit v1.2.2


From 08261673cb6dc638c39f44d69b76fffb57b92a8b Mon Sep 17 00:00:00 2001
From: Andrew Perepechko <andrew.perepechko@sun.com>
Date: Mon, 12 Apr 2010 22:16:50 +0400
Subject: quota: Fix possible dq_flags corruption

dq_flags are modified non-atomically in do_set_dqblk via __set_bit calls and
atomically for example in mark_dquot_dirty or clear_dquot_dirty.  Hence a
change done by an atomic operation can be overwritten by a change done by a
non-atomic one. Fix the problem by using atomic bitops even in do_set_dqblk.

Signed-off-by: Andrew Perepechko <andrew.perepechko@sun.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index f9a37513f24c..a0a9405b202a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2328,34 +2328,34 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	if (di->dqb_valid & QIF_SPACE) {
 		dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BLIMITS) {
 		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
 		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_INODES) {
 		dm->dqb_curinodes = di->dqb_curinodes;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ILIMITS) {
 		dm->dqb_isoftlimit = di->dqb_isoftlimit;
 		dm->dqb_ihardlimit = di->dqb_ihardlimit;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BTIME) {
 		dm->dqb_btime = di->dqb_btime;
 		check_blim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ITIME) {
 		dm->dqb_itime = di->dqb_itime;
 		check_ilim = 1;
-		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+		set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
 	}
 
 	if (check_blim) {
-- 
cgit v1.2.2


From f5b066287c74b624583b993395a65d03a6487b3a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 12 Apr 2010 14:24:28 -0700
Subject: ceph: fix dentry reference leak in dcache readdir

When filldir returned an error (e.g. buffer full for a large directory),
we would leak a dentry reference, causing an oops on umount.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index aed8fda33024..7505b4f1f597 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -170,11 +170,11 @@ more:
 	spin_lock(&inode->i_lock);
 	spin_lock(&dcache_lock);
 
+	last = dentry;
+
 	if (err < 0)
 		goto out_unlock;
 
-	last = dentry;
-
 	p = p->prev;
 	filp->f_pos++;
 
-- 
cgit v1.2.2


From 032d8f7268444a0f5d4ee02d9513d682d5b8edfc Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Tue, 13 Apr 2010 17:46:37 +0200
Subject: [LogFS] Prevent memory corruption on large deletes

Removing sufficiently large files would create aliases for a large
number of segments.  This in turn results in a large number of journal
entries and an overflow of s_je_array.

Cheap fix is to add a BUG_ON, turning memory corruption into something
annoying, but less dangerous.  Real fix is to count the number of
affected segments and prevent the problem completely.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/gc.c        |  8 ++++++++
 fs/logfs/journal.c   |  3 +++
 fs/logfs/logfs.h     |  8 +++++++-
 fs/logfs/readwrite.c | 14 ++++++++++++++
 fs/logfs/super.c     |  2 ++
 5 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 92949f95a901..e8253e7fb6b2 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -458,6 +458,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target)
 	struct logfs_block *block;
 	int round, progress, last_progress = 0;
 
+	/*
+	 * Doing too many changes to the segfile at once would result
+	 * in a large number of aliases.  Write the journal before
+	 * things get out of hand.
+	 */
+	if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
+		logfs_write_anchor(sb);
+
 	if (no_free_segments(sb) >= target &&
 			super->s_no_object_aliases < MAX_OBJ_ALIASES)
 		return;
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index d57c7b07b60b..2c22a4ad5329 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -493,6 +493,8 @@ static void account_shadows(struct super_block *sb)
 
 	btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
 	btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
+	btree_grim_visitor32(&tree->segment_map, 0, NULL);
+	tree->no_shadowed_segments = 0;
 
 	if (li->li_block) {
 		/*
@@ -660,6 +662,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
 	if (ofs < 0)
 		return ofs;
 	logfs_buf_write(area, ofs, super->s_compressed_je, len);
+	BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
 	super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
 	return 0;
 }
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 97195b9e93a5..c9929eed80b1 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -257,10 +257,14 @@ struct logfs_shadow {
  * struct shadow_tree
  * @new:			shadows where old_ofs==0, indexed by new_ofs
  * @old:			shadows where old_ofs!=0, indexed by old_ofs
+ * @segment_map:		bitfield of segments containing shadows
+ * @no_shadowed_segment:	number of segments containing shadows
  */
 struct shadow_tree {
 	struct btree_head64 new;
 	struct btree_head64 old;
+	struct btree_head32 segment_map;
+	int no_shadowed_segments;
 };
 
 struct object_alias_item {
@@ -311,6 +315,8 @@ struct logfs_block_ops {
 			write_alias_t *write_one_alias);
 };
 
+#define MAX_JOURNAL_ENTRIES 256
+
 struct logfs_super {
 	struct mtd_info *s_mtd;			/* underlying device */
 	struct block_device *s_bdev;		/* underlying device */
@@ -377,7 +383,7 @@ struct logfs_super {
 	u32	 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
 	u64	 s_last_version;
 	struct logfs_area *s_journal_area;	/* open journal segment */
-	__be64	s_je_array[64];
+	__be64	s_je_array[MAX_JOURNAL_ENTRIES];
 	int	s_no_je;
 
 	int	 s_sum_index;			/* for the 12 summaries */
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 3659c37fbd72..7e0c39c49719 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1219,6 +1219,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
 	mempool_free(shadow, super->s_shadow_pool);
 }
 
+static void mark_segment(struct shadow_tree *tree, u32 segno)
+{
+	int err;
+
+	if (!btree_lookup32(&tree->segment_map, segno)) {
+		err = btree_insert32(&tree->segment_map, segno, (void *)1,
+				GFP_NOFS);
+		BUG_ON(err);
+		tree->no_shadowed_segments++;
+	}
+}
+
 /**
  * fill_shadow_tree - Propagate shadow tree changes due to a write
  * @inode:	Inode owning the page
@@ -1266,6 +1278,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page,
 
 		super->s_dirty_used_bytes += shadow->new_len;
 		super->s_dirty_free_bytes += shadow->old_len;
+		mark_segment(tree, shadow->old_ofs >> super->s_segshift);
+		mark_segment(tree, shadow->new_ofs >> super->s_segshift);
 	}
 }
 
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 9d856c49afc5..d6e1f4fc3115 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -451,6 +451,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only)
 
 	btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
 	btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
+	btree_init_mempool32(&super->s_shadow_tree.segment_map,
+			super->s_btree_pool);
 
 	ret = logfs_init_mapping(sb);
 	if (ret)
-- 
cgit v1.2.2


From d3a03f8031000f8297823b80e36db536fd020884 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Tue, 13 Apr 2010 17:54:27 +0200
Subject: [LogFS] Plug 8 byte information leak

Within each journal segment, 8 bytes at offset 24 would remain
uninitialized.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/journal.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 2c22a4ad5329..2957bfc21927 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -388,7 +388,10 @@ static void journal_get_erase_count(struct logfs_area *area)
 static int journal_erase_segment(struct logfs_area *area)
 {
 	struct super_block *sb = area->a_sb;
-	struct logfs_segment_header sh;
+	union {
+		struct logfs_segment_header sh;
+		unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
+	} u;
 	u64 ofs;
 	int err;
 
@@ -396,20 +399,21 @@ static int journal_erase_segment(struct logfs_area *area)
 	if (err)
 		return err;
 
-	sh.pad = 0;
-	sh.type = SEG_JOURNAL;
-	sh.level = 0;
-	sh.segno = cpu_to_be32(area->a_segno);
-	sh.ec = cpu_to_be32(area->a_erase_count);
-	sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
-	sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
+	memset(&u, 0, sizeof(u));
+	u.sh.pad = 0;
+	u.sh.type = SEG_JOURNAL;
+	u.sh.level = 0;
+	u.sh.segno = cpu_to_be32(area->a_segno);
+	u.sh.ec = cpu_to_be32(area->a_erase_count);
+	u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
+	u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
 
 	/* This causes a bug in segment.c.  Not yet. */
 	//logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
 
 	ofs = dev_ofs(sb, area->a_segno, 0);
-	area->a_used_bytes = ALIGN(sizeof(sh), 16);
-	logfs_buf_write(area, ofs, &sh, sizeof(sh));
+	area->a_used_bytes = sizeof(u);
+	logfs_buf_write(area, ofs, &u, sizeof(u));
 	return 0;
 }
 
-- 
cgit v1.2.2


From ead88af5f577fd2b399a0fcdfe52605116fac489 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Tue, 13 Apr 2010 17:57:21 +0200
Subject: [LogFS] Move assertion

The assertion is valid independently of the condition.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 2957bfc21927..fd44eeea5135 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -612,9 +612,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
 	if (len == 0)
 		return logfs_write_header(super, header, 0, type);
 
+	BUG_ON(len > sb->s_blocksize);
 	compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
 	if (compr_len < 0 || type == JE_ANCHOR) {
-		BUG_ON(len > sb->s_blocksize);
 		memcpy(data, buf, len);
 		compr_len = len;
 		compr = COMPR_NONE;
-- 
cgit v1.2.2


From fc837c8f0446b73a1661339db406c0238dd1d184 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 13 Apr 2010 11:41:22 -0700
Subject: ceph: queue_cap_snap should always queue dirty context

This simplifies the calling convention, and fixes a bug where we queue a
capsnap with a context other than i_head_snapc (the one that matches the
dirty pages).  The result was a BUG at fs/ceph/caps.c:2178 on writeback
completion when a capsnap matching the writeback snapc could not be found.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c  | 16 +++++++---------
 fs/ceph/super.h |  3 +--
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 7e3e5f9edaa4..d197431532c0 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -430,8 +430,7 @@ static int dup_array(u64 **dst, __le64 *src, int num)
  * Caller must hold snap_rwsem for read (i.e., the realm topology won't
  * change).
  */
-void ceph_queue_cap_snap(struct ceph_inode_info *ci,
-			 struct ceph_snap_context *snapc)
+void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap_snap *capsnap;
@@ -450,10 +449,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
 		   as no new writes are allowed to start when pending, so any
 		   writes in progress now were started before the previous
 		   cap_snap.  lucky us. */
-		dout("queue_cap_snap %p snapc %p seq %llu used %d"
-		     " already pending\n", inode, snapc, snapc->seq, used);
+		dout("queue_cap_snap %p already pending\n", inode);
 		kfree(capsnap);
 	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+		struct ceph_snap_context *snapc = ci->i_head_snapc;
+
 		igrab(inode);
 
 		atomic_set(&capsnap->nref, 1);
@@ -462,7 +462,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
 		INIT_LIST_HEAD(&capsnap->flushing_item);
 
 		capsnap->follows = snapc->seq - 1;
-		capsnap->context = ceph_get_snap_context(snapc);
 		capsnap->issued = __ceph_caps_issued(ci, NULL);
 		capsnap->dirty = __ceph_caps_dirty(ci);
 
@@ -479,7 +478,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
 		   snapshot. */
 		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
 		ci->i_wrbuffer_ref_head = 0;
-		ceph_put_snap_context(ci->i_head_snapc);
+		capsnap->context = snapc;
 		ci->i_head_snapc = NULL;
 		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
 
@@ -603,7 +602,7 @@ more:
 				if (lastinode)
 					iput(lastinode);
 				lastinode = inode;
-				ceph_queue_cap_snap(ci, realm->cached_context);
+				ceph_queue_cap_snap(ci);
 				spin_lock(&realm->inodes_with_caps_lock);
 			}
 			spin_unlock(&realm->inodes_with_caps_lock);
@@ -825,8 +824,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			spin_unlock(&realm->inodes_with_caps_lock);
 			spin_unlock(&inode->i_lock);
 
-			ceph_queue_cap_snap(ci,
-					    ci->i_snap_realm->cached_context);
+			ceph_queue_cap_snap(ci);
 
 			iput(inode);
 			continue;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 65d12036b670..4c07acaf21e3 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -714,8 +714,7 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m,
 extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			     struct ceph_mds_session *session,
 			     struct ceph_msg *msg);
-extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
-				struct ceph_snap_context *snapc);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
 extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 				  struct ceph_cap_snap *capsnap);
 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
-- 
cgit v1.2.2


From e1e4dd0caa63e166afa46a1ccc947bebb4f66bcf Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 13 Apr 2010 11:45:56 -0700
Subject: ceph: reserve one more caps space when doing readdir

We were missing space for the directory cap.  The result was a BUG at
fs/ceph/caps.c:2178.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 7505b4f1f597..159f588ca160 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -311,7 +311,7 @@ more:
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
-		req->r_num_caps = max_entries;
+		req->r_num_caps = max_entries + 1;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
 		if (err < 0) {
 			ceph_mdsc_put_request(req);
-- 
cgit v1.2.2


From a6a5349d17f2a5c37079826f1a1474c3d08c6b53 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 13 Apr 2010 14:07:07 -0700
Subject: ceph: use separate class for ceph sockets' sk_lock

Use a separate class for ceph sockets to prevent lockdep confusion.
Because ceph sockets only get passed kernel pointers, there is no
dependency from sk_lock -> mmap_sem.  If we share the same class as other
sockets, lockdep detects a circular dependency from

	mmap_sem (page fault) -> fs mutex -> sk_lock -> mmap_sem

because dependencies are noted from both ceph and user contexts.  Using
a separate class prevents the sk_lock(ceph) -> mmap_sem dependency and
makes lockdep happy.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index f35b4945a9c3..5c75d5d32b27 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -29,6 +29,10 @@ static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
 
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
 
 static void queue_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
@@ -227,6 +231,10 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 	con->sock = sock;
 	sock->sk->sk_allocation = GFP_NOFS;
 
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
 	set_sock_callbacks(sock, con);
 
 	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
-- 
cgit v1.2.2


From 1f1b0008e8dd1930d6e89522c70f4a438374302a Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Thu, 15 Apr 2010 08:03:57 +0200
Subject: [LogFS] Prevent mempool_destroy NULL pointer dereference

It would probably be better to just accept NULL pointers in
mempool_destroy().  But for the current -rc series let's keep things
simple.

This patch was lost in the cracks for a while.
Kevin Cernekee <cernekee@gmail.com> had to rediscover the problem and
send a similar patch because of it. :(

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/logfs.h     | 6 ++++++
 fs/logfs/readwrite.c | 6 ++----
 fs/logfs/segment.c   | 2 +-
 fs/logfs/super.c     | 4 ++--
 4 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index c9929eed80b1..0a3df1a0c936 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -727,4 +727,10 @@ static inline struct logfs_area *get_area(struct super_block *sb,
 	return logfs_super(sb)->s_area[(__force u8)gc_level];
 }
 
+static inline void logfs_mempool_destroy(mempool_t *pool)
+{
+	if (pool)
+		mempool_destroy(pool);
+}
+
 #endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7e0c39c49719..aca6c56a107a 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2243,8 +2243,6 @@ void logfs_cleanup_rw(struct super_block *sb)
 	struct logfs_super *super = logfs_super(sb);
 
 	destroy_meta_inode(super->s_segfile_inode);
-	if (super->s_block_pool)
-		mempool_destroy(super->s_block_pool);
-	if (super->s_shadow_pool)
-		mempool_destroy(super->s_shadow_pool);
+	logfs_mempool_destroy(super->s_block_pool);
+	logfs_mempool_destroy(super->s_shadow_pool);
 }
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 02db22ebbf13..8c82fe05d3e1 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -912,7 +912,7 @@ err:
 	for (i--; i >= 0; i--)
 		free_area(super->s_area[i]);
 	free_area(super->s_journal_area);
-	mempool_destroy(super->s_alias_pool);
+	logfs_mempool_destroy(super->s_alias_pool);
 	return -ENOMEM;
 }
 
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index d6e1f4fc3115..d4531eb46d0a 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -517,8 +517,8 @@ static void logfs_kill_sb(struct super_block *sb)
 	if (super->s_erase_page)
 		__free_page(super->s_erase_page);
 	super->s_devops->put_device(sb);
-	mempool_destroy(super->s_btree_pool);
-	mempool_destroy(super->s_alias_pool);
+	logfs_mempool_destroy(super->s_btree_pool);
+	logfs_mempool_destroy(super->s_alias_pool);
 	kfree(super);
 	log_super("LogFS: Finished unmounting\n");
 }
-- 
cgit v1.2.2


From 2b0b39517d1af5294128dbc2fd7ed39c8effa540 Mon Sep 17 00:00:00 2001
From: Bill Pemberton <wfp5p@virginia.edu>
Date: Fri, 16 Apr 2010 08:01:20 -0500
Subject: jfs: fix diAllocExt error in resizing filesystem

Resizing the filesystem would result in an diAllocExt error in some
instances because changes in bmp->db_agsize would not get noticed if
goto extendBmap was called.

Signed-off-by: Bill Pemberton <wfp5p@virginia.edu>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: jfs-discussion@lists.sourceforge.net
Cc: linux-kernel@vger.kernel.org
---
 fs/jfs/resize.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 	struct inode *iplist[1];
 	struct jfs_superblock *j_sb, *j_sb2;
 	uint old_agsize;
+	int agsizechanged = 0;
 	struct buffer_head *bh, *bh2;
 
 	/* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 	 */
 	if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
 		goto error_out;
+
+	agsizechanged |= (bmp->db_agsize != old_agsize);
+
 	/*
 	 * the map now has extended to cover additional nblocks:
 	 * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
 	 * will correctly identify the new ag);
 	 */
 	/* if new AG size the same as old AG size, done! */
-	if (bmp->db_agsize != old_agsize) {
+	if (agsizechanged) {
 		if ((rc = diExtendFS(ipimap, ipbmap)))
 			goto error_out;
 
-- 
cgit v1.2.2


From c7f2e1f0ac142a657a1de00d404e1c8345b20598 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Fri, 16 Apr 2010 08:05:50 -0500
Subject: jfs: add jfs specific ->setattr call

generic setattr not longer responsible for quota transfer.
use jfs_setattr for all jfs's inodes.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/inode.c     |  2 +-
 fs/jfs/jfs_inode.h |  1 +
 fs/jfs/namei.c     |  4 ++--
 fs/jfs/symlink.c   | 14 +++++++++++++-
 4 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b2ae190a77ba..97cd11954d08 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -60,7 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
 			inode->i_op = &page_symlink_inode_operations;
 			inode->i_mapping->a_ops = &jfs_aops;
 		} else {
-			inode->i_op = &jfs_symlink_inode_operations;
+			inode->i_op = &jfs_fast_symlink_inode_operations;
 			/*
 			 * The inline data should be null-terminated, but
 			 * don't let on-disk corruption crash the kernel
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1eff7db34d63..f8b56b238bb8 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -47,5 +47,6 @@ extern const struct file_operations jfs_dir_operations;
 extern const struct inode_operations jfs_file_inode_operations;
 extern const struct file_operations jfs_file_operations;
 extern const struct inode_operations jfs_symlink_inode_operations;
+extern const struct inode_operations jfs_fast_symlink_inode_operations;
 extern const struct dentry_operations jfs_ci_dentry_operations;
 #endif				/* _H_JFS_INODE */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index c79a4270f083..114e60071108 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -946,7 +946,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	 */
 
 	if (ssize <= IDATASIZE) {
-		ip->i_op = &jfs_symlink_inode_operations;
+		ip->i_op = &jfs_fast_symlink_inode_operations;
 
 		i_fastsymlink = JFS_IP(ip)->i_inline;
 		memcpy(i_fastsymlink, name, ssize);
@@ -968,7 +968,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	else {
 		jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
 
-		ip->i_op = &page_symlink_inode_operations;
+		ip->i_op = &jfs_symlink_inode_operations;
 		ip->i_mapping->a_ops = &jfs_aops;
 
 		/*
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	return NULL;
 }
 
-const struct inode_operations jfs_symlink_inode_operations = {
+const struct inode_operations jfs_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= jfs_follow_link,
+	.setattr	= jfs_setattr,
+	.setxattr	= jfs_setxattr,
+	.getxattr	= jfs_getxattr,
+	.listxattr	= jfs_listxattr,
+	.removexattr	= jfs_removexattr,
+};
+
+const struct inode_operations jfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+	.setattr	= jfs_setattr,
 	.setxattr	= jfs_setxattr,
 	.getxattr	= jfs_getxattr,
 	.listxattr	= jfs_listxattr,
-- 
cgit v1.2.2


From b6f8dd49dbdbfa60a33bba3d4b766fe341109b4b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 13 Apr 2010 15:06:44 +1000
Subject: xfs: ensure that sync updates the log tail correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates to the VFS layer removed an extra ->sync_fs call into the
filesystem during the sync process (from the quota code).
Unfortunately the sync code was unknowingly relying on this call to
make sure metadata buffers were flushed via a xfs_buftarg_flush()
call to move the tail of the log forward in memory before the final
transactions of the sync process were issued.

As a result, the old code would write a very recent log tail value
to the log by the end of the sync process, and so a subsequent crash
would leave nothing for log recovery to do. Hence in qa test 182,
log recovery only replayed a small handle for inode fsync
transactions in this case.

However, with the removal of the extra ->sync_fs call, the log tail
was now not moved forward with the inode fsync transactions near the
end of the sync procese the first (and only) buftarg flush occurred
after these transactions went to disk. The result is that log
recovery now sees a large number of transactions for metadata that
is already on disk.

This usually isn't a problem, but when the transactions include
inode chunk allocation, the inode create transactions and all
subsequent changes are replayed as we cannt rely on what is on disk
is valid. As a result, if the inode was written and contains
unlogged changes, the unlogged changes are lost, thereby violating
sync semantics.

The fix is to always issue a transaction after the buftarg flush
occurs is the log iѕ not idle or covered. This results in a dummy
transaction being written that contains the up-to-date log tail
value, which will be very recent. Indeed, it will be at least as
recent as the old code would have left on disk, so log recovery
will behave exactly as it used to in this situation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_log.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e8fba92d7cd9..2be019136287 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -745,9 +745,16 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 
 /*
  * Determine if we have a transaction that has gone to disk
- * that needs to be covered. Log activity needs to be idle (no AIL and
- * nothing in the iclogs). And, we need to be in the right state indicating
- * something has gone out.
+ * that needs to be covered. To begin the transition to the idle state
+ * firstly the log needs to be idle (no AIL and nothing in the iclogs).
+ * If we are then in a state where covering is needed, the caller is informed
+ * that dummy transactions are required to move the log into the idle state.
+ *
+ * Because this is called as part of the sync process, we should also indicate
+ * that dummy transactions should be issued in anything but the covered or
+ * idle states. This ensures that the log tail is accurately reflected in
+ * the log at the end of the sync, hence if a crash occurrs avoids replay
+ * of transactions where the metadata is already on disk.
  */
 int
 xfs_log_need_covered(xfs_mount_t *mp)
@@ -759,17 +766,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
 		return 0;
 
 	spin_lock(&log->l_icloglock);
-	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
-		(log->l_covered_state == XLOG_STATE_COVER_NEED2))
-			&& !xfs_trans_ail_tail(log->l_ailp)
-			&& xlog_iclogs_empty(log)) {
-		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
-			log->l_covered_state = XLOG_STATE_COVER_DONE;
-		else {
-			ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
-			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+	switch (log->l_covered_state) {
+	case XLOG_STATE_COVER_DONE:
+	case XLOG_STATE_COVER_DONE2:
+	case XLOG_STATE_COVER_IDLE:
+		break;
+	case XLOG_STATE_COVER_NEED:
+	case XLOG_STATE_COVER_NEED2:
+		if (!xfs_trans_ail_tail(log->l_ailp) &&
+		    xlog_iclogs_empty(log)) {
+			if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+				log->l_covered_state = XLOG_STATE_COVER_DONE;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_DONE2;
 		}
+		/* FALLTHRU */
+	default:
 		needed = 1;
+		break;
 	}
 	spin_unlock(&log->l_icloglock);
 	return needed;
-- 
cgit v1.2.2


From f1d486a3617a2f620b31224e4ace1496c4627e39 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 13 Apr 2010 15:06:45 +1000
Subject: xfs: don't warn on EAGAIN in inode reclaim

Any inode reclaim flush that returns EAGAIN will result in the inode
reclaim being attempted again later. There is no need to issue a
warning into the logs about this situation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 05cd85317f6f..fd9698215759 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -820,10 +820,10 @@ xfs_reclaim_inode(
 	 * call into reclaim to find it in a clean state instead of waiting for
 	 * it now. We also don't return errors here - if the error is transient
 	 * then the next reclaim pass will flush the inode, and if the error
-	 * is permanent then the next sync reclaim will relcaim the inode and
+	 * is permanent then the next sync reclaim will reclaim the inode and
 	 * pass on the error.
 	 */
-	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 			"inode 0x%llx background reclaim flush failed with %d",
 			(long long)ip->i_ino, error);
-- 
cgit v1.2.2


From b8639077abf034824046ed09e779b74c4393031f Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Sat, 17 Apr 2010 19:54:27 +0200
Subject: [LogFS] Set s_bdi

Since 32a88aa1 sync() was turned into a NOP for logfs.  Worse, sync()
would not return an error, giving the illusion that writeout had
actually happened.

Afaics jffs2 was broken as well.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/super.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index d4531eb46d0a..dacce3a917ae 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -11,6 +11,7 @@
  */
 #include "logfs.h"
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <linux/mtd/mtd.h>
 #include <linux/statfs.h>
 #include <linux/buffer_head.h>
@@ -136,6 +137,10 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
 	sb->s_fs_info = super;
 	sb->s_mtd = super->s_mtd;
 	sb->s_bdev = super->s_bdev;
+	if (sb->s_bdev)
+		sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+	if (sb->s_mtd)
+		sb->s_bdi = sb->s_mtd->backing_dev_info;
 	return 0;
 }
 
-- 
cgit v1.2.2


From 3a60a1686f0d51c99bd0df8ac93050fb6dfce647 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Mon, 22 Mar 2010 00:41:35 -0500
Subject: eCryptfs: Decrypt symlink target for stat size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create a getattr handler for eCryptfs symlinks that is capable of
reading the lower target and decrypting its path.  Prior to this patch,
a stat's st_size field would represent the strlen of the encrypted path,
while readlink() would return the strlen of the decrypted path.  This
could lead to confusion in some userspace applications, since the two
values should be equal.

https://bugs.launchpad.net/bugs/524919

Reported-by: Loïc Minier <loic.minier@canonical.com>
Cc: stable@kernel.org
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 100 +++++++++++++++++++++++++++-------------------------
 1 file changed, 52 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ddbd096c7406..605f514f3b47 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -648,38 +648,17 @@ out_lock:
 	return rc;
 }
 
-static int
-ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
+				   size_t *bufsiz)
 {
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	char *lower_buf;
-	size_t lower_bufsiz;
-	struct dentry *lower_dentry;
-	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
-	char *plaintext_name;
-	size_t plaintext_name_size;
+	size_t lower_bufsiz = PATH_MAX;
 	mm_segment_t old_fs;
 	int rc;
 
-	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	if (!lower_dentry->d_inode->i_op->readlink) {
-		rc = -EINVAL;
-		goto out;
-	}
-	mount_crypt_stat = &ecryptfs_superblock_to_private(
-						dentry->d_sb)->mount_crypt_stat;
-	/*
-	 * If the lower filename is encrypted, it will result in a significantly
-	 * longer name.  If needed, truncate the name after decode and decrypt.
-	 */
-	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
-		lower_bufsiz = PATH_MAX;
-	else
-		lower_bufsiz = bufsiz;
-	/* Released in this function */
 	lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
-	if (lower_buf == NULL) {
-		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
+	if (!lower_buf) {
 		rc = -ENOMEM;
 		goto out;
 	}
@@ -689,29 +668,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 						   (char __user *)lower_buf,
 						   lower_bufsiz);
 	set_fs(old_fs);
-	if (rc >= 0) {
-		rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
-							  &plaintext_name_size,
-							  dentry, lower_buf,
-							  rc);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to decode and "
-			       "decrypt filename; rc = [%d]\n", __func__,
-				rc);
-			goto out_free_lower_buf;
-		}
-		/* Check for bufsiz <= 0 done in sys_readlinkat() */
-		rc = copy_to_user(buf, plaintext_name,
-				  min((size_t) bufsiz, plaintext_name_size));
-		if (rc)
-			rc = -EFAULT;
-		else
-			rc = plaintext_name_size;
-		kfree(plaintext_name);
-		fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
-	}
-out_free_lower_buf:
+	if (rc < 0)
+		goto out;
+	lower_bufsiz = rc;
+	rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
+						  lower_buf, lower_bufsiz);
+out:
 	kfree(lower_buf);
+	return rc;
+}
+
+static int
+ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+	char *kbuf;
+	size_t kbufsiz, copied;
+	int rc;
+
+	rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
+	if (rc)
+		goto out;
+	copied = min_t(size_t, bufsiz, kbufsiz);
+	rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
+	kfree(kbuf);
+	fsstack_copy_attr_atime(dentry->d_inode,
+				ecryptfs_dentry_to_lower(dentry)->d_inode);
 out:
 	return rc;
 }
@@ -1016,6 +997,28 @@ out:
 	return rc;
 }
 
+int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
+			  struct kstat *stat)
+{
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	int rc = 0;
+
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+						dentry->d_sb)->mount_crypt_stat;
+	generic_fillattr(dentry->d_inode, stat);
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+		char *target;
+		size_t targetsiz;
+
+		rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
+		if (!rc) {
+			kfree(target);
+			stat->size = targetsiz;
+		}
+	}
+	return rc;
+}
+
 int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		     struct kstat *stat)
 {
@@ -1133,6 +1136,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
 	.put_link = ecryptfs_put_link,
 	.permission = ecryptfs_permission,
 	.setattr = ecryptfs_setattr,
+	.getattr = ecryptfs_getattr_link,
 	.setxattr = ecryptfs_setxattr,
 	.getxattr = ecryptfs_getxattr,
 	.listxattr = ecryptfs_listxattr,
-- 
cgit v1.2.2


From cfce08c6bdfb20ade979284e55001ca1f100ed51 Mon Sep 17 00:00:00 2001
From: Christian Pulvermacher <pulvermacher@gmx.de>
Date: Tue, 23 Mar 2010 11:51:38 -0500
Subject: ecryptfs: fix error code for missing xattrs in lower fs

If the lower file system driver has extended attributes disabled,
ecryptfs' own access functions return -ENOSYS instead of -EOPNOTSUPP.
This breaks execution of programs in the ecryptfs mount, since the
kernel expects the latter error when checking for security
capabilities in xattrs.

Signed-off-by: Christian Pulvermacher <pulvermacher@gmx.de>
Cc: stable@kernel.org
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 605f514f3b47..efd4f47d530c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1043,7 +1043,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->setxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1061,7 +1061,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
 	int rc = 0;
 
 	if (!lower_dentry->d_inode->i_op->getxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1088,7 +1088,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->listxattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1105,7 +1105,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->removexattr) {
-		rc = -ENOSYS;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
-- 
cgit v1.2.2


From 133b8f9d632cc23715c6d72d1c5ac449e054a12a Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@jeffreymahoney.com>
Date: Fri, 19 Mar 2010 15:35:46 -0400
Subject: ecryptfs: fix use with tmpfs by removing d_drop from
 ecryptfs_destroy_inode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since tmpfs has no persistent storage, it pins all its dentries in memory
so they have d_count=1 when other file systems would have d_count=0.
->lookup is only used to create new dentries. If the caller doesn't
instantiate it, it's freed immediately at dput(). ->readdir reads
directly from the dcache and depends on the dentries being hashed.

When an ecryptfs mount is mounted, it associates the lower file and dentry
with the ecryptfs files as they're accessed. When it's umounted and
destroys all the in-memory ecryptfs inodes, it fput's the lower_files and
d_drop's the lower_dentries. Commit 4981e081 added this and a d_delete in
2008 and several months later commit caeeeecf removed the d_delete. I
believe the d_drop() needs to be removed as well.

The d_drop effectively hides any file that has been accessed via ecryptfs
from the underlying tmpfs since it depends on it being hashed for it to
be accessible. I've removed the d_drop on my development node and see no
ill effects with basic testing on both tmpfs and persistent storage.

As a side effect, after ecryptfs d_drops the dentries on tmpfs, tmpfs
BUGs on umount. This is due to the dentries being unhashed.
tmpfs->kill_sb is kill_litter_super which calls d_genocide to drop
the reference pinning the dentry. It skips unhashed and negative dentries,
but shrink_dcache_for_umount_subtree doesn't. Since those dentries
still have an elevated d_count, we get a BUG().

This patch removes the d_drop call and fixes both issues.

This issue was reported at:
https://bugzilla.novell.com/show_bug.cgi?id=567887

Reported-by:  Árpád Bíró <biroa@demasz.hu>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Cc: Dustin Kirkland <kirkland@canonical.com>
Cc: stable@kernel.org
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/super.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index b15a43a80ab7..1a037f77aa52 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -85,7 +85,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 		if (lower_dentry->d_inode) {
 			fput(inode_info->lower_file);
 			inode_info->lower_file = NULL;
-			d_drop(lower_dentry);
 		}
 	}
 	ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
-- 
cgit v1.2.2


From 3a8380c0754a7972668a46f645930910e304095c Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 23 Mar 2010 18:09:02 -0500
Subject: eCryptfs: Copy lower directory inode times and size on link

The timestamps and size of a lower inode involved in a link() call was
being copied to the upper parent inode.  Instead, we should be
copying lower parent inode's timestamps and size to the upper parent
inode.  I discovered this bug using the POSIX test suite at Tuxera.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index efd4f47d530c..15b424837c18 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -456,8 +456,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
 	if (rc)
 		goto out_lock;
-	fsstack_copy_attr_times(dir, lower_new_dentry->d_inode);
-	fsstack_copy_inode_size(dir, lower_new_dentry->d_inode);
+	fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
+	fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
 	old_dentry->d_inode->i_nlink =
 		ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
 	i_size_write(new_dentry->d_inode, file_size_save);
-- 
cgit v1.2.2


From 9f37622f897a90ad3c3da5c14d94d8f3ffc62b70 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 25 Mar 2010 11:16:56 -0500
Subject: eCryptfs: Turn lower lookup error messages into debug messages

Vaugue warnings about ENAMETOOLONG errors when looking up an encrypted
file name have caused many users to become concerned about their data.
Since this is a rather harmless condition, I'm moving this warning to
only be printed when the ecryptfs_verbosity module param is 1.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 15b424837c18..b980fce984c6 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -388,9 +388,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
-		       "lower_dentry = [%s]\n", __func__, rc,
-		       ecryptfs_dentry->d_name.name);
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+				"[%d] on lower_dentry = [%s]\n", __func__, rc,
+				encrypted_and_encoded_name);
 		goto out_d_drop;
 	}
 	if (lower_dentry->d_inode)
@@ -417,9 +417,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
-		       "lower_dentry = [%s]\n", __func__, rc,
-		       encrypted_and_encoded_name);
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+				"[%d] on lower_dentry = [%s]\n", __func__, rc,
+				encrypted_and_encoded_name);
 		goto out_d_drop;
 	}
 lookup_and_interpose:
-- 
cgit v1.2.2


From 62af9b520513d78484f22f874916dfacbc889ce0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 19 Apr 2010 16:47:20 +0200
Subject: quota: Convert __DQUOT_PARANOIA symbol to standard config option

Make __DQUOT_PARANOIA define from the old days a standard config option
and turn it off by default.

This gets rid of a quota warning about writes before quota is turned on
for systems with ext4 root filesystem. Currently there's no way to legally
solve this because /etc/mtab has to be written before quota is turned on
on most systems.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/Kconfig |  8 ++++++++
 fs/quota/dquot.c | 16 +++++++---------
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index dad7fb247ddc..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+config QUOTA_DEBUG
+	bool "Additional quota sanity checks"
+	depends on QUOTA
+	default n
+	help
+	  If you say Y here, quota subsystem will perform some additional
+	  sanity checks of quota internal structures. If unsure, say N.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	 tristate
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a0a9405b202a..788b5802a7ce 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,8 +80,6 @@
 
 #include <asm/uaccess.h>
 
-#define __DQUOT_PARANOIA
-
 /*
  * There are three quota SMP locks. dq_list_lock protects all lists with quotas
  * and quota formats, dqstats structure containing statistics about the lists
@@ -695,7 +693,7 @@ void dqput(struct dquot *dquot)
 
 	if (!dquot)
 		return;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
 		printk("VFS: dqput: trying to free free dquot\n");
 		printk("VFS: device %s, dquot of %s %d\n",
@@ -748,7 +746,7 @@ we_slept:
 		goto we_slept;
 	}
 	atomic_dec(&dquot->dq_count);
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	/* sanity check */
 	BUG_ON(!list_empty(&dquot->dq_free));
 #endif
@@ -845,7 +843,7 @@ we_slept:
 		dquot = NULL;
 		goto out;
 	}
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
 #endif
 out:
@@ -874,7 +872,7 @@ static int dqinit_needed(struct inode *inode, int type)
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct inode *inode, *old_inode = NULL;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	int reserved = 0;
 #endif
 
@@ -882,7 +880,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
 #endif
@@ -907,7 +905,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	spin_unlock(&inode_lock);
 	iput(old_inode);
 
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
 			" was turned on thus quota information is probably "
@@ -940,7 +938,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 	inode->i_dquot[type] = NULL;
 	if (dquot) {
 		if (dqput_blocks(dquot)) {
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 			if (atomic_read(&dquot->dq_count) != 1)
 				printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
 #endif
-- 
cgit v1.2.2


From b6349ac89eacb813f6963f7263da05bc3f483351 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Tue, 20 Apr 2010 21:44:10 +0200
Subject: [LogFS] Split large truncated into smaller chunks

Truncate would do an almost limitless amount of work without invoking
the garbage collector in between.  Split it up into more manageable,
though still large, chunks.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/readwrite.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index aca6c56a107a..7e3a1e5fd76d 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1837,19 +1837,37 @@ static int __logfs_truncate(struct inode *inode, u64 size)
 	return logfs_truncate_direct(inode, size);
 }
 
-int logfs_truncate(struct inode *inode, u64 size)
+/*
+ * Truncate, by changing the segment file, can consume a fair amount
+ * of resources.  So back off from time to time and do some GC.
+ * 8 or 2048 blocks should be well within safety limits even if
+ * every single block resided in a different segment.
+ */
+#define TRUNCATE_STEP	(8 * 1024 * 1024)
+int logfs_truncate(struct inode *inode, u64 target)
 {
 	struct super_block *sb = inode->i_sb;
-	int err;
+	u64 size = i_size_read(inode);
+	int err = 0;
 
-	logfs_get_wblocks(sb, NULL, 1);
-	err = __logfs_truncate(inode, size);
-	if (!err)
-		err = __logfs_write_inode(inode, 0);
-	logfs_put_wblocks(sb, NULL, 1);
+	size = ALIGN(size, TRUNCATE_STEP);
+	while (size > target) {
+		if (size > TRUNCATE_STEP)
+			size -= TRUNCATE_STEP;
+		else
+			size = 0;
+		if (size < target)
+			size = target;
+
+		logfs_get_wblocks(sb, NULL, 1);
+		err = __logfs_truncate(inode, target);
+		if (!err)
+			err = __logfs_write_inode(inode, 0);
+		logfs_put_wblocks(sb, NULL, 1);
+	}
 
 	if (!err)
-		err = vmtruncate(inode, size);
+		err = vmtruncate(inode, target);
 
 	/* I don't trust error recovery yet. */
 	WARN_ON(err);
-- 
cgit v1.2.2


From d7dfee3f5db5575b1d838744559c3c9bb351f74f Mon Sep 17 00:00:00 2001
From: Jun Sun <jsun@junsun.net>
Date: Thu, 31 Dec 2009 17:28:52 -0800
Subject: uclinux: error message when FLAT reloc symbol is invalid, v2

This patch fixes a cosmetic error in printk. Text segment and data/bss
segment are allocated from two different areas. It is not meaningful to
give the diff between them in the error reporting messages.

Signed-off-by: Jun Sun <jsun@junsun.net>
Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 fs/binfmt_flat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e0e769bdca59..49566c1687d8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 
 	if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
 		printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
-		       (int) r,(int)(start_brk-start_code),(int)text_len);
+		       (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
 		goto failed;
 	}
 
-- 
cgit v1.2.2


From 083fd8b21a13742b37ab347089c73f895a896672 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 21 Apr 2010 12:01:23 +0100
Subject: AFS: Don't pass error value to page_cache_release() in error handling

In the error handling in afs_mntpt_do_automount(), we pass an error
pointer to page_cache_release() if read_mapping_page() failed.  Instead,
we should extend the gotos around the error handling we don't need.

Reported-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/mntpt.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5e813a816ce4..b3feddc4f7d6 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -138,9 +138,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
 	struct afs_super_info *super;
 	struct vfsmount *mnt;
-	struct page *page = NULL;
+	struct page *page;
 	size_t size;
-	char *buf, *devname = NULL, *options = NULL;
+	char *buf, *devname, *options;
 	int ret;
 
 	_enter("{%s}", mntpt->d_name.name);
@@ -150,22 +150,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	ret = -EINVAL;
 	size = mntpt->d_inode->i_size;
 	if (size > PAGE_SIZE - 1)
-		goto error;
+		goto error_no_devname;
 
 	ret = -ENOMEM;
 	devname = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!devname)
-		goto error;
+		goto error_no_devname;
 
 	options = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!options)
-		goto error;
+		goto error_no_options;
 
 	/* read the contents of the AFS special symlink */
 	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
-		goto error;
+		goto error_no_page;
 	}
 
 	ret = -EIO;
@@ -196,12 +196,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	return mnt;
 
 error:
-	if (page)
-		page_cache_release(page);
-	if (devname)
-		free_page((unsigned long) devname);
-	if (options)
-		free_page((unsigned long) options);
+	page_cache_release(page);
+error_no_page:
+	free_page((unsigned long) options);
+error_no_options:
+	free_page((unsigned long) devname);
+error_no_devname:
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
-- 
cgit v1.2.2